[PATCH v4 6/7] sched/deadline: Deferrable dl server

Posted by Daniel Bristot de Oliveira 2 years, 5 months ago
Among the motivations for the DL servers is the real-time throttling
mechanism. This mechanism works by throttling the rt_rq after it
runs for a long period without leaving space for fair tasks.

The base dl server avoids this problem by boosting fair tasks instead
of throttling the rt_rq. The point is that it boosts without waiting
for potential starvation, causing some non-intuitive cases.

For example, an IRQ dispatches two tasks on an idle system, a fair
one and an RT one. The DL server will be activated, running the fair
task before the RT one. This problem can be avoided by deferring the
dl server activation.

By passing the deferring option, the dl_server will dispatch a
SCHED_DEADLINE reservation throttled, with the replenishment
timer set to fire (period - runtime) ns from the start time. Thus,
the fair rq is boosted at its 0-laxity time with respect to
the rt_rq.
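
For example (with a hypothetical 50ms runtime over the fair server's
1s period), a server started deferred at time t arms its replenishment
timer for t + 950ms. If fair tasks are still starved when it fires,
they run for up to 50ms before t + 1s, i.e., at the last point where
the runtime still fits within the period.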

The fair server will be scheduled under EDF, with a new
period set at the replenishment time, thus not breaking dl tasks.

Signed-off-by: Daniel Bristot de Oliveira <bristot@kernel.org>
---
 include/linux/sched.h   |  7 +++++
 kernel/sched/deadline.c | 61 ++++++++++++++++++++++++++++++++++++++---
 kernel/sched/fair.c     | 10 ++++---
 kernel/sched/rt.c       |  6 ++++
 kernel/sched/sched.h    | 12 +++++++-
 5 files changed, 87 insertions(+), 9 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 40fbf3f034e0..38d0b3de03b2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -609,6 +609,12 @@ struct sched_rt_entity {
 typedef bool (*dl_server_has_tasks_f)(struct sched_dl_entity *);
 typedef struct task_struct *(*dl_server_pick_f)(struct sched_dl_entity *);
 
+enum dl_server_state {
+	DL_SERVER_STOPPED = 0,
+	DL_SERVER_DEFER,
+	DL_SERVER_RUNNING
+};
+
 struct sched_dl_entity {
 	struct rb_node			rb_node;
 
@@ -685,6 +691,7 @@ struct sched_dl_entity {
 	struct rq			*rq;
 	dl_server_has_tasks_f		server_has_tasks;
 	dl_server_pick_f		server_pick;
+	enum dl_server_state			server_state;
 
 #ifdef CONFIG_RT_MUTEXES
 	/*
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 7844cfb73029..7f1c52bfe78f 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -422,7 +422,7 @@ static void task_non_contending(struct sched_dl_entity *dl_se)
 	if (dl_entity_is_special(dl_se))
 		return;
 
-	WARN_ON(dl_se->dl_non_contending);
+	WARN_ON_ONCE(dl_se->dl_non_contending);
 
 	zerolag_time = dl_se->deadline -
 		 div64_long((dl_se->runtime * dl_se->dl_period),
@@ -1155,6 +1155,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
 		struct rq_flags rf;
 
 		rq_lock(rq, &rf);
+
 		if (dl_se->dl_throttled) {
 			sched_clock_tick();
 			update_rq_clock(rq);
@@ -1165,9 +1166,12 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
 				__push_dl_task(rq, &rf);
 			} else {
 				replenish_dl_entity(dl_se);
+				task_non_contending(dl_se);
 			}
 
 		}
+
+		dl_se->server_state = DL_SERVER_RUNNING;
 		rq_unlock(rq, &rf);
 
 		return HRTIMER_NORESTART;
@@ -1441,18 +1445,65 @@ void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec)
 	update_curr_dl_se(dl_se->rq, dl_se, delta_exec);
 }
 
-void dl_server_start(struct sched_dl_entity *dl_se)
+void dl_server_start(struct sched_dl_entity *dl_se, int defer)
 {
+	if (dl_se->server_state != DL_SERVER_STOPPED) {
+		WARN_ON_ONCE(!(on_dl_rq(dl_se) || dl_se->dl_throttled));
+		return;
+	}
+
+	if (defer) {
+		/*
+		 * Postpone the replenishment to the (next period - the execution time)
+		 *
+		 * With this in place, we have two cases:
+		 *
+		 * In the absence of DL tasks:
+		 *	The server will start at the replenishment time, getting
+		 *	its runtime before now + period. This is the expected
+		 *	throttling behavior.
+		 *
+		 * In the presence of DL tasks:
+		 *	The server will be replenished, and then it will be
+		 *	scheduled according to EDF, not breaking SCHED_DEADLINE.
+		 *
+		 *	In the first cycle, the server will be postponed by at
+		 *	most (period + period - runtime). But then the server
+		 *	will receive its runtime/period.
+		 *
+		 *	The server will, however, run on top of any RT task, which
+		 *	is the expected throttling behavior.
+		 */
+		dl_se->deadline = rq_clock(dl_se->rq) + dl_se->dl_period - dl_se->dl_runtime;
+		/* Zero the runtime */
+		dl_se->runtime = 0;
+		/* throttle the server */
+		dl_se->dl_throttled = 1;
+
+		dl_se->server_state = DL_SERVER_DEFER;
+		start_dl_timer(dl_se);
+		return;
+	}
+
 	if (!dl_server(dl_se)) {
 		dl_se->dl_server = 1;
 		setup_new_dl_entity(dl_se);
 	}
+
+	dl_se->server_state = DL_SERVER_RUNNING;
 	enqueue_dl_entity(dl_se, ENQUEUE_WAKEUP);
 }
 
 void dl_server_stop(struct sched_dl_entity *dl_se)
 {
+	if (dl_se->server_state == DL_SERVER_STOPPED)
+		return;
+
+	hrtimer_try_to_cancel(&dl_se->dl_timer);
 	dequeue_dl_entity(dl_se, DEQUEUE_SLEEP);
+
+	dl_se->dl_throttled = 0;
+	dl_se->server_state = DL_SERVER_STOPPED;
 }
 
 void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
@@ -1462,6 +1513,8 @@ void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
 	dl_se->rq = rq;
 	dl_se->server_has_tasks = has_tasks;
 	dl_se->server_pick = pick;
+	dl_se->server_state = DL_SERVER_STOPPED;
+	dl_se->dl_server = 1;
 }
 
 /*
@@ -1817,8 +1870,9 @@ static void dequeue_dl_entity(struct sched_dl_entity *dl_se, int flags)
 	 * (the task moves from "active contending" to "active non contending"
 	 * or "inactive")
 	 */
-	if (flags & DEQUEUE_SLEEP)
+	if (flags & DEQUEUE_SLEEP && !dl_server(dl_se))
 		task_non_contending(dl_se);
+
 }
 
 static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
@@ -1875,7 +1929,6 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
 		enqueue_pushable_dl_task(rq, p);
 }
 
-
 static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
 {
 	update_curr_dl(rq);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 580e6764a68b..b9d0f08dc8ca 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6499,9 +6499,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	 */
 	util_est_enqueue(&rq->cfs, p);
 
-	if (!rq->cfs.h_nr_running)
-		dl_server_start(&rq->fair_server);
-
 	/*
 	 * If in_iowait is set, the code below may not trigger any cpufreq
 	 * utilization updates, so do it here explicitly with the IOWAIT flag
@@ -6568,6 +6565,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		update_overutilized_status(rq);
 
 enqueue_throttle:
+	if (sched_fair_server_needed(rq))
+		dl_server_start(&rq->fair_server, rq->fair_server_defer);
+
 	assert_list_leaf_cfs_rq(rq);
 
 	hrtick_update(rq);
@@ -6646,7 +6646,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		rq->next_balance = jiffies;
 
 dequeue_throttle:
-	if (!rq->cfs.h_nr_running)
+	if (!sched_fair_server_needed(rq))
 		dl_server_stop(&rq->fair_server);
 
 	util_est_update(&rq->cfs, p, task_sleep);
@@ -8317,6 +8317,8 @@ void fair_server_init(struct rq *rq)
 	dl_se->dl_deadline = 1000 * NSEC_PER_MSEC;
 	dl_se->dl_period = 1000 * NSEC_PER_MSEC;
 
+	rq->fair_server_defer = 1;
+
 	dl_server_init(dl_se, rq, fair_server_has_tasks, fair_server_pick);
 }
 
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index e23cc67c9467..7595110a5a3e 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1537,6 +1537,9 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
 
 	if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
 		enqueue_pushable_task(rq, p);
+
+	if (sched_fair_server_needed(rq))
+		dl_server_start(&rq->fair_server, rq->fair_server_defer);
 }
 
 static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
@@ -1547,6 +1550,9 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
 	dequeue_rt_entity(rt_se, flags);
 
 	dequeue_pushable_task(rq, p);
+
+	if (!sched_fair_server_needed(rq))
+		dl_server_stop(&rq->fair_server);
 }
 
 /*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ac94c386741c..510c4db379be 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -345,7 +345,7 @@ extern int  dl_bw_check_overflow(int cpu);
  *   dl_server_init() -- initializes the server.
  */
 extern void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec);
-extern void dl_server_start(struct sched_dl_entity *dl_se);
+extern void dl_server_start(struct sched_dl_entity *dl_se, int defer);
 extern void dl_server_stop(struct sched_dl_entity *dl_se);
 extern void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
 		    dl_server_has_tasks_f has_tasks,
@@ -1027,6 +1027,7 @@ struct rq {
 	struct dl_rq		dl;
 
 	struct sched_dl_entity	fair_server;
+	int			fair_server_defer;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	/* list of leaf cfs_rq on this CPU: */
@@ -2394,6 +2395,15 @@ static inline bool sched_fair_runnable(struct rq *rq)
 	return rq->cfs.nr_running > 0;
 }
 
+static inline bool sched_fair_server_needed(struct rq *rq)
+{
+	/*
+	 * The fair server is needed anytime a fair task can starve
+	 * because of real-time tasks.
+	 */
+	return (sched_rt_runnable(rq) && sched_fair_runnable(rq));
+}
+
 extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
 extern struct task_struct *pick_next_task_idle(struct rq *rq);
 
-- 
2.40.1
Re: [PATCH v4 6/7] sched/deadline: Deferrable dl server
Posted by Peter Zijlstra 2 years, 5 months ago
On Thu, Aug 31, 2023 at 10:28:57PM +0200, Daniel Bristot de Oliveira wrote:
> +void dl_server_start(struct sched_dl_entity *dl_se, int defer)
>  {
> +	if (dl_se->server_state != DL_SERVER_STOPPED) {
> +		WARN_ON_ONCE(!(on_dl_rq(dl_se) || dl_se->dl_throttled));
> +		return;
> +	}
> +
> +	if (defer) {
> +		/*
> +		 * Postpone the replenishment to the (next period - the execution time)
> +		 *
> +		 * With this in place, we have two cases:
> +		 *
> +		 * In the absence of DL tasks:
> +		 *	The server will start at the replenishment time, getting
> +		 *	its runtime before now + period. This is the expected
> +		 *	throttling behavior.
> +		 *
> +		 * In the presence of DL tasks:
> +		 *	The server will be replenished, and then it will be
> +		 *	scheduled according to EDF, not breaking SCHED_DEADLINE.
> +		 *
> +		 *	In the first cycle, the server will be postponed by at
> +		 *	most (period + period - runtime). But then the server
> +		 *	will receive its runtime/period.
> +		 *
> +		 *	The server will, however, run on top of any RT task, which
> +		 *	is the expected throttling behavior.
> +		 */
> +		dl_se->deadline = rq_clock(dl_se->rq) + dl_se->dl_period - dl_se->dl_runtime;

I was confused by this, but this is an instance of
replenish_dl_new_period(), where we explicitly do not preserve the
period.

> +		/* Zero the runtime */
> +		dl_se->runtime = 0;
> +		/* throttle the server */
> +		dl_se->dl_throttled = 1;

These comments are both obvious and placed so as to make everything
unreadable :/ 

> +
> +		dl_se->server_state = DL_SERVER_DEFER;
> +		start_dl_timer(dl_se);
> +		return;
> +	}
> +
>  	if (!dl_server(dl_se)) {
>  		dl_se->dl_server = 1;
>  		setup_new_dl_entity(dl_se);
>  	}
> +
> +	dl_se->server_state = DL_SERVER_RUNNING;
>  	enqueue_dl_entity(dl_se, ENQUEUE_WAKEUP);
>  }


> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 580e6764a68b..b9d0f08dc8ca 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -6499,9 +6499,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>  	 */
>  	util_est_enqueue(&rq->cfs, p);
>  
> -	if (!rq->cfs.h_nr_running)
> -		dl_server_start(&rq->fair_server);
> -
>  	/*
>  	 * If in_iowait is set, the code below may not trigger any cpufreq
>  	 * utilization updates, so do it here explicitly with the IOWAIT flag
> diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
> index e23cc67c9467..7595110a5a3e 100644
> --- a/kernel/sched/rt.c
> +++ b/kernel/sched/rt.c
> @@ -1537,6 +1537,9 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
>  
>  	if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
>  		enqueue_pushable_task(rq, p);
> +
> +	if (sched_fair_server_needed(rq))
> +		dl_server_start(&rq->fair_server, rq->fair_server_defer);
>  }
>  
>  static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
> @@ -1547,6 +1550,9 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
>  	dequeue_rt_entity(rt_se, flags);
>  
>  	dequeue_pushable_task(rq, p);
> +
> +	if (!sched_fair_server_needed(rq))
> +		dl_server_stop(&rq->fair_server);
>  }
>  
>  /*

I'm thinking this is wrong -- ISTR there were definite benefits to explicit
slack time scheduling, meaning the server should also run while DL tasks
are on. Additionally, running the server when there are only fair tasks
is mostly harmless, right? So why this change?

One of the benefits was -- IIRC, that we no longer need to subtract some
random margin from the reservation limit, but there were more I think.
Re: [PATCH v4 6/7] sched/deadline: Deferrable dl server
Posted by Daniel Bristot de Oliveira 2 years, 5 months ago
On 9/5/23 15:42, Peter Zijlstra wrote:
> On Thu, Aug 31, 2023 at 10:28:57PM +0200, Daniel Bristot de Oliveira wrote:
>> +void dl_server_start(struct sched_dl_entity *dl_se, int defer)
>>  {
>> +	if (dl_se->server_state != DL_SERVER_STOPPED) {
>> +		WARN_ON_ONCE(!(on_dl_rq(dl_se) || dl_se->dl_throttled));
>> +		return;
>> +	}
>> +
>> +	if (defer) {
>> +		/*
>> +		 * Postpone the replenishment to the (next period - the execution time)
>> +		 *
>> +		 * With this in place, we have two cases:
>> +		 *
>> +		 * On the absence of DL tasks:
>> +		 *	The server will start at the replenishment time, getting
>> +		 *	its runtime before now + period. This is the expected
>> +		 *	throttling behavior.
>> +		 *
>> +		 * In the presense of DL tasks:
>> +		 *	The server will be replenished, and then it will be
>> +		 *	schedule according to EDF, not breaking SCHED_DEADLINE.
>> +		 *
>> +		 *	In the first cycle the server will be postponed at most
>> +		 *	at period + period - runtime at most. But then the
>> +		 *	server will receive its runtime/period.
>> +		 *
>> +		 *	The server will, however, run on top of any RT task, which
>> +		 *	is the expected throttling behavior.
>> +		 */
>> +		dl_se->deadline = rq_clock(dl_se->rq) + dl_se->dl_period - dl_se->dl_runtime;
> 
> I was confused by this, but this is an instance of
> replenish_dl_new_period(), where we explicitly do not preserve the
> period.

yeah, it is two operations in one (so not straightforward). It sets the deadline to now + (period - runtime), so..


>> +		/* Zero the runtime */
>> +		dl_se->runtime = 0;
>> +		/* throttle the server */
>> +		dl_se->dl_throttled = 1;

as the server is throttled, it will set the replenishment timer for now + (period - runtime), because
of the deadline.
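
A minimal sketch of the resulting arithmetic (not from the patch;
assuming an implicit deadline, i.e., dl_deadline == dl_period, as the
fair server uses):

--- %< ---
/*
 * dl_server_start(dl_se, defer):
 *	deadline     = now + dl_period - dl_runtime;	(0-laxity point)
 *	runtime      = 0;
 *	dl_throttled = 1;
 *
 * start_dl_timer() then arms the replenishment timer at:
 *	dl_next_period() = deadline - dl_deadline + dl_period
 *			 = deadline
 *			 = now + (dl_period - dl_runtime)
 */
--- >% ---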

> These comments are both obvious and placed so as to make everything
> unreadable :/ 

I can't disagree :-) I will remove.

>> +
>> +		dl_se->server_state = DL_SERVER_DEFER;
>> +		start_dl_timer(dl_se);
>> +		return;
>> +	}
>> +
>>  	if (!dl_server(dl_se)) {
>>  		dl_se->dl_server = 1;
>>  		setup_new_dl_entity(dl_se);
>>  	}
>> +
>> +	dl_se->server_state = DL_SERVER_RUNNING;
>>  	enqueue_dl_entity(dl_se, ENQUEUE_WAKEUP);
>>  }
> 
> 
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 580e6764a68b..b9d0f08dc8ca 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -6499,9 +6499,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>>  	 */
>>  	util_est_enqueue(&rq->cfs, p);
>>  
>> -	if (!rq->cfs.h_nr_running)
>> -		dl_server_start(&rq->fair_server);
>> -
>>  	/*
>>  	 * If in_iowait is set, the code below may not trigger any cpufreq
>>  	 * utilization updates, so do it here explicitly with the IOWAIT flag
>> diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
>> index e23cc67c9467..7595110a5a3e 100644
>> --- a/kernel/sched/rt.c
>> +++ b/kernel/sched/rt.c
>> @@ -1537,6 +1537,9 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
>>  
>>  	if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
>>  		enqueue_pushable_task(rq, p);
>> +
>> +	if (sched_fair_server_needed(rq))
>> +		dl_server_start(&rq->fair_server, rq->fair_server_defer);
>>  }
>>  
>>  static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
>> @@ -1547,6 +1550,9 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
>>  	dequeue_rt_entity(rt_se, flags);
>>  
>>  	dequeue_pushable_task(rq, p);
>> +
>> +	if (!sched_fair_server_needed(rq))
>> +		dl_server_stop(&rq->fair_server);
>>  }
>>  
>>  /*
> 
> I'm thinking this is wrong -- ISTR there were definite benefits to explicit
> slack time scheduling, meaning the server should also run while DL tasks
> are on.

I can add the check to enable it also in the presence of DL tasks, we just need
to consider that the dl server is a dl task as well when disabling.

It is a benefit because it will fix the case in which a CPU is full of DL tasks
(possible with global scheduling).

> Additionally, running the server when there are only fair tasks
> is mostly harmless, right? So why this change?

The problem is that we never know when an RT one will arrive :-/

E.g., only fair tasks, server enabled. All fine :-) Then an RT arrives, and gets
postponed by the server... RT people complaining (for those thinking on adding
a server for RT, we will have a similar problem as we have with throttling today +
DL has no fixed priority).

We will still face the same problem with the deferrable server, and the example is the same,
just with a shift in the RT arrival. E.g., only fair tasks for (server period - runtime),
then the server gets enabled, and 1us after that the RT task arrives and gets postponed... again.

Hence the need to enable it only after the dispatch of a task of a class that
can cause starvation (RT, or DL to be added).

We could simplify it by enabling it only on RT/DL arrival but... if there is nothing to
starve... it is not needed anyway, so we save the overhead by avoiding the enqueue...

> 
> One of the benefits was -- IIRC, that we no longer need to subtract some
> random margin from the reservation limit, but there were more I think.
> 

We can simplify things because we can start ignoring the rt throttling limit when
there is no need to throttle... like on GRUB (but only after we remove the rt
throttling).

Thoughts?
-- Daniel
Re: [PATCH v4 6/7] sched/deadline: Deferrable dl server
Posted by Peter Zijlstra 2 years, 5 months ago
On Tue, Sep 05, 2023 at 05:24:40PM +0200, Daniel Bristot de Oliveira wrote:
> On 9/5/23 15:42, Peter Zijlstra wrote:
> > On Thu, Aug 31, 2023 at 10:28:57PM +0200, Daniel Bristot de Oliveira wrote:
> >> +void dl_server_start(struct sched_dl_entity *dl_se, int defer)
> >>  {
> >> +	if (dl_se->server_state != DL_SERVER_STOPPED) {
> >> +		WARN_ON_ONCE(!(on_dl_rq(dl_se) || dl_se->dl_throttled));
> >> +		return;
> >> +	}
> >> +
> >> +	if (defer) {
> >> +		/*
> >> +		 * Postpone the replenishment to the (next period - the execution time)

perhaps explicitly mention zero-laxity here, then we all know what is
meant, no?

> >> +		 *
> >> +		 * With this in place, we have two cases:
> >> +		 *
> >> +		 * In the absence of DL tasks:
> >> +		 *	The server will start at the replenishment time, getting
> >> +		 *	its runtime before now + period. This is the expected
> >> +		 *	throttling behavior.
> >> +		 *
> >> +		 * In the presence of DL tasks:
> >> +		 *	The server will be replenished, and then it will be
> >> +		 *	scheduled according to EDF, not breaking SCHED_DEADLINE.
> >> +		 *
> >> +		 *	In the first cycle, the server will be postponed by at
> >> +		 *	most (period + period - runtime). But then the server
> >> +		 *	will receive its runtime/period.
> >> +		 *
> >> +		 *	The server will, however, run on top of any RT task, which
> >> +		 *	is the expected throttling behavior.
> >> +		 */
> >> +		dl_se->deadline = rq_clock(dl_se->rq) + dl_se->dl_period - dl_se->dl_runtime;
> > 
> > I was confused by this, but this is an instance of
> > replenish_dl_new_period(), where we explicitly do not preserve the
> > period.
> 
> yeah, it is two operations in one (so not straightforward). It sets
> the deadline to now + (period - runtime), so..

Right. I'll just attribute it to me being generally confused about
everything after holidays ;-)

The thing that tripped me was that the use of rq_clock() breaks
periodicity, except of course you want/need that when you start a new
activation.

> >> +		/* Zero the runtime */
> >> +		dl_se->runtime = 0;
> >> +		/* throttle the server */
> >> +		dl_se->dl_throttled = 1;
> 
> as the server is throttled, it will set the replenishment timer for now + (period - runtime), because
> of the deadline.

Yeah, it's a wee hack to move it to the zero-laxity point. I was
considering if it makes sense to push that down and make it available
for all DL tasks, but I'm not sure..

> > I'm thinking this is wrong -- ISTR there were definite benefits to explicit
> > slack time scheduling, meaning the server should also run while DL tasks
> > are on.
> 
> I can add the check to enable it also in the presence of DL tasks, we just need
> to consider that the dl server is a dl task as well when disabling.

Why not keep what was there, always run the thing when there's FAIR
tasks around? Or do you see severe overhead from running it with only
FAIR tasks on?

> It is a benefit because it will fix the case in which a CPU is full of DL tasks
> (possible with global scheduling).
> 
> > Additionally, running the server when there are only fair tasks
> > is mostly harmless, right? So why this change?
> 
> The problem is that we never know when an RT one will arrive :-/
> 
> E.g., only fair tasks, server enabled. All fine :-) Then an RT arrives, and gets
> postponed by the server... RT people complaining (for those thinking on adding
> a server for RT, we will have a similar problem as we have with throttling today +
> DL has no fixed priority).

Well, the thing is, the moment you run DL tasks on that machine those RT
tasks will get delayed *anyway*. RT people need to stop pretending FIFO
is the highest class.

But yeah, I can see why they might get upset if the time is then used to
run FAIR tasks while their RT task gets to wait.

> We will still face the same problem with the deferrable server, and the example is the same,
> just with a shift in the RT arrival. E.g., only fair tasks for (server period - runtime),
> then the server gets enabled, and 1us after that the RT task arrives and gets postponed... again.
> 
> Hence the need to enable it only after the dispatch of a task of a class that
> can cause starvation (RT, or DL to be added).
> 
> We could simplify it by enabling it only on RT/DL arrival but... if there is nothing to
> starve... it is not needed anyway, so we save the overhead by avoiding the enqueue...

So one thing we could do is have update_curr_fair() decrement
fair_server's runtime and yield the period when it hits 0 (and capping
it at 0, not allowing it to go negative or so).

That way you only force the situation when FAIR hasn't had its allotted
time this period, and only for as much as to make up for the time it
lacks.
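
A minimal sketch of that idea (hypothetical helper and hook point, not
from the posted patches):

--- %< ---
/*
 * Called from update_curr_fair() for every fair task, whether or not
 * it was selected through the server: charge the execution time
 * against the server's budget, capping at 0 so that future periods
 * are unaffected.
 */
static void fair_server_charge(struct sched_dl_entity *dl_se, s64 delta_exec)
{
	if (dl_se->runtime <= 0)
		return;		/* lower bound already met this period */

	dl_se->runtime -= delta_exec;
	if (dl_se->runtime <= 0) {
		dl_se->runtime = 0;
		/* yield the remainder of the period here */
	}
}
--- >% ---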

> > 
> > One of the benefits was -- IIRC, that we no longer need to subtract some
> > random margin from the reservation limit, but there were more I think.
> > 
> 
> We can simplify things because we can start ignoring the rt throttling limit when
> there is no need to throttle... like on GRUB (but only after we remove the rt
> throttling).
> 
> Thoughts?

The moment the fair server is there (these patches) you should also kill
the throttling. It doesn't make sense to have both.
Re: [PATCH v4 6/7] sched/deadline: Deferrable dl server
Posted by Daniel Bristot de Oliveira 2 years, 5 months ago
On 9/6/23 10:29, Peter Zijlstra wrote:
> On Tue, Sep 05, 2023 at 05:24:40PM +0200, Daniel Bristot de Oliveira wrote:
>> On 9/5/23 15:42, Peter Zijlstra wrote:
>>> On Thu, Aug 31, 2023 at 10:28:57PM +0200, Daniel Bristot de Oliveira wrote:
>>>> +void dl_server_start(struct sched_dl_entity *dl_se, int defer)
>>>>  {
>>>> +	if (dl_se->server_state != DL_SERVER_STOPPED) {
>>>> +		WARN_ON_ONCE(!(on_dl_rq(dl_se) || dl_se->dl_throttled));
>>>> +		return;
>>>> +	}
>>>> +
>>>> +	if (defer) {
>>>> +		/*
>>>> +		 * Postpone the replenishment to the (next period - the execution time)
> 
> perhaps explicitly mention zero-laxity here, then we all know what is
> meant, no?

Last time I used that word it caused more problems than helped :-) But I will
add it and specify that it is "for this task".

>>>> +		 *
>>>> +		 * With this in place, we have two cases:
>>>> +		 *
>>>> +		 * In the absence of DL tasks:
>>>> +		 *	The server will start at the replenishment time, getting
>>>> +		 *	its runtime before now + period. This is the expected
>>>> +		 *	throttling behavior.
>>>> +		 *
>>>> +		 * In the presence of DL tasks:
>>>> +		 *	The server will be replenished, and then it will be
>>>> +		 *	scheduled according to EDF, not breaking SCHED_DEADLINE.
>>>> +		 *
>>>> +		 *	In the first cycle, the server will be postponed by at
>>>> +		 *	most (period + period - runtime). But then the server
>>>> +		 *	will receive its runtime/period.
>>>> +		 *
>>>> +		 *	The server will, however, run on top of any RT task, which
>>>> +		 *	is the expected throttling behavior.
>>>> +		 */
>>>> +		dl_se->deadline = rq_clock(dl_se->rq) + dl_se->dl_period - dl_se->dl_runtime;
>>>
>>> I was confused by this, but this is an instance of
>>> replenish_dl_new_period(), where we explicitly do not preserve the
>>> period.
>>
>> yeah, it is two operations in one (so not straightforward). It sets
>> the deadline to now + (period - runtime), so..
> 
> Right. I'll just attribute it to me being generally confused about
> everything after holidays ;-)
> 
> The thing that tripped me was that the use of rq_clock() breaks
> periodicity, except of course you want/need that when you start a new
> activation.

that rq_clock() is only used at the time in which we start the deferred
server. If the throttling condition stays on, the server will act as a
regular periodic DL task...

> 
>>>> +		/* Zero the runtime */
>>>> +		dl_se->runtime = 0;
>>>> +		/* throttle the server */
>>>> +		dl_se->dl_throttled = 1;
>>
>> as the server is throttled, it will set the replenishment timer for now + (period - runtime), because
>> of the deadline.
> 
> Yeah, it's a wee hack to move it to the zero-laxity point. I was
> considering if it makes sense to push that down and make it available
> for all DL tasks, but I'm not sure..

It might be useful in the future, like when DL dominates all other schedulers, so
we can have a way to schedule deferred work, like kworkers... :-) But it might be
too early for that..

>>> I'm thinking this is wrong -- ISTR there were definite benefits to explicit
>>> slack time scheduling, meaning the server should also run while DL tasks
>>> are on.
>>
>> I can add the check to enable it also in the presence of DL tasks, we just need
>> to consider that the dl server is a dl task as well when disabling.
> 
> Why not keep what was there, always run the thing when there's FAIR
> tasks around? Or do you see severe overhead from running it with only
> FAIR tasks on?

Assuming that in most of the cases people only have fair tasks, which is probably
true in a regular kernel... (like, no threaded IRQs).

>> It is a benefit because it will fix the case in which a CPU is full of DL tasks
>> (possible with global scheduling).
>>
>>> Additionally, running the server when there are only fair tasks
>>> is mostly harmless, right? So why this change?
>>
>> The problem is that we never know when an RT one will arrive :-/
>>
>> E.g., only fair tasks, server enabled. All fine :-) Then an RT arrives, and gets
>> postponed by the server... RT people complaining (for those thinking on adding
>> a server for RT, we will have a similar problem as we have with throttling today +
>> DL has no fixed priority).
> 
> Well, the thing is, the moment you run DL tasks on that machine those RT
> tasks will get delayed *anyway*. RT people need to stop pretending FIFO
> is the highest class.
> 
> But yeah, I can see why they might get upset if the time is then used to
> run FAIR tasks while their RT task gets to wait.

right

>> We will still face the same problem with the deferrable server, and the example is the same,
>> just with a shift in the RT arrival. E.g., only fair tasks for (server period - runtime),
>> then the server gets enabled, and 1us after that the RT task arrives and gets postponed... again.
>>
>> Hence the need to enable it only after the dispatch of a task of a class that
>> can cause starvation (RT, or DL to be added).
>>
>> We could simplify it by enabling it only on RT/DL arrival but... if there is nothing to
>> starve... it is not needed anyway, so we save the overhead by avoiding the enqueue...
> 
> So one thing we could do is have update_curr_fair() decrement
> fair_server's runtime and yield the period when it hits 0 (and capping
> it at 0, not allowing it to go negative or so).
> 
> That way you only force the situation when FAIR hasn't had its allotted
> time this period, and only for as much as to make up for the time it
> lacks.

We can also decrease the runtime to a negative number while in defer/throttle state, and let
the while loop in replenish_dl_entity() replenish with the += runtime;

here:
--- %< ---
        /*
         * We keep moving the deadline away until we get some
         * available runtime for the entity. This ensures correct
         * handling of situations where the runtime overrun is
         * arbitrary large.
         */
        while (dl_se->runtime <= 0) {
                dl_se->deadline += pi_of(dl_se)->dl_period;
                dl_se->runtime += pi_of(dl_se)->dl_runtime;
        }

--- >% ---

it is already done... no?

> 
>>>
>>> One of the benefits was -- IIRC, that we no longer need to subtract some
>>> random margin from the reservation limit, but there were more I think.
>>>
>>
>> We can simplify things because we can start ignoring the rt throttling limit when
>> there is no need to throttle... like on GRUB (but only after we remove the rt
>> throttling).
>>
>> Thoughts?
> 
> The moment the fair server is there (these patches) you should also kill
> the throttling. It doesn't make sense to have both.
>
Re: [PATCH v4 6/7] sched/deadline: Deferrable dl server
Posted by Peter Zijlstra 2 years, 5 months ago
On Wed, Sep 06, 2023 at 04:58:11PM +0200, Daniel Bristot de Oliveira wrote:

> > Yeah, it's a wee hack to move it to the zero-laxity point. I was
> > considering if it makes sense to push that down and make it available
> > for all DL tasks, but I'm not sure..
> 
> It might be useful in the future, like when DL dominates all other schedulers, so
> we can have a way to schedule deferred work, like kworkers... :-) But it might be
> too early for that..

So... that scheme I was pushing where we unconditionally decrement
fair_server.dl_runtime from update_curr_fair(), that relies on it being
a proper zero-laxity scheduler, and doesn't work with the proposed defer
hack.

That is, it relies on dl_runtime > 0 during throttle, and you explicitly
set it to 0.

Now, I've not looked at all this code in detail in a minute, but would
not something like the below work?

AFAICT the regular dl_task_timer() callback works to make it go, because
replenish will see positive runtime (or not, when already consumed) and
DTRT.


Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -657,6 +657,7 @@ struct sched_dl_entity {
 	unsigned int			dl_non_contending : 1;
 	unsigned int			dl_overrun	  : 1;
 	unsigned int			dl_server         : 1;
+	unsigned int			dl_zerolax        : 1;
 
 	/*
 	 * Bandwidth enforcement timer. Each -deadline task has its
Index: linux-2.6/kernel/sched/deadline.c
===================================================================
--- linux-2.6.orig/kernel/sched/deadline.c
+++ linux-2.6/kernel/sched/deadline.c
@@ -895,6 +895,16 @@ static void replenish_dl_entity(struct s
 		dl_se->dl_yielded = 0;
 	if (dl_se->dl_throttled)
 		dl_se->dl_throttled = 0;
+
+	/*
+	 * If this is a zero-laxity task, and we're before the zero-laxity
+	 * point, throttle it.
+	 */
+	if (dl_se->dl_zerolax &&
+	    dl_time_before(dl_se->deadline - dl_se->runtime, rq_clock(rq))) {
+		if (!is_dl_boosted(dl_se) && start_dl_timer(dl_se))
+			dl_se->dl_throttled = 1;
+	}
 }
 
 /*
@@ -1078,7 +1088,12 @@ static int start_dl_timer(struct sched_d
 	 * that it is actually coming from rq->clock and not from
 	 * hrtimer's time base reading.
 	 */
-	act = ns_to_ktime(dl_next_period(dl_se));
+	if (dl_se->dl_zerolax && !dl_se->dl_throttled) {
+		act = ns_to_ktime(dl_se->deadline - dl_se->runtime);
+	} else {
+		act = ns_to_ktime(dl_next_period(dl_se));
+	}
+
 	now = hrtimer_cb_get_time(timer);
 	delta = ktime_to_ns(now) - rq_clock(rq);
 	act = ktime_add_ns(act, delta);
@@ -1794,6 +1809,13 @@ enqueue_dl_entity(struct sched_dl_entity
 		setup_new_dl_entity(dl_se);
 	}
 
+	/*
+	 * If we are still throttled, eg. we got replenished but are a
+	 * zero-laxity task and still got to wait, don't enqueue.
+	 */
+	if (dl_se->dl_throttled)
+		return;
+
 	__enqueue_dl_entity(dl_se);
 }
Re: [PATCH v4 6/7] sched/deadline: Deferrable dl server
Posted by Daniel Bristot de Oliveira 2 years, 5 months ago
On 9/7/23 10:07, Peter Zijlstra wrote:
> On Wed, Sep 06, 2023 at 04:58:11PM +0200, Daniel Bristot de Oliveira wrote:
> 
>>> Yeah, it's a wee hack to move it to the zero-laxity point. I was
>>> considering if it makes sense to push that down and make it available
>>> for all DL tasks, but I'm not sure..
>>
>> It might be useful in the future, like when DL dominates all other schedulers, so
>> we can have a way to schedule deferred work, like kworkers... :-) But it might be
>> too early for that..
> 
> So... that scheme I was pushing where we unconditionally decrement
> fair_server.dl_runtime from update_curr_fair(), that relies on it being
> a proper zero-laxity scheduler, and doesn't work with the proposed defer
> hack.
> 
> That is, it relies on dl_runtime > 0 during throttle, and you explicitly
> set it to 0.
> 
> Now, I've not looked at all this code in detail in a minute, but would
> not something like the below work?
> 
> AFAICT the regular dl_task_timer() callback works to make it go, because
> replenish will see positive runtime (or not, when already consumed) and
> DTRT.
> 
> 
> Index: linux-2.6/include/linux/sched.h
> ===================================================================
> --- linux-2.6.orig/include/linux/sched.h
> +++ linux-2.6/include/linux/sched.h
> @@ -657,6 +657,7 @@ struct sched_dl_entity {
>  	unsigned int			dl_non_contending : 1;
>  	unsigned int			dl_overrun	  : 1;
>  	unsigned int			dl_server         : 1;
> +	unsigned int			dl_zerolax        : 1;
>  
>  	/*
>  	 * Bandwidth enforcement timer. Each -deadline task has its
> Index: linux-2.6/kernel/sched/deadline.c
> ===================================================================
> --- linux-2.6.orig/kernel/sched/deadline.c
> +++ linux-2.6/kernel/sched/deadline.c
> @@ -895,6 +895,16 @@ static void replenish_dl_entity(struct s
>  		dl_se->dl_yielded = 0;
>  	if (dl_se->dl_throttled)
>  		dl_se->dl_throttled = 0;
> +
> +	/*
> +	 * If this is a zero-laxity task, and we're before the zero-laxity
> +	 * point, throttle it.
> +	 */
> +	if (dl_se->dl_zerolax &&
> +	    dl_time_before(dl_se->deadline - dl_se->runtime, rq_clock(rq))) {
> +		if (!is_dl_boosted(dl_se) && start_dl_timer(dl_se))
> +			dl_se->dl_throttled = 1;
> +	}
>  }
>  
>  /*
> @@ -1078,7 +1088,12 @@ static int start_dl_timer(struct sched_d
>  	 * that it is actually coming from rq->clock and not from
>  	 * hrtimer's time base reading.
>  	 */
> -	act = ns_to_ktime(dl_next_period(dl_se));
> +	if (dl_se->dl_zerolax && !dl_se->dl_throttled) {
> +		act = ns_to_ktime(dl_se->deadline - dl_se->runtime);
> +	} else {
> +		act = ns_to_ktime(dl_next_period(dl_se));
> +	}
> +
>  	now = hrtimer_cb_get_time(timer);
>  	delta = ktime_to_ns(now) - rq_clock(rq);
>  	act = ktime_add_ns(act, delta);
> @@ -1794,6 +1809,13 @@ enqueue_dl_entity(struct sched_dl_entity
>  		setup_new_dl_entity(dl_se);
>  	}
>  
> +	/*
> +	 * If we are still throttled, eg. we got replenished but are a
> +	 * zero-laxity task and still got to wait, don't enqueue.
> +	 */
> +	if (dl_se->dl_throttled)
> +		return;
> +
>  	__enqueue_dl_entity(dl_se);
>  }

Let me see if I got it:

	- Always start the server, but throttled with full runtime...
	- Unconditionally decrement fair_server.dl_runtime from update_curr_fair()
		(check if it is not decremented twice as it runs)
	- When the dl timer fires, replenish or throttle for the next period?

is that the base for it?

-- Daniel
Re: [PATCH v4 6/7] sched/deadline: Deferrable dl server
Posted by Peter Zijlstra 2 years, 5 months ago
On Fri, Sep 08, 2023 at 05:28:46PM +0200, Daniel Bristot de Oliveira wrote:

> Let me see if I got it:
> 
> 	- Always start the server, but throttled with full runtime...
> 	- Unconditionally decrement fair_server.dl_runtime from update_curr_fair()
> 		(check if it is not decremented twice as it runs)
> 	- When the dl timer fires, replenish or throttle for the next period?
> 
> is that the base for it?

Yes. So if the dl timer fires and it still has runtime, replenish will not
move the deadline and it will become eligible to run. If it has 0
runtime, replenish does its thing, ups the runtime and moves the deadline
forward, and then the zero-laxity condition will re-throttle, goto 1,
etc..
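
Spelled out, a sketch of the resulting cycle (not from the posted
patches; assuming the dl_zerolax bits sketched upthread):

--- %< ---
/*
 * 1. the server starts throttled, timer armed at (deadline - runtime);
 * 2. the timer fires, dl_task_timer() -> replenish:
 *	- runtime left: the deadline is kept, the server is enqueued
 *	  and becomes eligible under EDF;
 *	- runtime == 0: replenish ups the runtime and pushes the
 *	  deadline one period forward; the zero-laxity check
 *	  re-throttles; goto 1.
 */
--- >% ---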
Re: [PATCH v4 6/7] sched/deadline: Deferrable dl server
Posted by Peter Zijlstra 2 years, 5 months ago
On Wed, Sep 06, 2023 at 04:58:11PM +0200, Daniel Bristot de Oliveira wrote:

> > So one thing we could do is have update_curr_fair() decrement
> > fair_server's runtime and yield the period when it hits 0 (and capping
> > it at 0, not allowing it to go negative or so).
> > 
> > That way you only force the situation when FAIR hasn't had its allotted
> > time this period, and only for as much as to make up for the time it
> > lacks.
> 
> > We can also decrease the runtime to a negative number while in
> > defer/throttle state, and let the while loop in replenish_dl_entity()
> > replenish with the += runtime;

Yes, but my point was that fair_server gives a lower bound of runtime
per period, more -- if available -- is fine.

If we allow negative runtime, you'll affect future periods, and that is
not desired in this case.

Or am I still confused?
Re: [PATCH v4 6/7] sched/deadline: Deferrable dl server
Posted by Daniel Bristot de Oliveira 2 years, 5 months ago
On 9/6/23 22:04, Peter Zijlstra wrote:
> On Wed, Sep 06, 2023 at 04:58:11PM +0200, Daniel Bristot de Oliveira wrote:
> 
>>> So one thing we could do is have update_curr_fair() decrement
>>> fair_server's runtime and yield the period when it hits 0 (and capping
>>> it at 0, not allowing it to go negative or so).
>>>
>>> That way you only force the situation when FAIR hasn't had its allotted
>>> time this period, and only for as much as to make up for the time it
>>> lacks.
>>
>> We can also decrease the runtime to a negative number while in
>> defer/throttle state, and let the while loop in replenish_dl_entity()
>> replenish with the += runtime;

Replying in sequence... but mostly to try to understand/explain my point (we might even
be in agreement, but touching different parts of the code).

> Yes, but my point was that fair_server gives a lower bound of runtime
> per period, more -- if available -- is fine.

I am targeting that as well, and it works for the case in which we have only RT
tasks causing starvation.

If we have other DL tasks, we cannot force the fair server to run till
completion because it would add a U=1 to the system. Like, if we have a
50ms server runtime... BOOM, we will miss lots of deadlines of regular DL
tasks with a 1 ms period. I do not think it is worth breaking deadlines
to give the fair server time immediately. So, the fair server is
scheduled as a periodic DL task.
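
(For scale, with the 50ms figure above over the fair server's 1s
period: scheduled as a periodic DL task, the server only adds
U = 50ms/1s = 0.05; forced to run to completion immediately, it
behaves as U = 1 for 50ms, enough for a 1ms-period DL task to miss
on the order of 50 consecutive deadlines.)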

After the initial defer state, the DL server will get its runtime/period
even with a CPU load of DL tasks. But:

	- We do not usually have such a high load of DL tasks anyway
	- If one cares about it more, they can reduce the runtime/period
	  granularity to mitigate the defer time
	- If one does not care about RT tasks, just disable the defer mechanism

So I think we are well covered, without having to break the basic CBS+EDF assumptions
(like that a task will not add a higher load than its U).


> If we allow negative runtime, you'll affect future periods, and that is
> not desired in this case.

I think that I need to clarify this. I was thinking of this case:

	- Fair server deferred
	- If the server gets some time to run while waiting for the 0-lax
	  time, we decrease the runtime...
	- When the deferred server starts, the replenish will happen, and
	  will += runtime, giving it the correct proportional time left for
	  the period for which the timer was armed. So it is not the next
	  period, it is the delayed period.
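
A worked example of that accounting (hypothetical numbers, assuming a
50ms runtime over a 1s period):

--- %< ---
/*
 *	defer:				runtime = 0, timer armed at now + 950ms
 *	runs 20ms while waiting:	runtime = -20ms
 *	timer fires, replenish:		runtime += 50ms  ->  30ms
 *
 * i.e., the server gets the proportional time left for the delayed
 * period, not a fresh full budget.
 */
--- >% ---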

So I think we are thinking the same thing... just with a shift.

> 
> Or am I still confused?
> 

You are not alone... there are many states and I fear that I might be focusing
on a different state.

-- Daniel
Re: [PATCH v4 6/7] sched/deadline: Deferrable dl server
Posted by Peter Zijlstra 2 years, 5 months ago
On Wed, Sep 06, 2023 at 10:04:06PM +0200, Peter Zijlstra wrote:
> On Wed, Sep 06, 2023 at 04:58:11PM +0200, Daniel Bristot de Oliveira wrote:
> 
> > > So one thing we could do is have update_curr_fair() decrement
> > > fair_server's runtime and yield the period when it hits 0 (and capping
> > > it at 0, not allowing it to go negative or so).
> > > 
> > > That way you only force the situation when FAIR hasn't had its allotted
> > > time this period, and only for as much as to make up for the time it
> > > lacks.
> > 
> > We can also decrease the runtime to a negative number while in
> > defer/throttle state, and let the while loop in replenish_dl_entity()
> > replenish with the += runtime;
> 
> Yes, but my point was that fair_server gives a lower bound of runtime
> per period, more -- if available -- is fine.
> 
> If we allow negative runtime, you'll affect future periods, and that is
> not desired in this case.
> 
> Or am I still confused?

That is, let update_curr_fair() decrement fair_server runtime
*unconditionally* -- even if the task was not selected through the
server.

Specifically, if the fair task is selected normally due to a lack of
deadline tasks, that runtime *still* counts towards the fair server, and
the server yields the period when it hits zero.

This means that fair_server is only effective IFF 'normal' execution
doesn't match the fair_server.runtime execution.
Re: [PATCH v4 6/7] sched/deadline: Deferrable dl server
Posted by Daniel Bristot de Oliveira 2 years, 5 months ago
On 9/6/23 22:08, Peter Zijlstra wrote:
> On Wed, Sep 06, 2023 at 10:04:06PM +0200, Peter Zijlstra wrote:
>> On Wed, Sep 06, 2023 at 04:58:11PM +0200, Daniel Bristot de Oliveira wrote:
>>
>>>> So one thing we could do is have update_curr_fair() decrement
>>>> fair_server's runtime and yield the period when it hits 0 (and capping
>>>> it at 0, not allowing it to go negative or so).
>>>>
>>>> That way you only force the situation when FAIR hasn't had its allotted
>>>> time this period, and only for as much as to make up for the time it
>>>> lacks.
>>>
>>> We can also decrease the runtime to a negative number while in
>>> defer/throttle state, and let the while loop in replenish_dl_entity()
>>> replenish with the += runtime;
>>
>> Yes, but my point was that fair_server gives a lower bound of runtime
>> per period, more -- if available -- is fine.
>>
>> If we allow negative runtime, you'll affect future periods, and that is
>> not desired in this case.
>>
>> Or am I still confused?
> 
> That is, let update_curr_fair() decrement fair_server runtime
> *unconditionally* -- even if the task was not selected through the
> server.

Ah, I see! But then we would also have to consider the period, and control a
period... without SCHED_DEADLINE watching us...

I was considering only the "waiting for the 0-lag time to start running (after
being armed)"

If there is no need for the server to be armed... do nothing :-) If it is armed,
reduce the amount of time the fair server could get.

> Specifically, if the fair task is selected normally due to a lack of
> deadline tasks, that runtime *still* counts towards the fair server, and
> the server yields the period when it hits zero.
> 
> This means that fair_server is only effective IFF 'normal' execution
> doesn't match the fair_server.runtime execution.

I see! I *think* it will cause more overhead than doing nothing unless there
is something that can cause starvation. But I need to think more.

-- Daniel