[PATCH v5 6/7] sched/deadline: Deferrable dl server

Daniel Bristot de Oliveira posted 7 patches 2 years, 1 month ago
There is a newer version of this series
[PATCH v5 6/7] sched/deadline: Deferrable dl server
Posted by Daniel Bristot de Oliveira 2 years, 1 month ago
Among the motivations for the DL servers is the real-time throttling
mechanism. This mechanism works by throttling the rt_rq after
running for a long period without leaving space for fair tasks.

The base dl server avoids this problem by boosting fair tasks instead
of throttling the rt_rq. The point is that it boosts without waiting
for potential starvation, causing some non-intuitive cases.

For example, an IRQ dispatches two tasks on an idle system, a fair
and an RT. The DL server will be activated, running the fair task
before the RT one. This problem can be avoided by deferring the
dl server activation.

By setting the zerolax option, the dl_server will dispatch an
SCHED_DEADLINE reservation with replenished runtime, but throttled.

The dl_timer will be set for (period - runtime) ns from start time.
Thus boosting the fair rq on its 0-laxity time with respect to
rt_rq.

If the fair scheduler has the opportunity to run while waiting
for zerolax time, the dl server runtime will be consumed. If
the runtime is completely consumed before the zerolax time, the
server will be replenished while still in a throttled state. Then,
the dl_timer will be reset to the new zerolax time

If the fair server reaches the zerolax time without consuming
its runtime, the server will be boosted, following CBS rules
(thus without breaking SCHED_DEADLINE).

Signed-off-by: Daniel Bristot de Oliveira <bristot@kernel.org>
---
 include/linux/sched.h   |   2 +
 kernel/sched/deadline.c | 100 +++++++++++++++++++++++++++++++++++++++-
 kernel/sched/fair.c     |   3 ++
 3 files changed, 103 insertions(+), 2 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5ac1f252e136..56e53e6fd5a0 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -660,6 +660,8 @@ struct sched_dl_entity {
 	unsigned int			dl_non_contending : 1;
 	unsigned int			dl_overrun	  : 1;
 	unsigned int			dl_server         : 1;
+	unsigned int			dl_zerolax	  : 1;
+	unsigned int			dl_zerolax_armed  : 1;
 
 	/*
 	 * Bandwidth enforcement timer. Each -deadline task has its
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 1d7b96ca9011..69ee1fbd60e4 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -772,6 +772,14 @@ static inline void replenish_dl_new_period(struct sched_dl_entity *dl_se,
 	/* for non-boosted task, pi_of(dl_se) == dl_se */
 	dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline;
 	dl_se->runtime = pi_of(dl_se)->dl_runtime;
+
+	/*
+	 * If it is a zerolax reservation, throttle it.
+	 */
+	if (dl_se->dl_zerolax) {
+		dl_se->dl_throttled = 1;
+		dl_se->dl_zerolax_armed = 1;
+	}
 }
 
 /*
@@ -828,6 +836,7 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se)
  * could happen are, typically, a entity voluntarily trying to overcome its
  * runtime, or it just underestimated it during sched_setattr().
  */
+static int start_dl_timer(struct sched_dl_entity *dl_se);
 static void replenish_dl_entity(struct sched_dl_entity *dl_se)
 {
 	struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
@@ -874,6 +883,28 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se)
 		dl_se->dl_yielded = 0;
 	if (dl_se->dl_throttled)
 		dl_se->dl_throttled = 0;
+
+	/*
+	 * If this is the replenishment of a zerolax reservation,
+	 * clear the flag and return.
+	 */
+	if (dl_se->dl_zerolax_armed) {
+		dl_se->dl_zerolax_armed = 0;
+		return;
+	}
+
+	/*
+	 * A this point, if the zerolax server is not armed, and the deadline
+	 * is in the future, throttle the server and arm the zerolax timer.
+	 */
+	if (dl_se->dl_zerolax &&
+	    dl_time_before(dl_se->deadline - dl_se->runtime, rq_clock(rq))) {
+		if (!is_dl_boosted(dl_se)) {
+			dl_se->dl_zerolax_armed = 1;
+			dl_se->dl_throttled = 1;
+			start_dl_timer(dl_se);
+		}
+	}
 }
 
 /*
@@ -1024,6 +1055,13 @@ static void update_dl_entity(struct sched_dl_entity *dl_se)
 		}
 
 		replenish_dl_new_period(dl_se, rq);
+	} else if (dl_server(dl_se) && dl_se->dl_zerolax) {
+		/*
+		 * The server can still use its previous deadline, so throttle
+		 * and arm the zero-laxity timer.
+		 */
+		dl_se->dl_zerolax_armed = 1;
+		dl_se->dl_throttled = 1;
 	}
 }
 
@@ -1056,8 +1094,20 @@ static int start_dl_timer(struct sched_dl_entity *dl_se)
 	 * We want the timer to fire at the deadline, but considering
 	 * that it is actually coming from rq->clock and not from
 	 * hrtimer's time base reading.
+	 *
+	 * The zerolax reservation will have its timer set to the
+	 * deadline - runtime. At that point, the CBS rule will decide
+	 * if the current deadline can be used, or if a replenishment
+	 * is required to avoid add too much pressure on the system
+	 * (current u > U).
 	 */
-	act = ns_to_ktime(dl_next_period(dl_se));
+	if (dl_se->dl_zerolax_armed) {
+		WARN_ON_ONCE(!dl_se->dl_throttled);
+		act = ns_to_ktime(dl_se->deadline - dl_se->runtime);
+	} else {
+		act = ns_to_ktime(dl_next_period(dl_se));
+	}
+
 	now = hrtimer_cb_get_time(timer);
 	delta = ktime_to_ns(now) - rq_clock(rq);
 	act = ktime_add_ns(act, delta);
@@ -1333,6 +1383,9 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64
 		return;
 	}
 
+	if (dl_server(dl_se) && dl_se->dl_throttled && !dl_se->dl_zerolax)
+		return;
+
 	if (dl_entity_is_special(dl_se))
 		return;
 
@@ -1356,6 +1409,39 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64
 
 	dl_se->runtime -= scaled_delta_exec;
 
+	/*
+	 * The fair server can consume its runtime while throttled (not queued).
+	 *
+	 * If the server consumes its entire runtime in this state. The server
+	 * is not required for the current period. Thus, reset the server by
+	 * starting a new period, pushing the activation to the zero-lax time.
+	 */
+	if (dl_se->dl_zerolax && dl_se->dl_throttled && dl_runtime_exceeded(dl_se)) {
+		s64 runtime_diff = dl_se->runtime + dl_se->dl_runtime;
+
+		/*
+		 * If this is a regular throttling case, let it run negative until
+		 * the dl_runtime - runtime > 0. The reason being is that the next
+		 * replenishment will result in a positive runtime one period ahead.
+		 *
+		 * Otherwise, the deadline will be pushed more than one period, not
+		 * providing runtime/period anymore.
+		 *
+		 * If the dl_runtime - runtime < 0, then the server was able to get
+		 * the runtime/period before the replenishment. So it is safe
+		 * to start a new deffered period.
+		 */
+		if (!dl_se->dl_zerolax_armed && runtime_diff > 0)
+			return;
+
+		hrtimer_try_to_cancel(&dl_se->dl_timer);
+
+		replenish_dl_new_period(dl_se, dl_se->rq);
+		start_dl_timer(dl_se);
+
+		return;
+	}
+
 throttle:
 	if (dl_runtime_exceeded(dl_se) || dl_se->dl_yielded) {
 		dl_se->dl_throttled = 1;
@@ -1432,6 +1518,9 @@ void dl_server_start(struct sched_dl_entity *dl_se)
 void dl_server_stop(struct sched_dl_entity *dl_se)
 {
 	dequeue_dl_entity(dl_se, DEQUEUE_SLEEP);
+	hrtimer_try_to_cancel(&dl_se->dl_timer);
+	dl_se->dl_zerolax_armed = 0;
+	dl_se->dl_throttled = 0;
 }
 
 void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
@@ -1743,7 +1832,7 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags)
 	 * be counted in the active utilization; hence, we need to call
 	 * add_running_bw().
 	 */
-	if (dl_se->dl_throttled && !(flags & ENQUEUE_REPLENISH)) {
+	if (!dl_se->dl_zerolax && dl_se->dl_throttled && !(flags & ENQUEUE_REPLENISH)) {
 		if (flags & ENQUEUE_WAKEUP)
 			task_contending(dl_se, flags);
 
@@ -1765,6 +1854,13 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags)
 		setup_new_dl_entity(dl_se);
 	}
 
+	/*
+	 * If we are still throttled, eg. we got replenished but are a
+	 * zero-laxity task and still got to wait, don't enqueue.
+	 */
+	if (dl_se->dl_throttled && start_dl_timer(dl_se))
+		return;
+
 	__enqueue_dl_entity(dl_se);
 }
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b15f7f376a67..399237cd9f59 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1201,6 +1201,8 @@ static void update_curr(struct cfs_rq *cfs_rq)
 		account_group_exec_runtime(curtask, delta_exec);
 		if (curtask->server)
 			dl_server_update(curtask->server, delta_exec);
+		else
+			dl_server_update(&rq_of(cfs_rq)->fair_server, delta_exec);
 	}
 
 	account_cfs_rq_runtime(cfs_rq, delta_exec);
@@ -8421,6 +8423,7 @@ void fair_server_init(struct rq *rq)
 	dl_se->dl_runtime = 50 * NSEC_PER_MSEC;
 	dl_se->dl_deadline = 1000 * NSEC_PER_MSEC;
 	dl_se->dl_period = 1000 * NSEC_PER_MSEC;
+	dl_se->dl_zerolax = 1;
 
 	dl_server_init(dl_se, rq, fair_server_has_tasks, fair_server_pick);
 }
-- 
2.40.1
Re: [PATCH v5 6/7] sched/deadline: Deferrable dl server
Posted by Joel Fernandes 1 year, 9 months ago

On 11/4/2023 6:59 AM, Daniel Bristot de Oliveira wrote:
> Among the motivations for the DL servers is the real-time throttling
> mechanism. This mechanism works by throttling the rt_rq after
> running for a long period without leaving space for fair tasks.
> 
> The base dl server avoids this problem by boosting fair tasks instead
> of throttling the rt_rq. The point is that it boosts without waiting
> for potential starvation, causing some non-intuitive cases.
> 
> For example, an IRQ dispatches two tasks on an idle system, a fair
> and an RT. The DL server will be activated, running the fair task
> before the RT one. This problem can be avoided by deferring the
> dl server activation.
> 
> By setting the zerolax option, the dl_server will dispatch an
> SCHED_DEADLINE reservation with replenished runtime, but throttled.
> 
> The dl_timer will be set for (period - runtime) ns from start time.
> Thus boosting the fair rq on its 0-laxity time with respect to
> rt_rq.
> 
> If the fair scheduler has the opportunity to run while waiting
> for zerolax time, the dl server runtime will be consumed. If
> the runtime is completely consumed before the zerolax time, the
> server will be replenished while still in a throttled state. Then,
> the dl_timer will be reset to the new zerolax time
> 
> If the fair server reaches the zerolax time without consuming
> its runtime, the server will be boosted, following CBS rules
> (thus without breaking SCHED_DEADLINE).
> 
> Signed-off-by: Daniel Bristot de Oliveira <bristot@kernel.org>

Hi, Daniel,
We have one additional patch (other than the 15 I just sent). Since I have just
3 more working days for the next 3 weeks, I thought I might as well reply inline
here since it might be unnecessary to resend all 15 patches so soon just for the
one new addition below. I am replying to this patch here, because the new patch
is related (to 0-laxity).  But once I am back from holiday, I can resend it with
the set I have unless you've applied it.

So, Vineeth and me came up with a patch below to "max cap" the DL server 0-lax
time (max cap is default off keeping the regular behavior). This is needed to
guarantee bandwidth for periodic CFS runners/sleepers.

The example usecase is:

Consider DL server params 25ms / 50ms.

Consider CFS task with duty cycle of 25ms / 76ms (run 25ms sleep 51ms).

         run 25ms                    run 25ms
         _______                     _______
        |       | sleep 51          |       |  sleep 51
-|------|-------|---------|---------|-------|----------|--------|------> t
 0     25      50       101        126      151       202      227
                          \ 0-lax /                    \ 0-lax /

Here the 0-lax addition in the original v5's zero-lax patch causes lesser bandwidth.

So the task runs 50ms every 227ms, instead of 50ms every 152ms.

A simple unit test confirms the issue, and it is fixed by Vineeth's patch below:

Please take a look at the patch below (applies only to v5.15 but Vineeth is
rebase on mainline as we speak), thanks.

-----8<--------
From: Vineeth Pillai (Google) <vineeth@bitbyteword.org>
Subject: [PATCH] sched/deadline/dlserver: sysctl for dlserver maxdefer time

Inorder to avoid dlserver preempting RT tasks when it wakes up, dlserver
is throttled(deferred) until zero lax time. This is the farthest time
before deadline where dlserver can meet its deadline.

Zero lax time causes cfs tasks with sleep/run pattern where the cfs
tasks doesn't get the bandwidth promised by dlserver. So introduce a
sysctl for limiting the defer time of dlserver.

Suggested-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Vineeth Pillai (Google) <vineeth@bitbyteword.org>
---
 include/linux/sched/sysctl.h | 2 ++
 kernel/sched/deadline.c      | 6 ++++++
 kernel/sysctl.c              | 7 +++++++
 3 files changed, 15 insertions(+)

diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 4939e6128840..a27fba6fe0ab 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -41,6 +41,8 @@ extern unsigned int sysctl_iowait_apply_ticks;
 extern unsigned int sysctl_sched_dl_period_max;
 extern unsigned int sysctl_sched_dl_period_min;
 +extern unsigned int sysctl_sched_dlserver_maxdefer_ms;
+
 #ifdef CONFIG_UCLAMP_TASK
 extern unsigned int sysctl_sched_uclamp_util_min;
 extern unsigned int sysctl_sched_uclamp_util_max;
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index d638cc5b45c7..69c9fd80a67d 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1071,6 +1071,11 @@ static int start_dl_timer(struct sched_dl_entity *dl_se)
 	if (dl_se->dl_defer_armed) {
 		WARN_ON_ONCE(!dl_se->dl_throttled);
 		act = ns_to_ktime(dl_se->deadline - dl_se->runtime);
+		if (sysctl_sched_dlserver_maxdefer_ms) {
+			ktime_t dlserver_maxdefer = rq_clock(rq) +
ms_to_ktime(sysctl_sched_dlserver_maxdefer_ms);
+			if (ktime_after(act, dlserver_maxdefer))
+				act = dlserver_maxdefer;
+		}
 	} else {
 		act = ns_to_ktime(dl_next_period(dl_se));
 	}
@@ -3099,6 +3104,7 @@ void __getparam_dl(struct task_struct *p, struct
sched_attr *attr)
  */
 unsigned int sysctl_sched_dl_period_max = 1 << 22; /* ~4 seconds */
 unsigned int sysctl_sched_dl_period_min = 100;     /* 100 us */
+unsigned int sysctl_sched_dlserver_maxdefer_ms = 2;
  /*
  * This function validates the new parameters of a -deadline task.
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 39f47a871fb4..027193302e7e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1842,6 +1842,13 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
+	{
+		.procname	= "sched_dlserver_maxdefer_ms",
+		.data		= &sysctl_sched_dlserver_maxdefer_ms,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
 	{
 		.procname	= "sched_rr_timeslice_ms",
 		.data		= &sysctl_sched_rr_timeslice,
-- 
2.40.1
Re: [PATCH v5 6/7] sched/deadline: Deferrable dl server
Posted by Daniel Bristot de Oliveira 1 year, 9 months ago
On 3/20/24 01:03, Joel Fernandes wrote:
> 
> 
> On 11/4/2023 6:59 AM, Daniel Bristot de Oliveira wrote:
>> Among the motivations for the DL servers is the real-time throttling
>> mechanism. This mechanism works by throttling the rt_rq after
>> running for a long period without leaving space for fair tasks.
>>
>> The base dl server avoids this problem by boosting fair tasks instead
>> of throttling the rt_rq. The point is that it boosts without waiting
>> for potential starvation, causing some non-intuitive cases.
>>
>> For example, an IRQ dispatches two tasks on an idle system, a fair
>> and an RT. The DL server will be activated, running the fair task
>> before the RT one. This problem can be avoided by deferring the
>> dl server activation.
>>
>> By setting the zerolax option, the dl_server will dispatch an
>> SCHED_DEADLINE reservation with replenished runtime, but throttled.
>>
>> The dl_timer will be set for (period - runtime) ns from start time.
>> Thus boosting the fair rq on its 0-laxity time with respect to
>> rt_rq.
>>
>> If the fair scheduler has the opportunity to run while waiting
>> for zerolax time, the dl server runtime will be consumed. If
>> the runtime is completely consumed before the zerolax time, the
>> server will be replenished while still in a throttled state. Then,
>> the dl_timer will be reset to the new zerolax time
>>
>> If the fair server reaches the zerolax time without consuming
>> its runtime, the server will be boosted, following CBS rules
>> (thus without breaking SCHED_DEADLINE).

notice: at this point in history, the term zero-laxity was removed from
the latest code we have, the term was moved to defer server... I will
remove from the long in the next time I send it.

>> Signed-off-by: Daniel Bristot de Oliveira <bristot@kernel.org>
> 
> Hi, Daniel,
> We have one additional patch (other than the 15 I just sent).

Those 15 are the next thing I will review... I was working in the merge window.

Since I have just
> 3 more working days for the next 3 weeks, I thought I might as well reply inline
> here since it might be unnecessary to resend all 15 patches so soon just for the
> one new addition below. I am replying to this patch here, because the new patch
> is related (to 0-laxity).  But once I am back from holiday, I can resend it with
> the set I have unless you've applied it.

before starting... in three weeks, we will be very far still from any attempt to
ping Peter & ingo to ask if they could think about putting us into a queue.
Have fun :-)

as I explained at LPC.. and in chat... there is no "0-laxity", and repeating it
only creates more confusion. Let's use a deferred server... a regular deadline
reservation with deferred starting time at (now + period - runtime), and at that point
il will receive a new deadline one period away - (not one runtime away).

There will always be a person reading these emails and echoing the wrong things...
using 0-lax/0-laxity term here is a lose-lose.

> So, Vineeth and me came up with a patch below to "max cap" the DL server 0-lax
> time (max cap is default off keeping the regular behavior). This is needed to
> guarantee bandwidth for periodic CFS runners/sleepers.

Another point... "guarantee bandwidth"... the bandwith is provided under certain conditions.
If the conditions are not respected, the guarantee a dl reservation will provide is that
the task will not put a utilization higher than the one requested, so yes, a dl reservation
can and will enforce a lower utilization if the task does not respect the conditions.
Also, if the reservation is ready, but no task is ready...

> 
> The example usecase is:
> 
> Consider DL server params 25ms / 50ms.
> 
> Consider CFS task with duty cycle of 25ms / 76ms (run 25ms sleep 51ms).

define duty... like, runtime 25, period 76? sleeps for 51 relative to a starting time
or not?

there are some holes in your explanation, it is tricky to reply inline for these cases...
I am continuing but....

> 
>          run 25ms                    run 25ms
>          _______                     _______
>         |       | sleep 51          |       |  sleep 51
> -|------|-------|---------|---------|-------|----------|--------|------> t
>  0     25      50       101        126      151       202      227
>                           \ 0-lax /                    \ 0-lax /


trying to understand...

at time 0 the task is activated... RT tasks are spinning... assuming that so the
server was deferred to 25?

At 25, it becomes DL, with a new deadline of 75. If there is no other DL
task, run [25..50].

at 75 the task would have another 25 ms... but it decided not to run, throwing
away 25 ms until 100. At this point, I would say: is not an odd setup....

At point 100, the deferred server assumes that the starvation condition is gone,
and goes to the initial state.

now, that 0-lax means what? the zero-laxity time for the task... but not
from its start time, but from the beginning of the deferred server = 25 + 76 = 101
... also 0-lax is a range?

and here, the system seems to start the same cycle with a 1 shift to repeat the
case that the task and the reservation did not match.

looking back, for the first cycle... with defer at 75, without at point 50, there
would be time for the server to run, but the task is just ready, so runtime is
thrown away.

So, this miss match between the configuration and the task setup is... clearly
causing runtime to be wasted... but one can change the parameter for them
to be better in sync...

and here, I assume that there is something missing in the explanation because...


> Here the 0-lax addition in the original v5's zero-lax patch causes lesser bandwidth.

which addition?

> So the task runs 50ms every 227ms, instead of 50ms every 152ms.

...

> 
> A simple unit test confirms the issue, and it is fixed by Vineeth's patch below:
> 
> Please take a look at the patch below (applies only to v5.15 but Vineeth is
> rebase on mainline as we speak), thanks.
> 
> -----8<--------
> From: Vineeth Pillai (Google) <vineeth@bitbyteword.org>
> Subject: [PATCH] sched/deadline/dlserver: sysctl for dlserver maxdefer time
> 
> Inorder to avoid dlserver preempting RT tasks when it wakes up, dlserver
> is throttled(deferred) until zero lax time. This is the farthest time
> before deadline where dlserver can meet its deadline.
> 
> Zero lax time causes cfs tasks with sleep/run pattern where the cfs
> tasks doesn't get the bandwidth promised by dlserver. So introduce a
> sysctl for limiting the defer time of dlserver.

so... that explanation before to reach to the conclusion that limiting the
amount of time to defer the server is a fix? there is a huge gap here.

> 
> Suggested-by: Joel Fernandes (Google) <joel@joelfernandes.org>
> Signed-off-by: Vineeth Pillai (Google) <vineeth@bitbyteword.org>
> ---
>  include/linux/sched/sysctl.h | 2 ++
>  kernel/sched/deadline.c      | 6 ++++++
>  kernel/sysctl.c              | 7 +++++++
>  3 files changed, 15 insertions(+)
> 
> diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
> index 4939e6128840..a27fba6fe0ab 100644
> --- a/include/linux/sched/sysctl.h
> +++ b/include/linux/sched/sysctl.h
> @@ -41,6 +41,8 @@ extern unsigned int sysctl_iowait_apply_ticks;
>  extern unsigned int sysctl_sched_dl_period_max;
>  extern unsigned int sysctl_sched_dl_period_min;
>  +extern unsigned int sysctl_sched_dlserver_maxdefer_ms;
> +
>  #ifdef CONFIG_UCLAMP_TASK
>  extern unsigned int sysctl_sched_uclamp_util_min;
>  extern unsigned int sysctl_sched_uclamp_util_max;
> diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
> index d638cc5b45c7..69c9fd80a67d 100644
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -1071,6 +1071,11 @@ static int start_dl_timer(struct sched_dl_entity *dl_se)
>  	if (dl_se->dl_defer_armed) {
>  		WARN_ON_ONCE(!dl_se->dl_throttled);
>  		act = ns_to_ktime(dl_se->deadline - dl_se->runtime);
> +		if (sysctl_sched_dlserver_maxdefer_ms) {
> +			ktime_t dlserver_maxdefer = rq_clock(rq) +
> ms_to_ktime(sysctl_sched_dlserver_maxdefer_ms);
> +			if (ktime_after(act, dlserver_maxdefer))
> +				act = dlserver_maxdefer;


that is, having a global limit... we have a per-cpu set of variables,
that is bounded by a global limit.

<joking>
It is already hard to put in sync with two parameters. Now we need a
third one that is global :-)
<joking>

reading the code was actually more instructive than reading the comments.
The good point is that this puts a nail in the coffin of "zerolax" :-)

that is, you all want also to control for how long the defer happens. Now
it is fixed... (period - runtime). You all would like to have the ability
to set it so something closer, so to defer less.

That phrase is simple :-)

It is already possible! how? This can be done by adjusting runtime/period.
Which are already per CPU. The DL server does not need to give an entire
chuck all at once; one can split it into smaller runtime/period slices,
as EEVDF does. And here we get back to the things we talked about when
trying to use EDF... but now it is only for one task :-) It is easier.

This also reduces the amount of time thrown away because there is no
task ready. It is the main cause of time wasted anyway. And the CFS
scheduler is really aperiodic, in practice, these more corner
case timelines will hardly happen... unless you try to force with
a simple test case.

now... could I change the defer per-cpu option to have a value between
0 and (period - runtime) so we could defer less than (period - runtime),
with proper check and granularity (per cpu) in the next version I send
for the interface.... maybe... it is less prone to have a no no from
people who care a lot about the interface.

But I am asking myself: is it worth the complexity? I would try first
getting used with the runtime/period setup only...

I will start reviewing the other patches as soon as the worries
of the merge window passes away. Hopefully tomorrow.

-- Daniel
Re: [PATCH v5 6/7] sched/deadline: Deferrable dl server
Posted by Joel Fernandes 1 year, 9 months ago
Hello Daniel,
Thank you for the reply and I replied below.

On 3/20/2024 3:24 PM, Daniel Bristot de Oliveira wrote:
> On 3/20/24 01:03, Joel Fernandes wrote:
>>
>>
>> On 11/4/2023 6:59 AM, Daniel Bristot de Oliveira wrote:
>>> Among the motivations for the DL servers is the real-time throttling
>>> mechanism. This mechanism works by throttling the rt_rq after
>>> running for a long period without leaving space for fair tasks.
>>>
>>> The base dl server avoids this problem by boosting fair tasks instead
>>> of throttling the rt_rq. The point is that it boosts without waiting
>>> for potential starvation, causing some non-intuitive cases.
>>>
>>> For example, an IRQ dispatches two tasks on an idle system, a fair
>>> and an RT. The DL server will be activated, running the fair task
>>> before the RT one. This problem can be avoided by deferring the
>>> dl server activation.
>>>
>>> By setting the zerolax option, the dl_server will dispatch an
>>> SCHED_DEADLINE reservation with replenished runtime, but throttled.
>>>
>>> The dl_timer will be set for (period - runtime) ns from start time.
>>> Thus boosting the fair rq on its 0-laxity time with respect to 

Note your patch changelog. --- (1)

>>> rt_rq.
>>>
>>> If the fair scheduler has the opportunity to run while waiting
>>> for zerolax time, the dl server runtime will be consumed. If
>>> the runtime is completely consumed before the zerolax time, the

Note your patch changelog. --- (2)

>>> server will be replenished while still in a throttled state. Then,
>>> the dl_timer will be reset to the new zerolax time
>>>
>>> If the fair server reaches the zerolax time without consuming
>>> its runtime, the server will be boosted, following CBS rules
>>> (thus without breaking SCHED_DEADLINE).
> 
> notice: at this point in history, the term zero-laxity was removed from
> the latest code we have, the term was moved to defer server... I will
> remove from the long in the next time I send it.

I am confused because your change log in your v5 (and the v6 in kernel.org)
mentions "zero-laxity" all over the place (example see above for (1) and (2)).
Further, the terminology is the least of the problems unfortunately (more
below). Call it defer or whatever but it has to work. :)

>>> Signed-off-by: Daniel Bristot de Oliveira <bristot@kernel.org>
>>
>> Hi, Daniel,
>> We have one additional patch (other than the 15 I just sent).
> 
> Those 15 are the next thing I will review... I was working in the merge window.

Me too ;-) It has been a fun merge window.

> Since I have just
>> 3 more working days for the next 3 weeks, I thought I might as well reply inline
>> here since it might be unnecessary to resend all 15 patches so soon just for the
>> one new addition below. I am replying to this patch here, because the new patch
>> is related (to 0-laxity).  But once I am back from holiday, I can resend it with
>> the set I have unless you've applied it.
> 
> before starting... in three weeks, we will be very far still from any attempt to
> ping Peter & ingo to ask if they could think about putting us into a queue.

I wasn't really expecting an imminent "ping for merge" to be honest. My main
motivation with posting it before my 3 week off time was to trigger some review
and discussion here (which we desperately need more off IMO ;)). I am not in a
hurry to request for a merge without it working correctly either.

> Have fun :-)

Thanks!

> as I explained at LPC.. and in chat... there is no "0-laxity", and repeating it
> only creates more confusion.

We can stick to defer but could you please update your changelog as well. I
don't think you posted an update to the patch since November either so maybe you
should do that as well, with the patches adjusted with the bugs we found. I do
agree that using zero-laxity creates a source of confusion and prefer "defer".

> Let's use a deferred server... a regular deadline
> reservation with deferred starting time at (now + period - runtime), and at that point
> il will receive a new deadline one period away - (not one runtime away).
> 
> There will always be a person reading these emails and echoing the wrong things...
> using 0-lax/0-laxity term here is a lose-lose.

Agreed, so why not update your patch changelog to correct that (or post a new
revision)?

>> So, Vineeth and me came up with a patch below to "max cap" the DL server 0-lax
>> time (max cap is default off keeping the regular behavior). This is needed to
>> guarantee bandwidth for periodic CFS runners/sleepers.
> 
> Another point... "guarantee bandwidth"... the bandwith is provided under certain conditions.
> If the conditions are not respected, the guarantee a dl reservation will provide is that
> the task will not put a utilization higher than the one requested, so yes, a dl reservation
> can and will enforce a lower utilization if the task does not respect the conditions.
> Also, if the reservation is ready, but no task is ready...

Please clarify what conditions you mean? The conditions I am looking for are
those given by RT throttling (see below). i.e., if RT takes up all of the CPU in
a certain amount of time, then there is a certain amount reserved for CFS.

>> The example usecase is:
>>
>> Consider DL server params 25ms / 50ms.
>>
>> Consider CFS task with duty cycle of 25ms / 76ms (run 25ms sleep 51ms).
> 
> define duty... like, runtime 25, period 76? sleeps for 51 relative to a starting time
> or not?

There is no starting time. The CFS task in the quoted example only does run +
sleep. Run for 25ms and sleep for 51ms, then run again for 25ms, etc.

> 
> there are some holes in your explanation, it is tricky to reply inline for these cases...
> I am continuing but....
> 
>>
>>          run 25ms                    run 25ms
>>          _______                     _______
>>         |       | sleep 51          |       |  sleep 51
>> -|------|-------|---------|---------|-------|----------|--------|------> t
>>  0     25      50       101        126      151       202      227
>>                           \ 0-lax /                    \ 0-lax /

So before going into the discussion below, I just want to mention that if the
DL-server cannot provide the same bandwidth that the RT throttling provided,
then its broken by definition.  And the breakage comes specifically because of
this patch and nothing else. There are many breakages with this patch and I will
go over some of them with unit tests below. Basically, in my view -- if a test
case shows it works with RT throttling, but not with the DL-server, then its
broken. Period. And most of those functional "breakages" come about because of
this patch (6/7) and not the initial series actually.

Here are some cases to shed some light:

Case 1. Consider a CFS task with runtime 15ms and period 50ms. With the
parameters set to 25ms runtime and 50ms period.

The test fails with DL server (because of 6/7), and passes with RT throttling.
See results below. For this test's code, see: https://shorturl.at/rwW07

Specifically, it breaks because of this patch (6/7). If you revert the patch,
the issue goes away.

With the patch 6/7:
# ./dlserver_test
# Runtime of PID 85 is 0.430000 seconds
Bail out! Runtime of PID 85 is not within 10% of expected runtime 0.900000

Without the patch 6/7:
# ./dlserver_test
# Runtime of PID 87 is 0.900000 seconds
ok 1 PASS

So basically, because of defer (or whatever you want to call it ;)), it gives
less than 50% of the bandwidth that it gave without the defer.

And here it is with vanilla RT throttling:
# ./dlserver_test
[   44.968900] sched: RT throttling activated
# Runtime of PID 87 is 0.880000 seconds

Vineeth's patch I shared fixes the issue.

Case 2:
For this case, please run the core scheduling test I provided with
CONFIG_SCHED_CORE=y. It is basically like Case 1 but with some slightly changes.

You will see that because of patch 6/7, that test also breaks and gets lesser
bandwidth. And it happens of the negative runtime stuff you added to
update_curr_dl_se(). Deleting that fixes this, but is indication of yet another
problem with this patch.

The patch fixing this issue (by deleting that block) is also included in the set
of 15 I posted earlier.
-------

Further, my impression is this patch (6/7) does not even solve all the issue it
intended. For example, consider that a CFS task is in the boosted phase, and now
an RT task wakes up. That RT task *will wait* for possibly the whole runtime
granted to CFS, so it might not always help. Contrasting that with RT
throttling, if an RT task is very well behaved (well behaved defined as not
running to the limit that RT throttling should kick in), and it wakes up, it
will run right away without any wait time, regardless of what CFS was or was not
doing.

thanks,

 - Joel
Re: [PATCH v5 6/7] sched/deadline: Deferrable dl server
Posted by Daniel Bristot de Oliveira 1 year, 8 months ago

>> There will always be a person reading these emails and echoing the wrong things...
>> using 0-lax/0-laxity term here is a lose-lose.
> 
> Agreed, so why not update your patch changelog to correct that (or post a new
> revision)?

That v6 is the repo was a partial update, that I sent there to sync our work. The v6 that I will
send already removed that.

>>> So, Vineeth and me came up with a patch below to "max cap" the DL server 0-lax
>>> time (max cap is default off keeping the regular behavior). This is needed to
>>> guarantee bandwidth for periodic CFS runners/sleepers.
>>
>> Another point... "guarantee bandwidth"... the bandwith is provided under certain conditions.
>> If the conditions are not respected, the guarantee a dl reservation will provide is that
>> the task will not put a utilization higher than the one requested, so yes, a dl reservation
>> can and will enforce a lower utilization if the task does not respect the conditions.
>> Also, if the reservation is ready, but no task is ready...
> 
> Please clarify what conditions you mean? The conditions I am looking for are
> those given by RT throttling (see below). i.e., if RT takes up all of the CPU in
> a certain amount of time, then there is a certain amount reserved for CFS.
> 
>>> The example usecase is:
>>>
>>> Consider DL server params 25ms / 50ms.
>>>
>>> Consider CFS task with duty cycle of 25ms / 76ms (run 25ms sleep 51ms).
>>
>> define duty... like, runtime 25, period 76? sleeps for 51 relative to a starting time
>> or not?
> 
> There is no starting time. The CFS task in the quoted example only does run +
> sleep. Run for 25ms and sleep for 51ms, then run again for 25ms, etc.
> 
>>
>> there are some holes in your explanation, it is tricky to reply inline for these cases...
>> I am continuing but....
>>
>>>
>>>          run 25ms                    run 25ms
>>>          _______                     _______
>>>         |       | sleep 51          |       |  sleep 51
>>> -|------|-------|---------|---------|-------|----------|--------|------> t
>>>  0     25      50       101        126      151       202      227
>>>                           \ 0-lax /                    \ 0-lax /
> 
> So before going into the discussion below, I just want to mention that if the
> DL-server cannot provide the same bandwidth that the RT throttling provided,
> then its broken by definition.  And the breakage comes specifically because of
> this patch and nothing else. There are many breakages with this patch and I will
> go over some of them with unit tests below. Basically, in my view -- if a test
> case shows it works with RT throttling, but not with the DL-server, then its
> broken. Period. And most of those functional "breakages" come about because of
> this patch (6/7) and not the initial series actually.
> 
> Here are some cases to shed some light:
> 
> Case 1. Consider a CFS task with runtime 15ms and period 50ms. With the
> parameters set to 25ms runtime and 50ms period.
> 
> The test fails with DL server (because of 6/7), and passes with RT throttling.
> See results below. For this test's code, see: https://shorturl.at/rwW07

A reproducer always helps. So, your task there is not a periodic task... it is
a sporadic task because it sleeps for a fixed amount of time after the runtime.

A periodic task with period 76 would wake at 0, 76, 152 - like cyclictest...
so consuming at a fixed time rate if the scheduler allows it.

In the case of a fixed sleep time at the end of the execution, it will end up
"throwing away bandwidth" if the runtime is not given at the beginning of the
period because it will run slower... accumulating error. But that was not the
main point here...

The problem here was more like: if a fair task goes to sleep in the middle of
the server activation (for a lock?), and then wakes up again, the code in v5 is
forcing it to defer... again. Thus, it is getting less bandwidth... notice that
it does not even need to be at the start of the period. It is the middle of the
execution.

Intuitively, reducing the deferred time would help there. But the best thing to do is:

If the fair task waited for the defer, and the real-time tasks are still using all
CPU time, do not defer the activation again, and keep the defer mechanism disabled
until the real-time tasks allow the fair scheduler to run in the background. So,
making the defer mode equivalent to the non-defer mode until the RT tasks start
to behave again.

For that, in the v6, there is a variable (dl_defer_running), once the dl_server
is enqueued after the defer time, the variable dl_defer_running is set.

If the fair task sleeps in the middle of the period, that variable do not change.

If the fair task wakes up and the dl_defer_running is still set, do not defer.
Keep running until you consume the reservation.

The variable dl_defer_running is set to 0 only after the fair tasks consume
its runtime without being in a dl_server... IOW, when the RT tasks start to
behave.

No interface change.

With that in place, your reproducers are working. I have a periodic version
of your reproducer, also improving how the task consumes the runtime,.. I
will send it to you so you can have a look.

> 
> Specifically, it breaks because of this patch (6/7). If you revert the patch,
> the issue goes away.
> 
> With the patch 6/7:
> # ./dlserver_test
> # Runtime of PID 85 is 0.430000 seconds
> Bail out! Runtime of PID 85 is not within 10% of expected runtime 0.900000
> 
> Without the patch 6/7:
> # ./dlserver_test
> # Runtime of PID 87 is 0.900000 seconds
> ok 1 PASS
> 
> So basically, because of defer (or whatever you want to call it ;)), it gives
> less than 50% of the bandwidth that it gave without the defer.

There was a problem with the non-defer mode as well, the dl_server_start() was
missing a set need resched. Fixed that in v6.

> Further, my impression is this patch (6/7) does not even solve all the issue it
> intended. For example, consider that a CFS task is in the boosted phase, and now
> an RT task wakes up. That RT task *will wait* for possibly the whole runtime
> granted to CFS, so it might not always help. Contrasting that with RT
> throttling, if an RT task is very well behaved (well behaved defined as not
> running to the limit that RT throttling should kick in), and it wakes up, it
> will run right away without any wait time, regardless of what CFS was or was not
> doing.


I fixed that as well.

The problem happens when a DL server has a large runtime (>=~ 50%).
Let's say 25 ms runtime, 50 ms period.

At time 0, the defer timer will be set at 25 ms (50 - 25).

From 0 to 25, the RT task would consume, for instance, only 2 ms... so
it is behaving...

At time 25, the defer timer fires... and as the fair task ran for 23 ms
(25 - 2 ms taken by RT) it still has 2 ms runtime to run... so the server
is activated... it is not correct.

The change I made in v6 is:

Same case...

At time 25, the defer timer fires...
	Then, the timer will re-compute the defer time:
		If the RT tasks are behaving, forward the timer for the
		new (deadline - runtime).
		return;

For instance, in the previous case, the new defer timer would be: 50 ms - 2 ms.

CFS will continue working, consuming runtime and resetting the period to avoid
activating the dl server.

The idea of forwarding the timer was taken from the cfs period timer. It is also
possible to forward the timer on other points... if necessary...

I did more testing, with different task sets, including tasks that goes to sleep...
it is working as expected.

-- Daniel


> thanks,
> 
>  - Joel
>
Re: [PATCH v5 6/7] sched/deadline: Deferrable dl server
Posted by Steven Rostedt 1 year, 8 months ago
On Fri, 5 Apr 2024 16:35:49 +0200
Daniel Bristot de Oliveira <bristot@redhat.com> wrote:

> A reproducer always helps. So, your task there is not a periodic task... it is
> a sporadic task because it sleeps for a fixed amount of time after the runtime.
> 
> A periodic task with period 76 would wake at 0, 76, 152 - like cyclictest...
> so consuming at a fixed time rate if the scheduler allows it.
> 
> In the case of a fixed sleep time at the end of the execution, it will end up
> "throwing away bandwidth" if the runtime is not given at the beginning of the
> period because it will run slower... accumulating error. But that was not the
> main point here...
> 
> The problem here was more like: if a fair task goes to sleep in the middle of
> the server activation (for a lock?), and then wakes up again, the code in v5 is
> forcing it to defer... again. Thus, it is getting less bandwidth... notice that
> it does not even need to be at the start of the period. It is the middle of the
> execution.
> 
> Intuitively, reducing the deferred time would help there. But the best thing to do is:
> 
> If the fair task waited for the defer, and the real-time tasks are still using all
> CPU time, do not defer the activation again, and keep the defer mechanism disabled
> until the real-time tasks allow the fair scheduler to run in the background. So,
> making the defer mode equivalent to the non-defer mode until the RT tasks start
> to behave again.
> 
> For that, in the v6, there is a variable (dl_defer_running), once the dl_server
> is enqueued after the defer time, the variable dl_defer_running is set.
> 
> If the fair task sleeps in the middle of the period, that variable do not change.
> 
> If the fair task wakes up and the dl_defer_running is still set, do not defer.
> Keep running until you consume the reservation.
> 
> The variable dl_defer_running is set to 0 only after the fair tasks consume
> its runtime without being in a dl_server... IOW, when the RT tasks start to
> behave.

Very nice explanation! Thanks Daniel.

> 
> No interface change.
> 
> With that in place, your reproducers are working. I have a periodic version
> of your reproducer, also improving how the task consumes the runtime,.. I
> will send it to you so you can have a look.

Looking forward to reviewing your patches when I'm back from PTO.

-- Steve
Re: [PATCH v5 6/7] sched/deadline: Deferrable dl server
Posted by Joel Fernandes 1 year, 9 months ago
> On 3/20/2024 3:24 PM, Daniel Bristot de Oliveira wrote:
>>> On 11/4/2023 6:59 AM, Daniel Bristot de Oliveira wrote:
>>>> Among the motivations for the DL servers is the real-time throttling
>>>> mechanism. This mechanism works by throttling the rt_rq after
>>>> running for a long period without leaving space for fair tasks.
>>>>
>>>> The base dl server avoids this problem by boosting fair tasks instead
>>>> of throttling the rt_rq. The point is that it boosts without waiting
>>>> for potential starvation, causing some non-intuitive cases.
>>>>
>>>> For example, an IRQ dispatches two tasks on an idle system, a fair
>>>> and an RT. The DL server will be activated, running the fair task
>>>> before the RT one. This problem can be avoided by deferring the
>>>> dl server activation.
>>>>
>>>> By setting the zerolax option, the dl_server will dispatch an
>>>> SCHED_DEADLINE reservation with replenished runtime, but throttled.
>>>>
>>>> The dl_timer will be set for (period - runtime) ns from start time.
>>>> Thus boosting the fair rq on its 0-laxity time with respect to 
> 

Hi,
Upon reflection, might we simplify the solution by treating RT as a deadline
reservation as well?

The RT deadline reservation can have shorter deadline so it will be interrupted
less immediately by CFS due to EDF. Would that work, or was that already tried
and has other dragons?

If we could pull that off, then we do not need all the deferral/timer stuff and
could considering dropping this patch. Yes it is more code, but this 6th patch
is also big and non trivial.

Juri, Daniel, all, what do you think?

(On the other hand if we want to keep this patch as a first step, and
incrementally improve that is Ok, but  I believe we do need to make a decision..)

By the way, is there a still a slot in OSPM available to discuss these? If so
that would be great. I can put up some slides.

cheers,

 - Joel
Re: [PATCH v5 6/7] sched/deadline: Deferrable dl server
Posted by kernel test robot 2 years, 1 month ago

Hello,

kernel test robot noticed "WARNING:at_kernel/sched/deadline.c:#enqueue_dl_entity" on:

commit: dea46af8e193ed4f23c37123bfd4a825399aedfe ("[PATCH v5 6/7] sched/deadline: Deferrable dl server")
url: https://github.com/intel-lab-lkp/linux/commits/Daniel-Bristot-de-Oliveira/sched-Unify-runtime-accounting-across-classes/20231104-201952
base: https://git.kernel.org/cgit/linux/kernel/git/tip/tip.git 984ffb6a4366752c949f7b39640aecdce222607f
patch link: https://lore.kernel.org/all/c7b706d30d6316c52853ca056db5beb82ba72863.1699095159.git.bristot@kernel.org/
patch subject: [PATCH v5 6/7] sched/deadline: Deferrable dl server

in testcase: trinity
version: trinity-i386-abe9de86-1_20230429
with following parameters:

	runtime: 600s

test-description: Trinity is a linux system call fuzz tester.
test-url: http://codemonkey.org.uk/projects/trinity/


compiler: gcc-9
test machine: qemu-system-x86_64 -enable-kvm -cpu SandyBridge -smp 2 -m 16G

(please refer to attached dmesg/kmsg for entire log/backtrace)


the issue does not always happen in our tests (show 9 times out of 20 runs),
but keeps clean on parent.

6f69498ee58c052e dea46af8e193ed4f23c37123bfd
---------------- ---------------------------
       fail:runs  %reproduction    fail:runs
           |             |             |
           :20          45%           9:20    dmesg.RIP:enqueue_dl_entity
           :20          45%           9:20    dmesg.WARNING:at_kernel/sched/deadline.c:#enqueue_dl_entity




If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <oliver.sang@intel.com>
| Closes: https://lore.kernel.org/oe-lkp/202311132217.2a9a4aac-oliver.sang@intel.com


[   59.623267][    C0] ------------[ cut here ]------------
[ 59.627229][ C0] WARNING: CPU: 0 PID: 1 at kernel/sched/deadline.c:1803 enqueue_dl_entity (kernel/sched/deadline.c:1803 (discriminator 1)) 
[   59.627229][    C0] Modules linked in:
[   59.627229][    C0] CPU: 0 PID: 1 Comm: swapper/0 Tainted: G                T  6.6.0-rc7-00090-gdea46af8e193 #1
[   59.627229][    C0] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
[ 59.627229][ C0] RIP: 0010:enqueue_dl_entity (kernel/sched/deadline.c:1803 (discriminator 1)) 
[ 59.627229][ C0] Code: 8e 74 ed ff ff 45 84 f6 0f 85 fd 08 00 00 48 8d b5 08 d8 ff ff 48 8d 95 c8 ee ff ff 4c 89 ff e8 40 f8 01 00 e9 50 ed ff ff 90 <0f> 0b 90 e9 9b ec ff ff 48 8d bd 1c d8 ff ff 48 c7 c3 40 fa 1f 00
All code
========
   0:	8e 74 ed ff          	mov    -0x1(%rbp,%rbp,8),%?
   4:	ff 45 84             	incl   -0x7c(%rbp)
   7:	f6 0f 85             	testb  $0x85,(%rdi)
   a:	fd                   	std    
   b:	08 00                	or     %al,(%rax)
   d:	00 48 8d             	add    %cl,-0x73(%rax)
  10:	b5 08                	mov    $0x8,%ch
  12:	d8 ff                	fdivr  %st(7),%st
  14:	ff 48 8d             	decl   -0x73(%rax)
  17:	95                   	xchg   %eax,%ebp
  18:	c8 ee ff ff          	enterq $0xffee,$0xff
  1c:	4c 89 ff             	mov    %r15,%rdi
  1f:	e8 40 f8 01 00       	callq  0x1f864
  24:	e9 50 ed ff ff       	jmpq   0xffffffffffffed79
  29:	90                   	nop
  2a:*	0f 0b                	ud2    		<-- trapping instruction
  2c:	90                   	nop
  2d:	e9 9b ec ff ff       	jmpq   0xffffffffffffeccd
  32:	48 8d bd 1c d8 ff ff 	lea    -0x27e4(%rbp),%rdi
  39:	48 c7 c3 40 fa 1f 00 	mov    $0x1ffa40,%rbx

Code starting with the faulting instruction
===========================================
   0:	0f 0b                	ud2    
   2:	90                   	nop
   3:	e9 9b ec ff ff       	jmpq   0xffffffffffffeca3
   8:	48 8d bd 1c d8 ff ff 	lea    -0x27e4(%rbp),%rdi
   f:	48 c7 c3 40 fa 1f 00 	mov    $0x1ffa40,%rbx
[   59.627229][    C0] RSP: 0000:ffffc90000007d28 EFLAGS: 00010092
[   59.627229][    C0] RAX: dffffc0000000000 RBX: ffff8883aec00418 RCX: 1ffffffff096d168
[   59.627229][    C0] RDX: 1ffff11075d80078 RSI: 0000000000000020 RDI: ffff8883aec003c0
[   59.627229][    C0] RBP: ffff8883aec003c0 R08: ffff8883aec004f0 R09: ffff8883aec00500
[   59.627229][    C0] R10: 0000000000000000 R11: ffffffff873fba8f R12: ffff8883aec00414
[   59.627229][    C0] R13: 0000000000000020 R14: ffff8883aebffa58 R15: ffff8883aec003c0
[   59.627229][    C0] FS:  0000000000000000(0000) GS:ffff8883aea00000(0000) knlGS:0000000000000000
[   59.627229][    C0] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   59.627229][    C0] CR2: ffff88843ffff000 CR3: 0000000004e70000 CR4: 00000000000006b0
[   59.627229][    C0] Call Trace:
[   59.627229][    C0]  <IRQ>
[ 59.627229][ C0] ? __warn (kernel/panic.c:673) 
[ 59.627229][ C0] ? enqueue_dl_entity (kernel/sched/deadline.c:1803 (discriminator 1)) 
[ 59.627229][ C0] ? report_bug (lib/bug.c:201 lib/bug.c:219) 
[ 59.627229][ C0] ? handle_bug (arch/x86/kernel/traps.c:237) 
[ 59.627229][ C0] ? exc_invalid_op (arch/x86/kernel/traps.c:258 (discriminator 1)) 
[ 59.627229][ C0] ? asm_exc_invalid_op (arch/x86/include/asm/idtentry.h:568) 
[ 59.627229][ C0] ? enqueue_dl_entity (kernel/sched/deadline.c:1803 (discriminator 1)) 
[ 59.627229][ C0] ? update_rq_clock (kernel/sched/core.c:765 kernel/sched/core.c:750) 
[ 59.627229][ C0] ? kvm_sched_clock_read (arch/x86/kernel/kvmclock.c:91) 
[ 59.627229][ C0] ? sched_clock_tick (kernel/sched/clock.c:270 kernel/sched/clock.c:426 kernel/sched/clock.c:412) 
[ 59.627229][ C0] dl_task_timer (kernel/sched/deadline.c:1193) 
[ 59.627229][ C0] ? pick_task_dl (kernel/sched/deadline.c:1174) 
[ 59.627229][ C0] __hrtimer_run_queues (kernel/time/hrtimer.c:1688 kernel/time/hrtimer.c:1752) 
[ 59.627229][ C0] ? enqueue_hrtimer (kernel/time/hrtimer.c:1722) 
[ 59.627229][ C0] ? kvm_clock_read (arch/x86/include/asm/preempt.h:95 arch/x86/kernel/kvmclock.c:80) 
[ 59.627229][ C0] ? ktime_get_update_offsets_now (kernel/time/timekeeping.c:292 (discriminator 4) kernel/time/timekeeping.c:388 (discriminator 4) kernel/time/timekeeping.c:2320 (discriminator 4)) 
[ 59.627229][ C0] hrtimer_interrupt (kernel/time/hrtimer.c:1817) 
[ 59.627229][ C0] __sysvec_apic_timer_interrupt (arch/x86/include/asm/atomic.h:23 include/linux/atomic/atomic-arch-fallback.h:444 include/linux/jump_label.h:260 include/linux/jump_label.h:270 arch/x86/include/asm/trace/irq_vectors.h:41 arch/x86/kernel/apic/apic.c:1081) 
[ 59.627229][ C0] sysvec_apic_timer_interrupt (arch/x86/kernel/apic/apic.c:1074 (discriminator 14)) 
[   59.627229][    C0]  </IRQ>
[   59.627229][    C0]  <TASK>
[ 59.627229][ C0] asm_sysvec_apic_timer_interrupt (arch/x86/include/asm/idtentry.h:645) 
[ 59.627229][ C0] RIP: 0010:_raw_spin_unlock_irqrestore (include/linux/spinlock_api_smp.h:152 kernel/locking/spinlock.c:194) 
[ 59.627229][ C0] Code: 83 c7 18 e8 ca 80 74 fd 48 89 ef e8 82 18 75 fd 81 e3 00 02 00 00 75 25 9c 58 f6 c4 02 75 2d 48 85 db 74 01 fb bf 01 00 00 00 <e8> 93 55 69 fd 65 8b 05 54 1a 66 7c 85 c0 74 0a 5b 5d c3 e8 30 63
All code
========
   0:	83 c7 18             	add    $0x18,%edi
   3:	e8 ca 80 74 fd       	callq  0xfffffffffd7480d2
   8:	48 89 ef             	mov    %rbp,%rdi
   b:	e8 82 18 75 fd       	callq  0xfffffffffd751892
  10:	81 e3 00 02 00 00    	and    $0x200,%ebx
  16:	75 25                	jne    0x3d
  18:	9c                   	pushfq 
  19:	58                   	pop    %rax
  1a:	f6 c4 02             	test   $0x2,%ah
  1d:	75 2d                	jne    0x4c
  1f:	48 85 db             	test   %rbx,%rbx
  22:	74 01                	je     0x25
  24:	fb                   	sti    
  25:	bf 01 00 00 00       	mov    $0x1,%edi
  2a:*	e8 93 55 69 fd       	callq  0xfffffffffd6955c2		<-- trapping instruction
  2f:	65 8b 05 54 1a 66 7c 	mov    %gs:0x7c661a54(%rip),%eax        # 0x7c661a8a
  36:	85 c0                	test   %eax,%eax
  38:	74 0a                	je     0x44
  3a:	5b                   	pop    %rbx
  3b:	5d                   	pop    %rbp
  3c:	c3                   	retq   
  3d:	e8                   	.byte 0xe8
  3e:	30                   	.byte 0x30
  3f:	63                   	.byte 0x63

Code starting with the faulting instruction
===========================================
   0:	e8 93 55 69 fd       	callq  0xfffffffffd695598
   5:	65 8b 05 54 1a 66 7c 	mov    %gs:0x7c661a54(%rip),%eax        # 0x7c661a60
   c:	85 c0                	test   %eax,%eax
   e:	74 0a                	je     0x1a
  10:	5b                   	pop    %rbx
  11:	5d                   	pop    %rbp
  12:	c3                   	retq   
  13:	e8                   	.byte 0xe8
  14:	30                   	.byte 0x30
  15:	63                   	.byte 0x63
[   59.627229][    C0] RSP: 0000:ffffc9000001fbb8 EFLAGS: 00000206
[   59.627229][    C0] RAX: 0000000000000006 RBX: 0000000000000200 RCX: ffffffff812e2631
[   59.627229][    C0] RDX: 0000000000000000 RSI: ffffffff83eaa940 RDI: 0000000000000001
[   59.627229][    C0] RBP: ffff88812d07ed00 R08: 0000000000000001 R09: fffffbfff0e7f757
[   59.627229][    C0] R10: fffffbfff0e7f756 R11: ffffffff873fbab7 R12: 0000000000000000
[   59.627229][    C0] R13: 0000000000000246 R14: ffff888195a701a8 R15: ffff88812cd23350
[ 59.627229][ C0] ? mark_lock (arch/x86/include/asm/bitops.h:228 (discriminator 3) arch/x86/include/asm/bitops.h:240 (discriminator 3) include/asm-generic/bitops/instrumented-non-atomic.h:142 (discriminator 3) kernel/locking/lockdep.c:228 (discriminator 3) kernel/locking/lockdep.c:4655 (discriminator 3)) 
[ 59.627229][ C0] dma_fence_signal (drivers/dma-buf/dma-fence.c:327 drivers/dma-buf/dma-fence.c:476) 
[ 59.627229][ C0] wait_backward (drivers/dma-buf/st-dma-fence-chain.c:621) 
[ 59.627229][ C0] ? find_gap (drivers/dma-buf/st-dma-fence-chain.c:603) 
[ 59.627229][ C0] ? __cond_resched (kernel/sched/core.c:8521) 
[ 59.627229][ C0] __subtests (drivers/dma-buf/selftest.c:106 (discriminator 1)) 
[ 59.627229][ C0] ? kmem_cache_open (mm/slub.c:2479 mm/slub.c:4232 mm/slub.c:4560) 
[ 59.627229][ C0] ? __sanitycheck__ (drivers/dma-buf/selftest.c:92) 
[ 59.627229][ C0] ? kmem_cache_create_usercopy (mm/slab_common.c:351) 
[ 59.627229][ C0] dma_fence_chain (drivers/dma-buf/st-dma-fence-chain.c:708) 
[ 59.627229][ C0] st_init (drivers/dma-buf/selftest.c:141 drivers/dma-buf/selftest.c:155) 
[ 59.627229][ C0] ? udmabuf_dev_init (drivers/dma-buf/selftest.c:154) 
[ 59.627229][ C0] do_one_initcall (init/main.c:1232) 
[ 59.627229][ C0] ? trace_event_raw_event_initcall_level (init/main.c:1223) 
[ 59.627229][ C0] ? parameq (kernel/params.c:171) 
[ 59.627229][ C0] ? strcpy (lib/string.c:83 (discriminator 1)) 
[ 59.627229][ C0] kernel_init_freeable (init/main.c:1293 init/main.c:1310 init/main.c:1329 init/main.c:1547) 
[ 59.627229][ C0] ? finish_task_switch (arch/x86/include/asm/irqflags.h:42 arch/x86/include/asm/irqflags.h:77 kernel/sched/sched.h:1390 kernel/sched/core.c:5129 kernel/sched/core.c:5247) 
[ 59.627229][ C0] ? rest_init (init/main.c:1429) 
[ 59.627229][ C0] kernel_init (init/main.c:1439) 
[ 59.627229][ C0] ? _raw_spin_unlock_irq (arch/x86/include/asm/preempt.h:104 include/linux/spinlock_api_smp.h:160 kernel/locking/spinlock.c:202) 
[ 59.627229][ C0] ret_from_fork (arch/x86/kernel/process.c:153) 
[ 59.627229][ C0] ? rest_init (init/main.c:1429) 
[ 59.627229][ C0] ret_from_fork_asm (arch/x86/entry/entry_64.S:312) 
[   59.627229][    C0]  </TASK>
[   59.627229][    C0] irq event stamp: 842784
[ 59.627229][ C0] hardirqs last enabled at (842783): irqentry_exit (kernel/entry/common.c:436) 
[ 59.627229][ C0] hardirqs last disabled at (842784): sysvec_apic_timer_interrupt (arch/x86/kernel/apic/apic.c:1074) 
[ 59.627229][ C0] softirqs last enabled at (842772): __do_softirq (arch/x86/include/asm/preempt.h:27 kernel/softirq.c:400 kernel/softirq.c:582) 
[ 59.627229][ C0] softirqs last disabled at (842761): irq_exit_rcu (kernel/softirq.c:427 kernel/softirq.c:632 kernel/softirq.c:622 kernel/softirq.c:644) 
[   59.627229][    C0] ---[ end trace 0000000000000000 ]---


The kernel config and materials to reproduce are available at:
https://download.01.org/0day-ci/archive/20231113/202311132217.2a9a4aac-oliver.sang@intel.com



-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
Re: [PATCH v5 6/7] sched/deadline: Deferrable dl server
Posted by Steven Rostedt 2 years, 1 month ago
On Sat,  4 Nov 2023 11:59:23 +0100
Daniel Bristot de Oliveira <bristot@kernel.org> wrote:

> @@ -828,6 +836,7 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se)
>   * could happen are, typically, a entity voluntarily trying to overcome its
>   * runtime, or it just underestimated it during sched_setattr().
>   */
> +static int start_dl_timer(struct sched_dl_entity *dl_se);
>  static void replenish_dl_entity(struct sched_dl_entity *dl_se)
>  {
>  	struct dl_rq *dl_rq = dl_rq_of_se(dl_se);

Nit, but you really shouldn't have a function prototype declaration right
next to a function, and especially not between the function's comment and
the function itself.

-- Steve
Re: [PATCH v5 6/7] sched/deadline: Deferrable dl server
Posted by Joel Fernandes 2 years, 1 month ago
Hi Daniel,

On Sat, Nov 4, 2023 at 6:59 AM Daniel Bristot de Oliveira
<bristot@kernel.org> wrote:
>
> Among the motivations for the DL servers is the real-time throttling
> mechanism. This mechanism works by throttling the rt_rq after
> running for a long period without leaving space for fair tasks.
>
> The base dl server avoids this problem by boosting fair tasks instead
> of throttling the rt_rq. The point is that it boosts without waiting
> for potential starvation, causing some non-intuitive cases.
>
> For example, an IRQ dispatches two tasks on an idle system, a fair
> and an RT. The DL server will be activated, running the fair task
> before the RT one. This problem can be avoided by deferring the
> dl server activation.
>
> By setting the zerolax option, the dl_server will dispatch an
> SCHED_DEADLINE reservation with replenished runtime, but throttled.
>
> The dl_timer will be set for (period - runtime) ns from start time.
> Thus boosting the fair rq on its 0-laxity time with respect to
> rt_rq.
>
> If the fair scheduler has the opportunity to run while waiting
> for zerolax time, the dl server runtime will be consumed. If
> the runtime is completely consumed before the zerolax time, the
> server will be replenished while still in a throttled state. Then,
> the dl_timer will be reset to the new zerolax time
>
> If the fair server reaches the zerolax time without consuming
> its runtime, the server will be boosted, following CBS rules
> (thus without breaking SCHED_DEADLINE).
>
> Signed-off-by: Daniel Bristot de Oliveira <bristot@kernel.org>
> ---
>  include/linux/sched.h   |   2 +
>  kernel/sched/deadline.c | 100 +++++++++++++++++++++++++++++++++++++++-
>  kernel/sched/fair.c     |   3 ++
>  3 files changed, 103 insertions(+), 2 deletions(-)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 5ac1f252e136..56e53e6fd5a0 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -660,6 +660,8 @@ struct sched_dl_entity {
>         unsigned int                    dl_non_contending : 1;
>         unsigned int                    dl_overrun        : 1;
>         unsigned int                    dl_server         : 1;
> +       unsigned int                    dl_zerolax        : 1;
> +       unsigned int                    dl_zerolax_armed  : 1;
>
>         /*
>          * Bandwidth enforcement timer. Each -deadline task has its
> diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
> index 1d7b96ca9011..69ee1fbd60e4 100644
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -772,6 +772,14 @@ static inline void replenish_dl_new_period(struct sched_dl_entity *dl_se,
>         /* for non-boosted task, pi_of(dl_se) == dl_se */
>         dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline;
>         dl_se->runtime = pi_of(dl_se)->dl_runtime;
> +
> +       /*
> +        * If it is a zerolax reservation, throttle it.
> +        */
> +       if (dl_se->dl_zerolax) {
> +               dl_se->dl_throttled = 1;
> +               dl_se->dl_zerolax_armed = 1;
> +       }
>  }
>
>  /*
> @@ -828,6 +836,7 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se)
>   * could happen are, typically, a entity voluntarily trying to overcome its
>   * runtime, or it just underestimated it during sched_setattr().
>   */
> +static int start_dl_timer(struct sched_dl_entity *dl_se);
>  static void replenish_dl_entity(struct sched_dl_entity *dl_se)
>  {
>         struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
> @@ -874,6 +883,28 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se)
>                 dl_se->dl_yielded = 0;
>         if (dl_se->dl_throttled)
>                 dl_se->dl_throttled = 0;
> +
> +       /*
> +        * If this is the replenishment of a zerolax reservation,
> +        * clear the flag and return.
> +        */
> +       if (dl_se->dl_zerolax_armed) {
> +               dl_se->dl_zerolax_armed = 0;
> +               return;
> +       }
> +
> +       /*
> +        * A this point, if the zerolax server is not armed, and the deadline
> +        * is in the future, throttle the server and arm the zerolax timer.
> +        */
> +       if (dl_se->dl_zerolax &&
> +           dl_time_before(dl_se->deadline - dl_se->runtime, rq_clock(rq))) {
> +               if (!is_dl_boosted(dl_se)) {
> +                       dl_se->dl_zerolax_armed = 1;
> +                       dl_se->dl_throttled = 1;
> +                       start_dl_timer(dl_se);
> +               }
> +       }
>  }
>
>  /*
> @@ -1024,6 +1055,13 @@ static void update_dl_entity(struct sched_dl_entity *dl_se)
>                 }
>
>                 replenish_dl_new_period(dl_se, rq);
> +       } else if (dl_server(dl_se) && dl_se->dl_zerolax) {
> +               /*
> +                * The server can still use its previous deadline, so throttle
> +                * and arm the zero-laxity timer.
> +                */
> +               dl_se->dl_zerolax_armed = 1;
> +               dl_se->dl_throttled = 1;
>         }
>  }
>
> @@ -1056,8 +1094,20 @@ static int start_dl_timer(struct sched_dl_entity *dl_se)
>          * We want the timer to fire at the deadline, but considering
>          * that it is actually coming from rq->clock and not from
>          * hrtimer's time base reading.
> +        *
> +        * The zerolax reservation will have its timer set to the
> +        * deadline - runtime. At that point, the CBS rule will decide
> +        * if the current deadline can be used, or if a replenishment
> +        * is required to avoid add too much pressure on the system
> +        * (current u > U).
>          */
> -       act = ns_to_ktime(dl_next_period(dl_se));
> +       if (dl_se->dl_zerolax_armed) {
> +               WARN_ON_ONCE(!dl_se->dl_throttled);
> +               act = ns_to_ktime(dl_se->deadline - dl_se->runtime);

Just a question, here if dl_se->deadline - dl_se->runtime is large,
then does that mean that server activation will be much more into the
future? So say I want to give CFS 30%, then it will take 70% of the
period before CFS preempts RT thus "starving" CFS for this duration. I
think that's Ok for smaller periods and runtimes, though.

I think it does reserve the amount of required CFS bandwidth so it is
probably OK, though it is perhaps letting RT run more initially (say
if CFS tasks are not CPU bound and occasionally wake up, they will
always be hit by the 70% latency AFAICS which may be large for large
periods and small runtimes).

I/we're currently trying these patches on ChromeOS as well.

Just started going over it to understand the patch. Looking nice so
far and thanks,

 - Joel
Re: [PATCH v5 6/7] sched/deadline: Deferrable dl server
Posted by Daniel Bristot de Oliveira 2 years, 1 month ago
On 11/6/23 20:32, Joel Fernandes wrote:
> Hi Daniel,
> 
> On Sat, Nov 4, 2023 at 6:59 AM Daniel Bristot de Oliveira
> <bristot@kernel.org> wrote:
>>
>> Among the motivations for the DL servers is the real-time throttling
>> mechanism. This mechanism works by throttling the rt_rq after
>> running for a long period without leaving space for fair tasks.
>>
>> The base dl server avoids this problem by boosting fair tasks instead
>> of throttling the rt_rq. The point is that it boosts without waiting
>> for potential starvation, causing some non-intuitive cases.
>>
>> For example, an IRQ dispatches two tasks on an idle system, a fair
>> and an RT. The DL server will be activated, running the fair task
>> before the RT one. This problem can be avoided by deferring the
>> dl server activation.
>>
>> By setting the zerolax option, the dl_server will dispatch an
>> SCHED_DEADLINE reservation with replenished runtime, but throttled.
>>
>> The dl_timer will be set for (period - runtime) ns from start time.
>> Thus boosting the fair rq on its 0-laxity time with respect to
>> rt_rq.
>>
>> If the fair scheduler has the opportunity to run while waiting
>> for zerolax time, the dl server runtime will be consumed. If
>> the runtime is completely consumed before the zerolax time, the
>> server will be replenished while still in a throttled state. Then,
>> the dl_timer will be reset to the new zerolax time
>>
>> If the fair server reaches the zerolax time without consuming
>> its runtime, the server will be boosted, following CBS rules
>> (thus without breaking SCHED_DEADLINE).
>>
>> Signed-off-by: Daniel Bristot de Oliveira <bristot@kernel.org>
>> ---
>>  include/linux/sched.h   |   2 +
>>  kernel/sched/deadline.c | 100 +++++++++++++++++++++++++++++++++++++++-
>>  kernel/sched/fair.c     |   3 ++
>>  3 files changed, 103 insertions(+), 2 deletions(-)
>>
>> diff --git a/include/linux/sched.h b/include/linux/sched.h
>> index 5ac1f252e136..56e53e6fd5a0 100644
>> --- a/include/linux/sched.h
>> +++ b/include/linux/sched.h
>> @@ -660,6 +660,8 @@ struct sched_dl_entity {
>>         unsigned int                    dl_non_contending : 1;
>>         unsigned int                    dl_overrun        : 1;
>>         unsigned int                    dl_server         : 1;
>> +       unsigned int                    dl_zerolax        : 1;
>> +       unsigned int                    dl_zerolax_armed  : 1;
>>
>>         /*
>>          * Bandwidth enforcement timer. Each -deadline task has its
>> diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
>> index 1d7b96ca9011..69ee1fbd60e4 100644
>> --- a/kernel/sched/deadline.c
>> +++ b/kernel/sched/deadline.c
>> @@ -772,6 +772,14 @@ static inline void replenish_dl_new_period(struct sched_dl_entity *dl_se,
>>         /* for non-boosted task, pi_of(dl_se) == dl_se */
>>         dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline;
>>         dl_se->runtime = pi_of(dl_se)->dl_runtime;
>> +
>> +       /*
>> +        * If it is a zerolax reservation, throttle it.
>> +        */
>> +       if (dl_se->dl_zerolax) {
>> +               dl_se->dl_throttled = 1;
>> +               dl_se->dl_zerolax_armed = 1;
>> +       }
>>  }
>>
>>  /*
>> @@ -828,6 +836,7 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se)
>>   * could happen are, typically, a entity voluntarily trying to overcome its
>>   * runtime, or it just underestimated it during sched_setattr().
>>   */
>> +static int start_dl_timer(struct sched_dl_entity *dl_se);
>>  static void replenish_dl_entity(struct sched_dl_entity *dl_se)
>>  {
>>         struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
>> @@ -874,6 +883,28 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se)
>>                 dl_se->dl_yielded = 0;
>>         if (dl_se->dl_throttled)
>>                 dl_se->dl_throttled = 0;
>> +
>> +       /*
>> +        * If this is the replenishment of a zerolax reservation,
>> +        * clear the flag and return.
>> +        */
>> +       if (dl_se->dl_zerolax_armed) {
>> +               dl_se->dl_zerolax_armed = 0;
>> +               return;
>> +       }
>> +
>> +       /*
>> +        * A this point, if the zerolax server is not armed, and the deadline
>> +        * is in the future, throttle the server and arm the zerolax timer.
>> +        */
>> +       if (dl_se->dl_zerolax &&
>> +           dl_time_before(dl_se->deadline - dl_se->runtime, rq_clock(rq))) {
>> +               if (!is_dl_boosted(dl_se)) {
>> +                       dl_se->dl_zerolax_armed = 1;
>> +                       dl_se->dl_throttled = 1;
>> +                       start_dl_timer(dl_se);
>> +               }
>> +       }
>>  }
>>
>>  /*
>> @@ -1024,6 +1055,13 @@ static void update_dl_entity(struct sched_dl_entity *dl_se)
>>                 }
>>
>>                 replenish_dl_new_period(dl_se, rq);
>> +       } else if (dl_server(dl_se) && dl_se->dl_zerolax) {
>> +               /*
>> +                * The server can still use its previous deadline, so throttle
>> +                * and arm the zero-laxity timer.
>> +                */
>> +               dl_se->dl_zerolax_armed = 1;
>> +               dl_se->dl_throttled = 1;
>>         }
>>  }
>>
>> @@ -1056,8 +1094,20 @@ static int start_dl_timer(struct sched_dl_entity *dl_se)
>>          * We want the timer to fire at the deadline, but considering
>>          * that it is actually coming from rq->clock and not from
>>          * hrtimer's time base reading.
>> +        *
>> +        * The zerolax reservation will have its timer set to the
>> +        * deadline - runtime. At that point, the CBS rule will decide
>> +        * if the current deadline can be used, or if a replenishment
>> +        * is required to avoid add too much pressure on the system
>> +        * (current u > U).
>>          */
>> -       act = ns_to_ktime(dl_next_period(dl_se));
>> +       if (dl_se->dl_zerolax_armed) {
>> +               WARN_ON_ONCE(!dl_se->dl_throttled);
>> +               act = ns_to_ktime(dl_se->deadline - dl_se->runtime);
> 
> Just a question, here if dl_se->deadline - dl_se->runtime is large,
> then does that mean that server activation will be much more into the
> future? So say I want to give CFS 30%, then it will take 70% of the
> period before CFS preempts RT thus "starving" CFS for this duration. I
> think that's Ok for smaller periods and runtimes, though.

I think you are answering yourself here :-)

If the default values are not good, change them o/

The current interface allows you to have more responsive/small chuck of CPU
or less responsive/large chucks of CPU... you can even place RT bellow CFS
for a "bounded amount of time" by disabling the defer option... per CPU.
All at once with different periods patterns on CPUs to increase the
changes of having a cfs rq ready on another CPU... like...

[3/10 - 2/6 - 1.5/5 - 1/3 no defer] in a 4 cpus system :-).

The default setup is based on the throttling to avoid changing
the historical behavior for those that... are happy with them.

-- Daniel
>  - Joel

Re: [PATCH v5 6/7] sched/deadline: Deferrable dl server
Posted by Joel Fernandes 2 years, 1 month ago
On Mon, Nov 6, 2023 at 2:32 PM Joel Fernandes <joel@joelfernandes.org> wrote:
>
> Hi Daniel,
>
> On Sat, Nov 4, 2023 at 6:59 AM Daniel Bristot de Oliveira
> <bristot@kernel.org> wrote:
> >
> > Among the motivations for the DL servers is the real-time throttling
> > mechanism. This mechanism works by throttling the rt_rq after
> > running for a long period without leaving space for fair tasks.
> >
> > The base dl server avoids this problem by boosting fair tasks instead
> > of throttling the rt_rq. The point is that it boosts without waiting
> > for potential starvation, causing some non-intuitive cases.
> >
> > For example, an IRQ dispatches two tasks on an idle system, a fair
> > and an RT. The DL server will be activated, running the fair task
> > before the RT one. This problem can be avoided by deferring the
> > dl server activation.
> >
> > By setting the zerolax option, the dl_server will dispatch an
> > SCHED_DEADLINE reservation with replenished runtime, but throttled.
> >
> > The dl_timer will be set for (period - runtime) ns from start time.
> > Thus boosting the fair rq on its 0-laxity time with respect to
> > rt_rq.
> >
> > If the fair scheduler has the opportunity to run while waiting
> > for zerolax time, the dl server runtime will be consumed. If
> > the runtime is completely consumed before the zerolax time, the
> > server will be replenished while still in a throttled state. Then,
> > the dl_timer will be reset to the new zerolax time
> >
> > If the fair server reaches the zerolax time without consuming
> > its runtime, the server will be boosted, following CBS rules
> > (thus without breaking SCHED_DEADLINE).
> >
> > Signed-off-by: Daniel Bristot de Oliveira <bristot@kernel.org>
> > ---
> >  include/linux/sched.h   |   2 +
> >  kernel/sched/deadline.c | 100 +++++++++++++++++++++++++++++++++++++++-
> >  kernel/sched/fair.c     |   3 ++
> >  3 files changed, 103 insertions(+), 2 deletions(-)
> >
> > diff --git a/include/linux/sched.h b/include/linux/sched.h
> > index 5ac1f252e136..56e53e6fd5a0 100644
> > --- a/include/linux/sched.h
> > +++ b/include/linux/sched.h
> > @@ -660,6 +660,8 @@ struct sched_dl_entity {
> >         unsigned int                    dl_non_contending : 1;
> >         unsigned int                    dl_overrun        : 1;
> >         unsigned int                    dl_server         : 1;
> > +       unsigned int                    dl_zerolax        : 1;
> > +       unsigned int                    dl_zerolax_armed  : 1;
> >
> >         /*
> >          * Bandwidth enforcement timer. Each -deadline task has its
> > diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
> > index 1d7b96ca9011..69ee1fbd60e4 100644
> > --- a/kernel/sched/deadline.c
> > +++ b/kernel/sched/deadline.c
> > @@ -772,6 +772,14 @@ static inline void replenish_dl_new_period(struct sched_dl_entity *dl_se,
> >         /* for non-boosted task, pi_of(dl_se) == dl_se */
> >         dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline;
> >         dl_se->runtime = pi_of(dl_se)->dl_runtime;
> > +
> > +       /*
> > +        * If it is a zerolax reservation, throttle it.
> > +        */
> > +       if (dl_se->dl_zerolax) {
> > +               dl_se->dl_throttled = 1;
> > +               dl_se->dl_zerolax_armed = 1;
> > +       }
> >  }
> >
> >  /*
> > @@ -828,6 +836,7 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se)
> >   * could happen are, typically, a entity voluntarily trying to overcome its
> >   * runtime, or it just underestimated it during sched_setattr().
> >   */
> > +static int start_dl_timer(struct sched_dl_entity *dl_se);
> >  static void replenish_dl_entity(struct sched_dl_entity *dl_se)
> >  {
> >         struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
> > @@ -874,6 +883,28 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se)
> >                 dl_se->dl_yielded = 0;
> >         if (dl_se->dl_throttled)
> >                 dl_se->dl_throttled = 0;
> > +
> > +       /*
> > +        * If this is the replenishment of a zerolax reservation,
> > +        * clear the flag and return.
> > +        */
> > +       if (dl_se->dl_zerolax_armed) {
> > +               dl_se->dl_zerolax_armed = 0;
> > +               return;
> > +       }
> > +
> > +       /*
> > +        * A this point, if the zerolax server is not armed, and the deadline
> > +        * is in the future, throttle the server and arm the zerolax timer.
> > +        */
> > +       if (dl_se->dl_zerolax &&
> > +           dl_time_before(dl_se->deadline - dl_se->runtime, rq_clock(rq))) {
> > +               if (!is_dl_boosted(dl_se)) {
> > +                       dl_se->dl_zerolax_armed = 1;
> > +                       dl_se->dl_throttled = 1;
> > +                       start_dl_timer(dl_se);
> > +               }
> > +       }
> >  }
> >
> >  /*
> > @@ -1024,6 +1055,13 @@ static void update_dl_entity(struct sched_dl_entity *dl_se)
> >                 }
> >
> >                 replenish_dl_new_period(dl_se, rq);
> > +       } else if (dl_server(dl_se) && dl_se->dl_zerolax) {
> > +               /*
> > +                * The server can still use its previous deadline, so throttle
> > +                * and arm the zero-laxity timer.
> > +                */
> > +               dl_se->dl_zerolax_armed = 1;
> > +               dl_se->dl_throttled = 1;
> >         }
> >  }
> >
> > @@ -1056,8 +1094,20 @@ static int start_dl_timer(struct sched_dl_entity *dl_se)
> >          * We want the timer to fire at the deadline, but considering
> >          * that it is actually coming from rq->clock and not from
> >          * hrtimer's time base reading.
> > +        *
> > +        * The zerolax reservation will have its timer set to the
> > +        * deadline - runtime. At that point, the CBS rule will decide
> > +        * if the current deadline can be used, or if a replenishment
> > +        * is required to avoid add too much pressure on the system
> > +        * (current u > U).
> >          */
> > -       act = ns_to_ktime(dl_next_period(dl_se));
> > +       if (dl_se->dl_zerolax_armed) {
> > +               WARN_ON_ONCE(!dl_se->dl_throttled);
> > +               act = ns_to_ktime(dl_se->deadline - dl_se->runtime);
>
> Just a question, here if dl_se->deadline - dl_se->runtime is large,
> then does that mean that server activation will be much more into the
> future? So say I want to give CFS 30%, then it will take 70% of the
> period before CFS preempts RT thus "starving" CFS for this duration. I
> think that's Ok for smaller periods and runtimes, though.
>
> I think it does reserve the amount of required CFS bandwidth so it is
> probably OK, though it is perhaps letting RT run more initially (say
> if CFS tasks are not CPU bound and occasionally wake up, they will
> always be hit by the 70% latency AFAICS which may be large for large
> periods and small runtimes).
>

One more consideration I guess is, because the server is throttled
till 0-laxity time, it is possible that if CFS sleeps even a bit
(after the DL-server is unthrottled), then it will be pushed out to a
full current deadline + period due to CBS. In such a situation,  if
CFS-server is the only DL task running, it might starve RT for a bit
more time.

Example, say CFS runtime is 0.3s and period is 1s. At 0.7s, 0-laxity
timer fires. CFS runs for 0.29s, then sleeps for 0.005s and wakes up
at 0.295s (its remaining runtime is 0.01s at this point which is < the
"time till deadline" of 0.005s). Now the runtime of the CFS-server
will be replenished to the full 3s (due to CBS) and the deadline
pushed out. The end result is the total runtime that the CFS-server
actually gets is 0.0595s (though yes it did sleep for 5ms in between,
still that's tiny -- say if it briefly blocked on a kernel mutex).

On the other hand, if the CFS server started a bit earlier than the
0-laxity, it would probably not have had CBS pushing it out.

This is likely also not an issue for shorter runtime/period values,
still the throttling till later has a small trade-off (Not saying we
should not do this, this whole series is likely a huge improvement
over the current RT throttling).

There is a chance I am uttering nonsense as I am not a DL expert, so
apologies if so.

Thanks.
Re: [PATCH v5 6/7] sched/deadline: Deferrable dl server
Posted by Joel Fernandes 2 years, 1 month ago
On Mon, Nov 6, 2023 at 4:32 PM Joel Fernandes <joel@joelfernandes.org> wrote:
>
> On Mon, Nov 6, 2023 at 2:32 PM Joel Fernandes <joel@joelfernandes.org> wrote:
> >
> > Hi Daniel,
> >
> > On Sat, Nov 4, 2023 at 6:59 AM Daniel Bristot de Oliveira
> > <bristot@kernel.org> wrote:
> > >
> > > Among the motivations for the DL servers is the real-time throttling
> > > mechanism. This mechanism works by throttling the rt_rq after
> > > running for a long period without leaving space for fair tasks.
> > >
> > > The base dl server avoids this problem by boosting fair tasks instead
> > > of throttling the rt_rq. The point is that it boosts without waiting
> > > for potential starvation, causing some non-intuitive cases.
> > >
> > > For example, an IRQ dispatches two tasks on an idle system, a fair
> > > and an RT. The DL server will be activated, running the fair task
> > > before the RT one. This problem can be avoided by deferring the
> > > dl server activation.
> > >
> > > By setting the zerolax option, the dl_server will dispatch an
> > > SCHED_DEADLINE reservation with replenished runtime, but throttled.
> > >
> > > The dl_timer will be set for (period - runtime) ns from start time.
> > > Thus boosting the fair rq on its 0-laxity time with respect to
> > > rt_rq.
> > >
> > > If the fair scheduler has the opportunity to run while waiting
> > > for zerolax time, the dl server runtime will be consumed. If
> > > the runtime is completely consumed before the zerolax time, the
> > > server will be replenished while still in a throttled state. Then,
> > > the dl_timer will be reset to the new zerolax time
> > >
> > > If the fair server reaches the zerolax time without consuming
> > > its runtime, the server will be boosted, following CBS rules
> > > (thus without breaking SCHED_DEADLINE).
> > >
> > > Signed-off-by: Daniel Bristot de Oliveira <bristot@kernel.org>
> > > ---
> > >  include/linux/sched.h   |   2 +
> > >  kernel/sched/deadline.c | 100 +++++++++++++++++++++++++++++++++++++++-
> > >  kernel/sched/fair.c     |   3 ++
> > >  3 files changed, 103 insertions(+), 2 deletions(-)
> > >
> > > diff --git a/include/linux/sched.h b/include/linux/sched.h
> > > index 5ac1f252e136..56e53e6fd5a0 100644
> > > --- a/include/linux/sched.h
> > > +++ b/include/linux/sched.h
> > > @@ -660,6 +660,8 @@ struct sched_dl_entity {
> > >         unsigned int                    dl_non_contending : 1;
> > >         unsigned int                    dl_overrun        : 1;
> > >         unsigned int                    dl_server         : 1;
> > > +       unsigned int                    dl_zerolax        : 1;
> > > +       unsigned int                    dl_zerolax_armed  : 1;
> > >
> > >         /*
> > >          * Bandwidth enforcement timer. Each -deadline task has its
> > > diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
> > > index 1d7b96ca9011..69ee1fbd60e4 100644
> > > --- a/kernel/sched/deadline.c
> > > +++ b/kernel/sched/deadline.c
> > > @@ -772,6 +772,14 @@ static inline void replenish_dl_new_period(struct sched_dl_entity *dl_se,
> > >         /* for non-boosted task, pi_of(dl_se) == dl_se */
> > >         dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline;
> > >         dl_se->runtime = pi_of(dl_se)->dl_runtime;
> > > +
> > > +       /*
> > > +        * If it is a zerolax reservation, throttle it.
> > > +        */
> > > +       if (dl_se->dl_zerolax) {
> > > +               dl_se->dl_throttled = 1;
> > > +               dl_se->dl_zerolax_armed = 1;
> > > +       }
> > >  }
> > >
> > >  /*
> > > @@ -828,6 +836,7 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se)
> > >   * could happen are, typically, a entity voluntarily trying to overcome its
> > >   * runtime, or it just underestimated it during sched_setattr().
> > >   */
> > > +static int start_dl_timer(struct sched_dl_entity *dl_se);
> > >  static void replenish_dl_entity(struct sched_dl_entity *dl_se)
> > >  {
> > >         struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
> > > @@ -874,6 +883,28 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se)
> > >                 dl_se->dl_yielded = 0;
> > >         if (dl_se->dl_throttled)
> > >                 dl_se->dl_throttled = 0;
> > > +
> > > +       /*
> > > +        * If this is the replenishment of a zerolax reservation,
> > > +        * clear the flag and return.
> > > +        */
> > > +       if (dl_se->dl_zerolax_armed) {
> > > +               dl_se->dl_zerolax_armed = 0;
> > > +               return;
> > > +       }
> > > +
> > > +       /*
> > > +        * A this point, if the zerolax server is not armed, and the deadline
> > > +        * is in the future, throttle the server and arm the zerolax timer.
> > > +        */
> > > +       if (dl_se->dl_zerolax &&
> > > +           dl_time_before(dl_se->deadline - dl_se->runtime, rq_clock(rq))) {
> > > +               if (!is_dl_boosted(dl_se)) {
> > > +                       dl_se->dl_zerolax_armed = 1;
> > > +                       dl_se->dl_throttled = 1;
> > > +                       start_dl_timer(dl_se);
> > > +               }
> > > +       }
> > >  }
> > >
> > >  /*
> > > @@ -1024,6 +1055,13 @@ static void update_dl_entity(struct sched_dl_entity *dl_se)
> > >                 }
> > >
> > >                 replenish_dl_new_period(dl_se, rq);
> > > +       } else if (dl_server(dl_se) && dl_se->dl_zerolax) {
> > > +               /*
> > > +                * The server can still use its previous deadline, so throttle
> > > +                * and arm the zero-laxity timer.
> > > +                */
> > > +               dl_se->dl_zerolax_armed = 1;
> > > +               dl_se->dl_throttled = 1;
> > >         }
> > >  }
> > >
> > > @@ -1056,8 +1094,20 @@ static int start_dl_timer(struct sched_dl_entity *dl_se)
> > >          * We want the timer to fire at the deadline, but considering
> > >          * that it is actually coming from rq->clock and not from
> > >          * hrtimer's time base reading.
> > > +        *
> > > +        * The zerolax reservation will have its timer set to the
> > > +        * deadline - runtime. At that point, the CBS rule will decide
> > > +        * if the current deadline can be used, or if a replenishment
> > > +        * is required to avoid add too much pressure on the system
> > > +        * (current u > U).
> > >          */
> > > -       act = ns_to_ktime(dl_next_period(dl_se));
> > > +       if (dl_se->dl_zerolax_armed) {
> > > +               WARN_ON_ONCE(!dl_se->dl_throttled);
> > > +               act = ns_to_ktime(dl_se->deadline - dl_se->runtime);
> >
> > Just a question, here if dl_se->deadline - dl_se->runtime is large,
> > then does that mean that server activation will be much more into the
> > future? So say I want to give CFS 30%, then it will take 70% of the
> > period before CFS preempts RT thus "starving" CFS for this duration. I
> > think that's Ok for smaller periods and runtimes, though.
> >
> > I think it does reserve the amount of required CFS bandwidth so it is
> > probably OK, though it is perhaps letting RT run more initially (say
> > if CFS tasks are not CPU bound and occasionally wake up, they will
> > always be hit by the 70% latency AFAICS which may be large for large
> > periods and small runtimes).
> >
>
> One more consideration I guess is, because the server is throttled
> till 0-laxity time, it is possible that if CFS sleeps even a bit
> (after the DL-server is unthrottled), then it will be pushed out to a
> full current deadline + period due to CBS. In such a situation,  if
> CFS-server is the only DL task running, it might starve RT for a bit
> more time.
>
> Example, say CFS runtime is 0.3s and period is 1s. At 0.7s, 0-laxity
> timer fires. CFS runs for 0.29s, then sleeps for 0.005s and wakes up
> at 0.295s (its remaining runtime is 0.01s at this point which is < the
> "time till deadline" of 0.005s). Now the runtime of the CFS-server
> will be replenished to the full 3s (due to CBS) and the deadline
> pushed out. The end result is the total runtime that the CFS-server
> actually gets is 0.0595s (though yes it did sleep for 5ms in between,
> still that's tiny -- say if it briefly blocked on a kernel mutex).

Blah, I got lost in decimal points. Here's the example again:

Say CFS-server runtime is 0.3s and period is 1s.

At 0.7s, 0-laxity timer fires. CFS runs for 0.29s, then sleeps for
0.005s and wakes up at 0.295s (its remaining runtime is 0.01s at this
point which is < the "time till deadline" of 0.005s)

Now the runtime of the CFS-server will be replenished to the full 0.3s
(due to CBS) and the deadline
pushed out.

The end result is, the total runtime that the CFS-server actually gets
is 0.595s (though yes it did sleep for 5ms in between, still that's
tiny -- say if it briefly blocked on a kernel mutex). That's almost
double the allocated runtime.

This is just theoretical and I have yet to see if it is actually an
issue in practice.

Thanks.
Re: [PATCH v5 6/7] sched/deadline: Deferrable dl server
Posted by Steven Rostedt 2 years, 1 month ago
On Mon, 6 Nov 2023 16:37:32 -0500
Joel Fernandes <joel@joelfernandes.org> wrote:

> Say CFS-server runtime is 0.3s and period is 1s.
> 
> At 0.7s, 0-laxity timer fires. CFS runs for 0.29s, then sleeps for
> 0.005s and wakes up at 0.295s (its remaining runtime is 0.01s at this
> point which is < the "time till deadline" of 0.005s)
> 
> Now the runtime of the CFS-server will be replenished to the full 0.3s
> (due to CBS) and the deadline
> pushed out.
> 
> The end result is, the total runtime that the CFS-server actually gets
> is 0.595s (though yes it did sleep for 5ms in between, still that's
> tiny -- say if it briefly blocked on a kernel mutex). That's almost
> double the allocated runtime.
> 
> This is just theoretical and I have yet to see if it is actually an
> issue in practice.

Let me see if I understand what you are asking. By pushing the execution of
the CFS-server to the end of its period, if it it was briefly blocked and
was not able to consume all of its zerolax time, its bandwidth gets
refreshed. Then it can run again, basically doubling its total time.

But this is basically saying that it ran for its runtime at the start of
one period and at the beginning of another, right?

Is that an issue? The CFS-server is still just consuming it's time per
period. That means that an RT tasks was starving the system that much to
push it forward too much anyway. I wonder if we just document this
behavior, if that would be enough?

-- Steve
Re: [PATCH v5 6/7] sched/deadline: Deferrable dl server
Posted by Joel Fernandes 2 years, 1 month ago
On Tue, Nov 07, 2023 at 11:47:32AM -0500, Steven Rostedt wrote:
> On Mon, 6 Nov 2023 16:37:32 -0500
> Joel Fernandes <joel@joelfernandes.org> wrote:
> 
> > Say CFS-server runtime is 0.3s and period is 1s.
> > 
> > At 0.7s, 0-laxity timer fires. CFS runs for 0.29s, then sleeps for
> > 0.005s and wakes up at 0.295s (its remaining runtime is 0.01s at this
> > point which is < the "time till deadline" of 0.005s)
> > 
> > Now the runtime of the CFS-server will be replenished to the full 0.3s
> > (due to CBS) and the deadline
> > pushed out.
> > 
> > The end result is, the total runtime that the CFS-server actually gets
> > is 0.595s (though yes it did sleep for 5ms in between, still that's
> > tiny -- say if it briefly blocked on a kernel mutex). That's almost
> > double the allocated runtime.
> > 
> > This is just theoretical and I have yet to see if it is actually an
> > issue in practice.
> 
> Let me see if I understand what you are asking. By pushing the execution of
> the CFS-server to the end of its period, if it it was briefly blocked and
> was not able to consume all of its zerolax time, its bandwidth gets
> refreshed. Then it can run again, basically doubling its total time.

I think my assumption about what happens during blocking was wrong. If it
blocked, the server is actually stopped via dl_server_stop() and it starts
all over again on enqueue.

That makes me worry about the opposite issue now. If the server restarts
because it blocked briefly, that means again it starts in a throttled state
and has to wait to run till zero-lax time. If CFS is a 99% load but blocks
very briefly after getting to run a little bit (totalling 1% of the time),
then it wont get 30% because it will keep getting delayed to the new 0-lax
every time it wakes up from its very-brief nap. Is that really Ok?

> But this is basically saying that it ran for its runtime at the start of
> one period and at the beginning of another, right?

I am not sure if this can happen but I could be missing something. AFAICS,
there is no scenario where the DL server gets to run at the start of a new
period unless RT is not running. The way the patch is written AFAICS,
whenever the DL-server runs out of runtime, it gets throttled and a timer
fires to go off at the beginning of the next period.
(update_curr_dl_se() -> dl_runtime_exceeded() -> start_dl_timer()).

In this timer handler (which fired at next period beginning), it will
actually replenish_dl_entity() to refresh the runtime and push the period
forward. Then it will throttle the server till the 0-lax time. That  means we
always end up running at the 0-lax time when starting a new period if RT is
running, and never at the beginning. Did I miss something?

On the other hand, if it does not run out of runtime, it will keep running
within its 0-lax time. We know there is enough time within its 0-lax time for
it to run because when we unthrottled it, we checked for that.

Switching gears, another (most likely theoretical) concern I had is what if
the 0-lax timer interrupt gets delayed a little bit. Then we will always end
up not having enough 0-lax time and keep requeuing the timer, that means CFS
will be starved always as we keep pushing the execution to the next period's
0-lax time.

Anyway, I guess I better get to testing this stuff tomorrow and day after on
ChromeOS before LPC starts. Personally I feel this is a great first cut and
hope we can get v5 into mainline and iteratively improve. :)

thanks,

 - Joel
Re: [PATCH v5 6/7] sched/deadline: Deferrable dl server
Posted by Daniel Bristot de Oliveira 2 years, 1 month ago
On 11/7/23 17:47, Steven Rostedt wrote:
> On Mon, 6 Nov 2023 16:37:32 -0500
> Joel Fernandes <joel@joelfernandes.org> wrote:
> 
>> Say CFS-server runtime is 0.3s and period is 1s.
>>
>> At 0.7s, 0-laxity timer fires. CFS runs for 0.29s, then sleeps for
>> 0.005s and wakes up at 0.295s (its remaining runtime is 0.01s at this
>> point which is < the "time till deadline" of 0.005s)
>>
>> Now the runtime of the CFS-server will be replenished to the full 0.3s
>> (due to CBS) and the deadline
>> pushed out.
>>
>> The end result is, the total runtime that the CFS-server actually gets
>> is 0.595s (though yes it did sleep for 5ms in between, still that's
>> tiny -- say if it briefly blocked on a kernel mutex). That's almost
>> double the allocated runtime.
>>
>> This is just theoretical and I have yet to see if it is actually an
>> issue in practice.
> 
> Let me see if I understand what you are asking. By pushing the execution of
> the CFS-server to the end of its period, if it it was briefly blocked and
> was not able to consume all of its zerolax time, its bandwidth gets
> refreshed. Then it can run again, basically doubling its total time.
> 
> But this is basically saying that it ran for its runtime at the start of
> one period and at the beginning of another, right?
> 
> Is that an issue? The CFS-server is still just consuming it's time per
> period. That means that an RT tasks was starving the system that much to
> push it forward too much anyway. I wonder if we just document this
> behavior, if that would be enough?

The code is not doing what I intended because I thought it was doing overload
control on the replenishment, but it is not (my bad).

he is seeing this timeline:

- w=waiting
- r=running
- s=sleeping
- T=throttled
- 3/10 reservation (30%).

|wwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwww|rrrrrrrrrrrrrrrrrrrrrrrrrrr|s|rrrrrrrrr+rrrrrrrr+rrrrrrrrr|TTTTTTTTTT <CPU
|___________________________period 1_______________________________________________________________|________period 2_______________________ < internal-period
0---------1---------2---------3---------4---------5---------6--------7--------8---------9----------10.......11.......12.........13......... < Real-time

It is not actually that bad because the ~2x runtime is over 2 periods.

But it is not what I intended... I intended this:

|wwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwww|rrrrrrrrrrrrrrrrrrrrrrrrrrrrsr|TTTTTTTTTT[...]TTTTTTTTTTT|rrrrrrrrrrrrrrrrrrrrrrrrrrrrrrr|TTTTTTT
|___________________________period 1_________________________________|_________period 2________________________[...]___________|___period 3____________________|[.... internal-period
0---------1---------2---------3---------4---------5---------6--------7--------8---------9----------10.......11.[...]16.........17........18........19........20|[.... < Real-time
---------------------------------------------------------------------+---------------------------------------------------------|
                                                                     |                                                         +new period
                                                                     +30/30>30/100, thus new period.

At the replenishment time, if the runtime left/period left > dl_rutime/dl_period,
replenish with a new period to avoid adding to much pressure to CBS/EDF.

One might say: but then the task period is different... or out of sync...
but it is not a problem: look at the "real-time"... the task starts and
run at the "deadline - runtime...." emulating the "zerolax"
(note, I do not like the term zerolax here... but (thomas voice:) whatever :-)).

One could say: in presence of deadline, this timelime will be different...

But that is intentional, as we do not want the fair server to break DL. But more
than that, if one has DL tasks, FIFO latency "property" is broken, and they should
just disable the defer option....

that is what I mentioned at the log:

"If the fair server reaches the zerolax time without consuming
its runtime, the server will be boosted, following CBS rules
(thus without breaking SCHED_DEADLINE)."

by the rule I meant doing the overload check... I thought it was
there already... but it was not... there was no need for it.

I am working on it... it is a simple change (but I need to test).

-- Daniel
Re: [PATCH v5 6/7] sched/deadline: Deferrable dl server
Posted by Daniel Bristot de Oliveira 2 years, 1 month ago
> The code is not doing what I intended because I thought it was doing overload
> control on the replenishment, but it is not (my bad).
> 

I am still testing but... it is missing something like this (famous last words).

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 1092ca8892e0..6e2d21c47a04 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -842,6 +842,8 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se)
  * runtime, or it just underestimated it during sched_setattr().
  */
 static int start_dl_timer(struct sched_dl_entity *dl_se);
+static bool dl_entity_overflow(struct sched_dl_entity *dl_se, u64 t);
+
 static void replenish_dl_entity(struct sched_dl_entity *dl_se)
 {
 	struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
@@ -852,9 +854,18 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se)
 	/*
 	 * This could be the case for a !-dl task that is boosted.
 	 * Just go with full inherited parameters.
+	 *
+	 * Or, it could be the case of a zerolax reservation that
+	 * was not able to consume its runtime in background and
+	 * reached this point with current u > U.
+	 *
+	 * In both cases, set a new period.
 	 */
-	if (dl_se->dl_deadline == 0)
-		replenish_dl_new_period(dl_se, rq);
+	if (dl_se->dl_deadline == 0 ||
+		(dl_se->dl_zerolax_armed && dl_entity_overflow(dl_se, rq_clock(rq)))) {
+			dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline;
+			dl_se->runtime = pi_of(dl_se)->dl_runtime;
+	}

 	if (dl_se->dl_yielded && dl_se->runtime > 0)
 		dl_se->runtime = 0;
Re: [PATCH v5 6/7] sched/deadline: Deferrable dl server
Posted by Peter Zijlstra 2 years, 1 month ago
On Tue, Nov 07, 2023 at 07:50:28PM +0100, Daniel Bristot de Oliveira wrote:
> > The code is not doing what I intended because I thought it was doing overload
> > control on the replenishment, but it is not (my bad).
> > 
> 
> I am still testing but... it is missing something like this (famous last words).
> 
> diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
> index 1092ca8892e0..6e2d21c47a04 100644
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -842,6 +842,8 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se)
>   * runtime, or it just underestimated it during sched_setattr().
>   */
>  static int start_dl_timer(struct sched_dl_entity *dl_se);
> +static bool dl_entity_overflow(struct sched_dl_entity *dl_se, u64 t);
> +
>  static void replenish_dl_entity(struct sched_dl_entity *dl_se)
>  {
>  	struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
> @@ -852,9 +854,18 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se)
>  	/*
>  	 * This could be the case for a !-dl task that is boosted.
>  	 * Just go with full inherited parameters.
> +	 *
> +	 * Or, it could be the case of a zerolax reservation that
> +	 * was not able to consume its runtime in background and
> +	 * reached this point with current u > U.
> +	 *
> +	 * In both cases, set a new period.
>  	 */
> -	if (dl_se->dl_deadline == 0)
> -		replenish_dl_new_period(dl_se, rq);
> +	if (dl_se->dl_deadline == 0 ||
> +		(dl_se->dl_zerolax_armed && dl_entity_overflow(dl_se, rq_clock(rq)))) {
> +			dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline;
> +			dl_se->runtime = pi_of(dl_se)->dl_runtime;
> +	}
> 
>  	if (dl_se->dl_yielded && dl_se->runtime > 0)
>  		dl_se->runtime = 0;

Should we rather not cap the runtime, something like so?

Because the above also causes period drift, which we do not want.

---
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 58b542bf2893..1453a2cd0680 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -829,10 +829,12 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se)
  */
 static void replenish_dl_entity(struct sched_dl_entity *dl_se)
 {
+	struct sched_dl_entity *pi_se = pi_of(dl_se);
 	struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
 	struct rq *rq = rq_of_dl_rq(dl_rq);
+	u64 dl_runtime = pi_se->dl_runtime;
 
-	WARN_ON_ONCE(pi_of(dl_se)->dl_runtime <= 0);
+	WARN_ON_ONCE(dl_runtime <= 0);
 
 	/*
 	 * This could be the case for a !-dl task that is boosted.
@@ -851,10 +853,13 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se)
 	 * arbitrary large.
 	 */
 	while (dl_se->runtime <= 0) {
-		dl_se->deadline += pi_of(dl_se)->dl_period;
-		dl_se->runtime += pi_of(dl_se)->dl_runtime;
+		dl_se->deadline += pi_se->dl_period;
+		dl_se->runtime += dl_runtime;
 	}
 
+	if (dl_se->zerolax && dl_se->runtime > dl_runtime)
+		dl_se->runtime = dl_runtime;
+
 	/*
 	 * At this point, the deadline really should be "in
 	 * the future" with respect to rq->clock. If it's
Re: [PATCH v5 6/7] sched/deadline: Deferrable dl server
Posted by Juri Lelli 2 years, 1 month ago
Hi Peter,

On 08/11/23 13:44, Peter Zijlstra wrote:
> On Tue, Nov 07, 2023 at 07:50:28PM +0100, Daniel Bristot de Oliveira wrote:
> > > The code is not doing what I intended because I thought it was doing overload
> > > control on the replenishment, but it is not (my bad).
> > > 
> > 
> > I am still testing but... it is missing something like this (famous last words).
> > 
> > diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
> > index 1092ca8892e0..6e2d21c47a04 100644
> > --- a/kernel/sched/deadline.c
> > +++ b/kernel/sched/deadline.c
> > @@ -842,6 +842,8 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se)
> >   * runtime, or it just underestimated it during sched_setattr().
> >   */
> >  static int start_dl_timer(struct sched_dl_entity *dl_se);
> > +static bool dl_entity_overflow(struct sched_dl_entity *dl_se, u64 t);
> > +
> >  static void replenish_dl_entity(struct sched_dl_entity *dl_se)
> >  {
> >  	struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
> > @@ -852,9 +854,18 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se)
> >  	/*
> >  	 * This could be the case for a !-dl task that is boosted.
> >  	 * Just go with full inherited parameters.
> > +	 *
> > +	 * Or, it could be the case of a zerolax reservation that
> > +	 * was not able to consume its runtime in background and
> > +	 * reached this point with current u > U.
> > +	 *
> > +	 * In both cases, set a new period.
> >  	 */
> > -	if (dl_se->dl_deadline == 0)
> > -		replenish_dl_new_period(dl_se, rq);
> > +	if (dl_se->dl_deadline == 0 ||
> > +		(dl_se->dl_zerolax_armed && dl_entity_overflow(dl_se, rq_clock(rq)))) {
> > +			dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline;
> > +			dl_se->runtime = pi_of(dl_se)->dl_runtime;
> > +	}
> > 
> >  	if (dl_se->dl_yielded && dl_se->runtime > 0)
> >  		dl_se->runtime = 0;
> 
> Should we rather not cap the runtime, something like so?
> 
> Because the above also causes period drift, which we do not want.

I was honestly also concerned with the drift, but then thought it might
not be an issue for the dl_server (zerolax), as it doesn't have a
userspace counterpart that relies on synchronized clocks?

> 
> ---
> diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
> index 58b542bf2893..1453a2cd0680 100644
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -829,10 +829,12 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se)
>   */
>  static void replenish_dl_entity(struct sched_dl_entity *dl_se)
>  {
> +	struct sched_dl_entity *pi_se = pi_of(dl_se);
>  	struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
>  	struct rq *rq = rq_of_dl_rq(dl_rq);
> +	u64 dl_runtime = pi_se->dl_runtime;
>  
> -	WARN_ON_ONCE(pi_of(dl_se)->dl_runtime <= 0);
> +	WARN_ON_ONCE(dl_runtime <= 0);
>  
>  	/*
>  	 * This could be the case for a !-dl task that is boosted.
> @@ -851,10 +853,13 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se)
>  	 * arbitrary large.
>  	 */
>  	while (dl_se->runtime <= 0) {
> -		dl_se->deadline += pi_of(dl_se)->dl_period;
> -		dl_se->runtime += pi_of(dl_se)->dl_runtime;
> +		dl_se->deadline += pi_se->dl_period;
> +		dl_se->runtime += dl_runtime;
>  	}
>  
> +	if (dl_se->zerolax && dl_se->runtime > dl_runtime)
> +		dl_se->runtime = dl_runtime;
> +

Anyway, I have the impression that this breaks EDF/CBS, as we are letting
the dl_server run with full dl_runtime w/o postponing the period
(essentially an u = 1 reservation until runtime is depleted).

I would say we need to either do

dl_se->deadline += pi_of(dl_se)->dl_period;
dl_se->runtime = pi_of(dl_se)->dl_runtime;

or (as Daniel proposed)

dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline;
dl_se->runtime = pi_of(dl_se)->dl_runtime;

and I seem to be inclined towards the latter, as the former would
essentially reduce dl_server bandwidth under dl_runtime/dl_period at
times.

Best,
Juri
Re: [PATCH v5 6/7] sched/deadline: Deferrable dl server
Posted by Peter Zijlstra 2 years, 1 month ago
On Wed, Nov 08, 2023 at 04:14:18PM +0100, Juri Lelli wrote:
> > +	if (dl_se->zerolax && dl_se->runtime > dl_runtime)
> > +		dl_se->runtime = dl_runtime;
> > +
> 
> Anyway, I have the impression that this breaks EDF/CBS, as we are letting
> the dl_server run with full dl_runtime w/o postponing the period
> (essentially an u = 1 reservation until runtime is depleted).

Yeah, I sorted it with Daniel, we were not trying to fix the same
problem :-)
Re: [PATCH v5 6/7] sched/deadline: Deferrable dl server
Posted by Daniel Bristot de Oliveira 2 years, 1 month ago
On 11/8/23 13:44, Peter Zijlstra wrote:
> Should we rather not cap the runtime, something like so?
> 
> Because the above also causes period drift, which we do not want.

like in the example I showed before:

- 3/10 reservation (30%).
- w=waiting
- r=running
- s=sleeping
- T=throttled
- fair server dispatched at 0, starvation from RT.


|wwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwww|rrrrrrrrrrrrrrrrrrrrrrrrrrrrrr|TTTTTTTTTT[...]TTTTTTTTTTT|rrrrrrrrrrrrrrrrrrrrrrrrrrrrrrr|TTTTTTT
|___________________________period 1_________________________________|_________period 2________________________[...]___________|___period 3____________________|[.... internal-period
0---------1---------2---------3---------4---------5---------6--------7--------8---------9----------10.......11.[...]16.........17........18........19........20|[.... < Real-time
---------------------------------------------------------------------+---------------------------------------------------------|
                                                                     |                                                         +new period

From "real-world/wall clock" the internal period shift produces the
"zerolax" timeline. It runs 3 units of time before the 10's.

If one has a mix of DL and FIFO task, and want to enforce
a given response time to the fair server, they can reduce the
fair server period to achieve that.

-- Daniel
Re: [PATCH v5 6/7] sched/deadline: Deferrable dl server
Posted by Daniel Bristot de Oliveira 2 years, 1 month ago
On 11/8/23 13:44, Peter Zijlstra wrote:
> Because the above also causes period drift, which we do not want.

The period drift is not a problem when we do not have DL tasks because
... we do not have dl tasks. The task will run for runtime.

But not doing the period drift is bad if we have DL tasks because we
break the (current u <= U) rule... which breaks CBS/EDF.
Re: [PATCH v5 6/7] sched/deadline: Deferrable dl server
Posted by Peter Zijlstra 2 years, 1 month ago
On Wed, Nov 08, 2023 at 01:44:01PM +0100, Peter Zijlstra wrote:

> Should we rather not cap the runtime, something like so?
> 

Clearly I should've done the patch against a tree that includes the
changes... 

> ---
> diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
> index 58b542bf2893..1453a2cd0680 100644
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -829,10 +829,12 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se)
>   */
>  static void replenish_dl_entity(struct sched_dl_entity *dl_se)
>  {
> +	struct sched_dl_entity *pi_se = pi_of(dl_se);
>  	struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
>  	struct rq *rq = rq_of_dl_rq(dl_rq);
> +	u64 dl_runtime = pi_se->dl_runtime;
>  
> -	WARN_ON_ONCE(pi_of(dl_se)->dl_runtime <= 0);
> +	WARN_ON_ONCE(dl_runtime <= 0);
>  
>  	/*
>  	 * This could be the case for a !-dl task that is boosted.
> @@ -851,10 +853,13 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se)
>  	 * arbitrary large.
>  	 */
>  	while (dl_se->runtime <= 0) {
> -		dl_se->deadline += pi_of(dl_se)->dl_period;
> -		dl_se->runtime += pi_of(dl_se)->dl_runtime;
> +		dl_se->deadline += pi_se->dl_period;
> +		dl_se->runtime += dl_runtime;
>  	}
>  
> +	if (dl_se->zerolax && dl_se->runtime > dl_runtime)
> +		dl_se->runtime = dl_runtime;
> +

This should ofcourse go in the if (dl_se->dl_zerolax_armed) branch a
little down from here.

>  	/*
>  	 * At this point, the deadline really should be "in
>  	 * the future" with respect to rq->clock. If it's
Re: [PATCH v5 6/7] sched/deadline: Deferrable dl server
Posted by Daniel Bristot de Oliveira 2 years, 1 month ago
On 11/8/23 13:50, Peter Zijlstra wrote:
>> ---
>> diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
>> index 58b542bf2893..1453a2cd0680 100644
>> --- a/kernel/sched/deadline.c
>> +++ b/kernel/sched/deadline.c
>> @@ -829,10 +829,12 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se)
>>   */
>>  static void replenish_dl_entity(struct sched_dl_entity *dl_se)>>  {

assuming starting rt, 3/10 params:

it arrives here with:

	runtime = 3
	laxity = 10 - 7 = 3
	u = 1

>> +	struct sched_dl_entity *pi_se = pi_of(dl_se);
>>  	struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
>>  	struct rq *rq = rq_of_dl_rq(dl_rq);
>> +	u64 dl_runtime = pi_se->dl_runtime;
>>  
>> -	WARN_ON_ONCE(pi_of(dl_se)->dl_runtime <= 0);
>> +	WARN_ON_ONCE(dl_runtime <= 0);
>>  
>>  	/*
>>  	 * This could be the case for a !-dl task that is boosted.
>> @@ -851,10 +853,13 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se)
>>  	 * arbitrary large.
>>  	 */

skip the while because runtime = 3 > 0

>>  	while (dl_se->runtime <= 0) {
>> -		dl_se->deadline += pi_of(dl_se)->dl_period;
>> -		dl_se->runtime += pi_of(dl_se)->dl_runtime;
>> +		dl_se->deadline += pi_se->dl_period;
>> +		dl_se->runtime += dl_runtime;
>>  	}

runtime is already = dl_runtime...

>> +	if (dl_se->zerolax && dl_se->runtime > dl_runtime)
>> +		dl_se->runtime = dl_runtime;
>> +

There is a way to cap it: it is doing the revised wakeup rule...
the runtime will become 1. That is not what we want...

and we would have to keep arming the server... while shifting the
(internal) period puts the scheduler in the regular case :-)

Externally, e.g., the user with the mouse his laptop, sees the
"zerolax" timeline... :-)

i.e., after at most 7, they get 3, before 10.

it is simpler...

and breaking the U thing is breaking GRUB, admission control.. and so on...
by default - not in a overload DL overload scenario... it is by default :-/.

> This should ofcourse go in the if (dl_se->dl_zerolax_armed) branch a
> little down from here.
Re: [PATCH v5 6/7] sched/deadline: Deferrable dl server
Posted by Joel Fernandes 2 years, 1 month ago
Hi Daniel,

On Tue, Nov 7, 2023 at 1:50 PM Daniel Bristot de Oliveira
<bristot@kernel.org> wrote:
>
> > The code is not doing what I intended because I thought it was doing overload
> > control on the replenishment, but it is not (my bad).
> >
>
> I am still testing but... it is missing something like this (famous last words).
>
> diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
> index 1092ca8892e0..6e2d21c47a04 100644
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -842,6 +842,8 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se)
>   * runtime, or it just underestimated it during sched_setattr().
>   */
>  static int start_dl_timer(struct sched_dl_entity *dl_se);
> +static bool dl_entity_overflow(struct sched_dl_entity *dl_se, u64 t);
> +
>  static void replenish_dl_entity(struct sched_dl_entity *dl_se)
>  {
>         struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
> @@ -852,9 +854,18 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se)
>         /*
>          * This could be the case for a !-dl task that is boosted.
>          * Just go with full inherited parameters.
> +        *
> +        * Or, it could be the case of a zerolax reservation that
> +        * was not able to consume its runtime in background and
> +        * reached this point with current u > U.
> +        *
> +        * In both cases, set a new period.
>          */
> -       if (dl_se->dl_deadline == 0)
> -               replenish_dl_new_period(dl_se, rq);
> +       if (dl_se->dl_deadline == 0 ||
> +               (dl_se->dl_zerolax_armed && dl_entity_overflow(dl_se, rq_clock(rq)))) {
> +                       dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline;
> +                       dl_se->runtime = pi_of(dl_se)->dl_runtime;
> +       }
>
>         if (dl_se->dl_yielded && dl_se->runtime > 0)
>                 dl_se->runtime = 0;

I was wondering does this mean GRUB needs to be enabled? Otherwise I
can see that "runtime / (deadline - t) > dl_runtime / dl_deadline"
will be true almost all the time due to the constraint of executing at
the 0-lax time.

Because at the 0-lax time, AFAICS this will be 100% > 30% (say if CFS
has a 30% reservation).

And I think even if GRUB is enabled, it is possible other DL task may
have reserved bandwidth.

Or is there a subtlety that makes that not possible?

thanks,

 - Joel
Re: [PATCH v5 6/7] sched/deadline: Deferrable dl server
Posted by Daniel Bristot de Oliveira 2 years, 1 month ago
On 11/8/23 04:20, Joel Fernandes wrote:
> Hi Daniel,
> 
> On Tue, Nov 7, 2023 at 1:50 PM Daniel Bristot de Oliveira
> <bristot@kernel.org> wrote:
>>
>>> The code is not doing what I intended because I thought it was doing overload
>>> control on the replenishment, but it is not (my bad).
>>>
>>
>> I am still testing but... it is missing something like this (famous last words).
>>
>> diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
>> index 1092ca8892e0..6e2d21c47a04 100644
>> --- a/kernel/sched/deadline.c
>> +++ b/kernel/sched/deadline.c
>> @@ -842,6 +842,8 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se)
>>   * runtime, or it just underestimated it during sched_setattr().
>>   */
>>  static int start_dl_timer(struct sched_dl_entity *dl_se);
>> +static bool dl_entity_overflow(struct sched_dl_entity *dl_se, u64 t);
>> +
>>  static void replenish_dl_entity(struct sched_dl_entity *dl_se)
>>  {
>>         struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
>> @@ -852,9 +854,18 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se)
>>         /*
>>          * This could be the case for a !-dl task that is boosted.
>>          * Just go with full inherited parameters.
>> +        *
>> +        * Or, it could be the case of a zerolax reservation that
>> +        * was not able to consume its runtime in background and
>> +        * reached this point with current u > U.
>> +        *
>> +        * In both cases, set a new period.
>>          */
>> -       if (dl_se->dl_deadline == 0)
>> -               replenish_dl_new_period(dl_se, rq);
>> +       if (dl_se->dl_deadline == 0 ||
>> +               (dl_se->dl_zerolax_armed && dl_entity_overflow(dl_se, rq_clock(rq)))) {
>> +                       dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline;
>> +                       dl_se->runtime = pi_of(dl_se)->dl_runtime;
>> +       }
>>
>>         if (dl_se->dl_yielded && dl_se->runtime > 0)
>>                 dl_se->runtime = 0;
> 
> I was wondering does this mean GRUB needs to be enabled? Otherwise I
> can see that "runtime / (deadline - t) > dl_runtime / dl_deadline"
> will be true almost all the time due to the constraint of executing at
> the 0-lax time.

No grub needed. It will only happen if the fair server did not have any chance to run.

If it happens, it is not a problem, see that timeline I replied in the previous
email.

We do not want a zerolax scheduler, because it breaks everything else. It is
a deferred EDF, that looking from wall clock, composes an "zerolaxish" timeline.

> Because at the 0-lax time, AFAICS this will be 100% > 30% (say if CFS
> has a 30% reservation).
> 
> And I think even if GRUB is enabled, it is possible other DL task may
> have reserved bandwidth.
> 
> Or is there a subtlety that makes that not possible?
> 
> thanks,
> 
>  - Joel

Re: [PATCH v5 6/7] sched/deadline: Deferrable dl server
Posted by Joel Fernandes 2 years, 1 month ago
On Wed, Nov 08, 2023 at 09:01:17AM +0100, Daniel Bristot de Oliveira wrote:
> On 11/8/23 04:20, Joel Fernandes wrote:
> > Hi Daniel,
> > 
> > On Tue, Nov 7, 2023 at 1:50 PM Daniel Bristot de Oliveira
> > <bristot@kernel.org> wrote:
> >>
> >>> The code is not doing what I intended because I thought it was doing overload
> >>> control on the replenishment, but it is not (my bad).
> >>>
> >>
> >> I am still testing but... it is missing something like this (famous last words).
> >>
> >> diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
> >> index 1092ca8892e0..6e2d21c47a04 100644
> >> --- a/kernel/sched/deadline.c
> >> +++ b/kernel/sched/deadline.c
> >> @@ -842,6 +842,8 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se)
> >>   * runtime, or it just underestimated it during sched_setattr().
> >>   */
> >>  static int start_dl_timer(struct sched_dl_entity *dl_se);
> >> +static bool dl_entity_overflow(struct sched_dl_entity *dl_se, u64 t);
> >> +
> >>  static void replenish_dl_entity(struct sched_dl_entity *dl_se)
> >>  {
> >>         struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
> >> @@ -852,9 +854,18 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se)
> >>         /*
> >>          * This could be the case for a !-dl task that is boosted.
> >>          * Just go with full inherited parameters.
> >> +        *
> >> +        * Or, it could be the case of a zerolax reservation that
> >> +        * was not able to consume its runtime in background and
> >> +        * reached this point with current u > U.
> >> +        *
> >> +        * In both cases, set a new period.
> >>          */
> >> -       if (dl_se->dl_deadline == 0)
> >> -               replenish_dl_new_period(dl_se, rq);
> >> +       if (dl_se->dl_deadline == 0 ||
> >> +               (dl_se->dl_zerolax_armed && dl_entity_overflow(dl_se, rq_clock(rq)))) {
> >> +                       dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline;
> >> +                       dl_se->runtime = pi_of(dl_se)->dl_runtime;
> >> +       }
> >>
> >>         if (dl_se->dl_yielded && dl_se->runtime > 0)
> >>                 dl_se->runtime = 0;
> > 
> > I was wondering does this mean GRUB needs to be enabled? Otherwise I
> > can see that "runtime / (deadline - t) > dl_runtime / dl_deadline"
> > will be true almost all the time due to the constraint of executing at
> > the 0-lax time.
> 
> No grub needed. It will only happen if the fair server did not have any chance to run.
> 
> If it happens, it is not a problem, see that timeline I replied in the previous
> email.

Ah you're right, I mistakenly read your diff assuming you were calling
replenish_dl_new_period() on dl_entity_overflow(). Indeed the diff is needed
(I was actually wondering about why that was not done in my initial review as
well -- so its good we found it in discussion).

> We do not want a zerolax scheduler, because it breaks everything else. It is
> a deferred EDF, that looking from wall clock, composes an "zerolaxish" timeline.

Indeed. I was not intending that we do zerolax scheduler, I was merely
misreading the diff assuming you were throttling the DL-server once again at
the zerolax time.

thanks,

 - Joel

Re: [PATCH v5 6/7] sched/deadline: Deferrable dl server
Posted by Steven Rostedt 2 years, 1 month ago
On Tue, 7 Nov 2023 11:47:32 -0500
Steven Rostedt <rostedt@goodmis.org> wrote:

> Let me see if I understand what you are asking. By pushing the execution of
> the CFS-server to the end of its period, if it it was briefly blocked and
> was not able to consume all of its zerolax time, its bandwidth gets
> refreshed. Then it can run again, basically doubling its total time.
> 
> But this is basically saying that it ran for its runtime at the start of
> one period and at the beginning of another, right?
> 
> Is that an issue? The CFS-server is still just consuming it's time per
> period. That means that an RT tasks was starving the system that much to
> push it forward too much anyway. I wonder if we just document this
> behavior, if that would be enough?

I may have even captured this scenario.

I ran my migrate[1] program which I use to test RT migration, and it kicks
off a bunch of RT tasks. I like this test because with the
/proc/sys/kernel/sched_rt_* options set, it shows the lines where they are
throttled really well.

This time, I disabled those, and just kept the default:

~# cat /sys/kernel/debug/sched/rq/cpu0/fair_server_defer
1

~# cat /sys/kernel/debug/sched/rq/cpu0/fair_server_period 
1000000000

~# cat /sys/kernel/debug/sched/rq/cpu0/fair_server_runtime 
50000000

And ran my userspin[2] program. And recorded it with:

  trace-cmd record -e sched_switch

The kernelshark output shows the delay from userspin taking up 0.1 seconds
(double the time usually given), with a little preemption in between.

-- Steve
Re: [PATCH v5 6/7] sched/deadline: Deferrable dl server
Posted by Steven Rostedt 2 years, 1 month ago
What's more interesting, when looking at the userspin task, I see a lot of this:

         migrate-1153  [003]  1272.988097: sched_switch:         migrate:1153 [90] S ==> userspin:1135 [120]
        userspin-1135  [003]  1272.988111: sched_switch:         userspin:1135 [120] R ==> migrate:1146 [97]

userspin sneaks in for 14 microseconds

         migrate-1146  [003]  1272.988141: sched_switch:         migrate:1146 [97] R ==> migrate:1159 [84]
         migrate-1159  [003]  1272.988159: print:                tracing_mark_write: thread 13 iter 15, took lock 15020 in 140726333419648 us
         migrate-1159  [003]  1272.992161: print:                tracing_mark_write: thread 13 iter 15, unlock lock 6
         migrate-1159  [003]  1272.992169: print:                tracing_mark_write: thread 13 iter 15 sleeping
         migrate-1159  [003]  1272.992177: sched_switch:         migrate:1159 [84] S ==> userspin:1135 [120]
        userspin-1135  [003]  1272.992190: sched_switch:         userspin:1135 [120] R ==> migrate:1150 [93]

Again for 13 microseconds.

         migrate-1150  [003]  1272.995118: sched_switch:         migrate:1150 [93] R ==> migrate:1153 [90]
         migrate-1153  [003]  1272.995129: print:                tracing_mark_write: thread 7 iter 15, taking lock 5
         migrate-1153  [003]  1272.995164: print:                tracing_mark_write: thread 7 iter 15, took lock 32 in 140726333419648 us
         migrate-1153  [003]  1273.005166: print:                tracing_mark_write: thread 7 iter 15, unlock lock 5
         migrate-1153  [003]  1273.005174: print:                tracing_mark_write: thread 7 iter 15 sleeping
         migrate-1153  [003]  1273.005183: sched_switch:         migrate:1153 [90] S ==> userspin:1135 [120]
        userspin-1135  [003]  1273.005204: sched_switch:         userspin:1135 [120] R ==> migrate:1159 [84]

For 21 microseconds.

         migrate-1159  [003]  1273.005216: print:                tracing_mark_write: thread 13 iter 15, taking lock 7
         migrate-1159  [003]  1273.005271: print:                tracing_mark_write: thread 13 iter 15, took lock 53 in 140726333419648 us
         migrate-1159  [003]  1273.009273: print:                tracing_mark_write: thread 13 iter 15, unlock lock 7
         migrate-1159  [003]  1273.009281: print:                tracing_mark_write: thread 13 iter 15 sleeping
         migrate-1159  [003]  1273.009289: sched_switch:         migrate:1159 [84] S ==> userspin:1135 [120]
        userspin-1135  [003]  1273.009301: sched_switch:         userspin:1135 [120] R ==> migrate:1147 [96]

12 microseconds

         migrate-1147  [003]  1273.012205: sched_switch:         migrate:1147 [96] R ==> migrate:1153 [90]
         migrate-1153  [003]  1273.012217: print:                tracing_mark_write: thread 7 iter 15, taking lock 6
         migrate-1153  [003]  1273.012228: sched_switch:         migrate:1153 [90] S ==> userspin:1135 [120]
        userspin-1135  [003]  1273.012242: sched_switch:         userspin:1135 [120] R ==> migrate:1146 [97]
         migrate-1146  [003]  1273.014251: sched_switch:         migrate:1146 [97] R ==> migrate:1148 [95]

2 milliseconds. (which is probably fine).

         migrate-1148  [003]  1273.020300: print:                tracing_mark_write: thread 2 iter 14, unlock lock 2
         migrate-1148  [003]  1273.020302: print:                tracing_mark_write: thread 2 iter 14 sleeping
         migrate-1148  [003]  1273.020309: sched_switch:         migrate:1148 [95] S ==> userspin:1135 [120]
        userspin-1135  [003]  1273.020324: sched_switch:         userspin:1135 [120] R ==> migrate:1147 [96]

15 microseconds.

         migrate-1147  [003]  1273.020360: print:                tracing_mark_write: thread 1 iter 14, unlock lock 1
         migrate-1147  [003]  1273.020373: print:                tracing_mark_write: thread 1 iter 14 sleeping
         migrate-1147  [003]  1273.020381: sched_switch:         migrate:1147 [96] S ==> userspin:1135 [120]
        userspin-1135  [003]  1273.021397: sched_switch:         userspin:1135 [120] R ==> migrate:1147 [96]

1 millisecond.

         migrate-1147  [003]  1273.021402: print:                tracing_mark_write: thread 1 iter 14, taking lock 2
         migrate-1147  [003]  1273.021404: print:                tracing_mark_write: thread 1 iter 14, took lock 1 in 140726333419648 us
         migrate-1147  [003]  1273.022200: sched_switch:         migrate:1147 [96] R ==> migrate:1152 [91]
         migrate-1152  [003]  1273.022206: print:                tracing_mark_write: thread 6 iter 15, taking lock 6
         migrate-1152  [003]  1273.022217: sched_switch:         migrate:1152 [91] S ==> migrate:1147 [96]
         migrate-1147  [003]  1273.022289: sched_switch:         migrate:1147 [96] R ==> migrate:1159 [84]
         migrate-1159  [003]  1273.022299: print:                tracing_mark_write: thread 13 iter 16, taking lock 0
         migrate-1159  [003]  1273.022326: print:                tracing_mark_write: thread 13 iter 16, took lock 25 in 140726333419648 us
         migrate-1159  [003]  1273.026328: print:                tracing_mark_write: thread 13 iter 16, unlock lock 0
         migrate-1159  [003]  1273.026337: print:                tracing_mark_write: thread 13 iter 16 sleeping
         migrate-1159  [003]  1273.026346: sched_switch:         migrate:1159 [84] S ==> userspin:1135 [120]
        userspin-1135  [003]  1273.026359: sched_switch:         userspin:1135 [120] R ==> migrate:1146 [97]

13 microseconds, and so on...

         migrate-1146  [003]  1273.027170: sched_switch:         migrate:1146 [97] R ==> migrate:1149 [94]
         migrate-1149  [003]  1273.027189: print:                tracing_mark_write: thread 3 iter 14, took lock 1927 in 140726333419648 us
         migrate-1149  [003]  1273.027335: sched_switch:         migrate:1149 [94] R ==> migrate:1153 [90]
         migrate-1153  [003]  1273.027349: print:                tracing_mark_write: thread 7 iter 15, took lock 15130 in 140726333419648 us
         migrate-1153  [003]  1273.037352: print:                tracing_mark_write: thread 7 iter 15, unlock lock 6
         migrate-1153  [003]  1273.037362: print:                tracing_mark_write: thread 7 iter 15 sleeping
         migrate-1153  [003]  1273.037370: sched_switch:         migrate:1153 [90] S ==> userspin:1135 [120]
        userspin-1135  [003]  1273.037395: sched_switch:         userspin:1135 [120] R ==> migrate:1147 [96]
         migrate-1147  [003]  1273.037406: print:                tracing_mark_write: thread 1 iter 14, unlock lock 2
         migrate-1147  [003]  1273.037408: print:                tracing_mark_write: thread 1 iter 14 sleeping
         migrate-1147  [003]  1273.037414: sched_switch:         migrate:1147 [96] S ==> userspin:1135 [120]
        userspin-1135  [003]  1273.038428: sched_switch:         userspin:1135 [120] R ==> migrate:1147 [96]


It looks like it sneaks in when it's about to schedule a new RT task.

Is this expected?

-- Steve
Re: [PATCH v5 6/7] sched/deadline: Deferrable dl server
Posted by Steven Rostedt 2 years, 1 month ago

Just got this too (with the 20% we talked about on IRC).

 migrate-991     6.....   713.996237: print:                tracing_mark_write: thread 7 iter 3 sleeping

The above is 991 in userspace writing to trace_marker

 migrate-991     6d..2.   713.996251: bprint:               __schedule: Pick userspin:973:120

I added the above printk in the core pick_next_task().

 migrate-991     6d..2.   713.996254: sched_switch:         migrate:991 [90] S ==> userspin:973 [120]

We switch to userspin for just 16 microseconds, and notice, NEED_RESCHED is
not set.

userspin-973     6dN.2.   713.996270: bprint:               pick_task_rt: Pick RT migrate:988:93

The above printk is in pick_next_task_rt(), and NEED_RESCHED is now set!

userspin-973     6dN.2.   713.996271: bprint:               __schedule: Pick migrate:988:93
userspin-973     6d..2.   713.996272: sched_switch:         userspin:973 [120] R ==> migrate:988 [93]

I'll add your latest patch and see if that's different.

I'll also test this without any of the patches first.

-- Steve
Re: [PATCH v5 6/7] sched/deadline: Deferrable dl server
Posted by Steven Rostedt 2 years, 1 month ago
On Tue, 7 Nov 2023 14:32:16 -0500
Steven Rostedt <rostedt@goodmis.org> wrote:

> I'll also test this without any of the patches first.

And it still happens, and I now I know why :-)

Duh! The program is "migrate" which stress tests how RT tasks migrate
between CPUs when there's more RT tasks running that CPUs to run them on.

This is the push/pull logic in action!

 migrate-958     4d..2.   266.971936: sched_switch:         migrate:958 [89] S ==> userspin:939 [120]

Task 958 of priority 89 (lower is higher) goes to sleep. There's no RT
tasks on CPU 4 to run, so it runs userspin.

 migrate-953     2d.h2.   266.971938: sched_waking:         comm=migrate pid=957 prio=90 target_cpu=002
 migrate-953     2d..2.   266.971944: sched_switch:         migrate:953 [94] R ==> migrate:957 [90]

On CPU 2, task 957 (prio 90) preempts 953 (prio 94).

userspin-939     4d..2.   266.971953: sched_switch:         userspin:939 [120] R ==> migrate:953 [94]

Now 953 migrates over to CPU 4 as it's currently the CPU running the lowest
priority task.

There's other cases where another CPU was simply overloaded, and when the
RT task on the CPU with userspin went to sleep, it triggered an IPI to the
overloaded CPU to tell it to push it over here.

All is good. Nothing to see here ;-)

-- Steve
Re: [PATCH v5 6/7] sched/deadline: Deferrable dl server
Posted by Steven Rostedt 2 years, 1 month ago
On Tue, 7 Nov 2023 12:35:40 -0500
Steven Rostedt <rostedt@goodmis.org> wrote:

> I ran my migrate[1] program which I use to test RT migration, and it kicks

> 
> And ran my userspin[2] program. And recorded it with:

I forgot to add the [1] and [2]

[1] https://rostedt.org/code/migrate.c
[2] https://rostedt.org/code/userspin.c

-- Steve
Re: [PATCH v5 6/7] sched/deadline: Deferrable dl server
Posted by Daniel Bristot de Oliveira 2 years, 1 month ago
On 11/6/23 22:37, Joel Fernandes wrote:
> On Mon, Nov 6, 2023 at 4:32 PM Joel Fernandes <joel@joelfernandes.org> wrote:
>>
>> On Mon, Nov 6, 2023 at 2:32 PM Joel Fernandes <joel@joelfernandes.org> wrote:
>>>
>>> Hi Daniel,
>>>
>>> On Sat, Nov 4, 2023 at 6:59 AM Daniel Bristot de Oliveira
>>> <bristot@kernel.org> wrote:
>>>>
>>>> Among the motivations for the DL servers is the real-time throttling
>>>> mechanism. This mechanism works by throttling the rt_rq after
>>>> running for a long period without leaving space for fair tasks.
>>>>
>>>> The base dl server avoids this problem by boosting fair tasks instead
>>>> of throttling the rt_rq. The point is that it boosts without waiting
>>>> for potential starvation, causing some non-intuitive cases.
>>>>
>>>> For example, an IRQ dispatches two tasks on an idle system, a fair
>>>> and an RT. The DL server will be activated, running the fair task
>>>> before the RT one. This problem can be avoided by deferring the
>>>> dl server activation.
>>>>
>>>> By setting the zerolax option, the dl_server will dispatch an
>>>> SCHED_DEADLINE reservation with replenished runtime, but throttled.
>>>>
>>>> The dl_timer will be set for (period - runtime) ns from start time.
>>>> Thus boosting the fair rq on its 0-laxity time with respect to
>>>> rt_rq.
>>>>
>>>> If the fair scheduler has the opportunity to run while waiting
>>>> for zerolax time, the dl server runtime will be consumed. If
>>>> the runtime is completely consumed before the zerolax time, the
>>>> server will be replenished while still in a throttled state. Then,
>>>> the dl_timer will be reset to the new zerolax time
>>>>
>>>> If the fair server reaches the zerolax time without consuming
>>>> its runtime, the server will be boosted, following CBS rules
>>>> (thus without breaking SCHED_DEADLINE).
>>>>
>>>> Signed-off-by: Daniel Bristot de Oliveira <bristot@kernel.org>
>>>> ---
>>>>  include/linux/sched.h   |   2 +
>>>>  kernel/sched/deadline.c | 100 +++++++++++++++++++++++++++++++++++++++-
>>>>  kernel/sched/fair.c     |   3 ++
>>>>  3 files changed, 103 insertions(+), 2 deletions(-)
>>>>
>>>> diff --git a/include/linux/sched.h b/include/linux/sched.h
>>>> index 5ac1f252e136..56e53e6fd5a0 100644
>>>> --- a/include/linux/sched.h
>>>> +++ b/include/linux/sched.h
>>>> @@ -660,6 +660,8 @@ struct sched_dl_entity {
>>>>         unsigned int                    dl_non_contending : 1;
>>>>         unsigned int                    dl_overrun        : 1;
>>>>         unsigned int                    dl_server         : 1;
>>>> +       unsigned int                    dl_zerolax        : 1;
>>>> +       unsigned int                    dl_zerolax_armed  : 1;
>>>>
>>>>         /*
>>>>          * Bandwidth enforcement timer. Each -deadline task has its
>>>> diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
>>>> index 1d7b96ca9011..69ee1fbd60e4 100644
>>>> --- a/kernel/sched/deadline.c
>>>> +++ b/kernel/sched/deadline.c
>>>> @@ -772,6 +772,14 @@ static inline void replenish_dl_new_period(struct sched_dl_entity *dl_se,
>>>>         /* for non-boosted task, pi_of(dl_se) == dl_se */
>>>>         dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline;
>>>>         dl_se->runtime = pi_of(dl_se)->dl_runtime;
>>>> +
>>>> +       /*
>>>> +        * If it is a zerolax reservation, throttle it.
>>>> +        */
>>>> +       if (dl_se->dl_zerolax) {
>>>> +               dl_se->dl_throttled = 1;
>>>> +               dl_se->dl_zerolax_armed = 1;
>>>> +       }
>>>>  }
>>>>
>>>>  /*
>>>> @@ -828,6 +836,7 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se)
>>>>   * could happen are, typically, a entity voluntarily trying to overcome its
>>>>   * runtime, or it just underestimated it during sched_setattr().
>>>>   */
>>>> +static int start_dl_timer(struct sched_dl_entity *dl_se);
>>>>  static void replenish_dl_entity(struct sched_dl_entity *dl_se)
>>>>  {
>>>>         struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
>>>> @@ -874,6 +883,28 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se)
>>>>                 dl_se->dl_yielded = 0;
>>>>         if (dl_se->dl_throttled)
>>>>                 dl_se->dl_throttled = 0;
>>>> +
>>>> +       /*
>>>> +        * If this is the replenishment of a zerolax reservation,
>>>> +        * clear the flag and return.
>>>> +        */
>>>> +       if (dl_se->dl_zerolax_armed) {
>>>> +               dl_se->dl_zerolax_armed = 0;
>>>> +               return;
>>>> +       }
>>>> +
>>>> +       /*
>>>> +        * A this point, if the zerolax server is not armed, and the deadline
>>>> +        * is in the future, throttle the server and arm the zerolax timer.
>>>> +        */
>>>> +       if (dl_se->dl_zerolax &&
>>>> +           dl_time_before(dl_se->deadline - dl_se->runtime, rq_clock(rq))) {
>>>> +               if (!is_dl_boosted(dl_se)) {
>>>> +                       dl_se->dl_zerolax_armed = 1;
>>>> +                       dl_se->dl_throttled = 1;
>>>> +                       start_dl_timer(dl_se);
>>>> +               }
>>>> +       }
>>>>  }
>>>>
>>>>  /*
>>>> @@ -1024,6 +1055,13 @@ static void update_dl_entity(struct sched_dl_entity *dl_se)
>>>>                 }
>>>>
>>>>                 replenish_dl_new_period(dl_se, rq);
>>>> +       } else if (dl_server(dl_se) && dl_se->dl_zerolax) {
>>>> +               /*
>>>> +                * The server can still use its previous deadline, so throttle
>>>> +                * and arm the zero-laxity timer.
>>>> +                */
>>>> +               dl_se->dl_zerolax_armed = 1;
>>>> +               dl_se->dl_throttled = 1;
>>>>         }
>>>>  }
>>>>
>>>> @@ -1056,8 +1094,20 @@ static int start_dl_timer(struct sched_dl_entity *dl_se)
>>>>          * We want the timer to fire at the deadline, but considering
>>>>          * that it is actually coming from rq->clock and not from
>>>>          * hrtimer's time base reading.
>>>> +        *
>>>> +        * The zerolax reservation will have its timer set to the
>>>> +        * deadline - runtime. At that point, the CBS rule will decide
>>>> +        * if the current deadline can be used, or if a replenishment
>>>> +        * is required to avoid add too much pressure on the system
>>>> +        * (current u > U).
>>>>          */
>>>> -       act = ns_to_ktime(dl_next_period(dl_se));
>>>> +       if (dl_se->dl_zerolax_armed) {
>>>> +               WARN_ON_ONCE(!dl_se->dl_throttled);
>>>> +               act = ns_to_ktime(dl_se->deadline - dl_se->runtime);
>>>
>>> Just a question, here if dl_se->deadline - dl_se->runtime is large,
>>> then does that mean that server activation will be much more into the
>>> future? So say I want to give CFS 30%, then it will take 70% of the
>>> period before CFS preempts RT thus "starving" CFS for this duration. I
>>> think that's Ok for smaller periods and runtimes, though.
>>>
>>> I think it does reserve the amount of required CFS bandwidth so it is
>>> probably OK, though it is perhaps letting RT run more initially (say
>>> if CFS tasks are not CPU bound and occasionally wake up, they will
>>> always be hit by the 70% latency AFAICS which may be large for large
>>> periods and small runtimes).
>>>
>>
>> One more consideration I guess is, because the server is throttled
>> till 0-laxity time, it is possible that if CFS sleeps even a bit
>> (after the DL-server is unthrottled), then it will be pushed out to a
>> full current deadline + period due to CBS. In such a situation,  if
>> CFS-server is the only DL task running, it might starve RT for a bit
>> more time.
>>
>> Example, say CFS runtime is 0.3s and period is 1s. At 0.7s, 0-laxity
>> timer fires. CFS runs for 0.29s, then sleeps for 0.005s and wakes up
>> at 0.295s (its remaining runtime is 0.01s at this point which is < the
>> "time till deadline" of 0.005s). Now the runtime of the CFS-server
>> will be replenished to the full 3s (due to CBS) and the deadline
>> pushed out. The end result is the total runtime that the CFS-server
>> actually gets is 0.0595s (though yes it did sleep for 5ms in between,
>> still that's tiny -- say if it briefly blocked on a kernel mutex).
> 
> Blah, I got lost in decimal points. Here's the example again:
> 
> Say CFS-server runtime is 0.3s and period is 1s.
> 
> At 0.7s, 0-laxity timer fires. CFS runs for 0.29s, then sleeps for
> 0.005s and wakes up at 0.295s (its remaining runtime is 0.01s at this
> point which is < the "time till deadline" of 0.005s)
> 
> Now the runtime of the CFS-server will be replenished to the full 0.3s
> (due to CBS) and the deadline
> pushed out.
> 
> The end result is, the total runtime that the CFS-server actually gets
> is 0.595s (though yes it did sleep for 5ms in between, still that's
> tiny -- say if it briefly blocked on a kernel mutex). That's almost
> double the allocated runtime.

I think I got what you mean, and I think I took for granted that we were
doing overload control on the replenishment, but it seems that we are not..

I just got back from a doct appt, I will do a proper reply later today.

Thanks Joel!
-- Daniel


> This is just theoretical and I have yet to see if it is actually an
> issue in practice.
> 
> Thanks.

Re: [PATCH v5 6/7] sched/deadline: Deferrable dl server
Posted by Joel Fernandes 2 years, 1 month ago
On Tue, Nov 07, 2023 at 12:58:48PM +0100, Daniel Bristot de Oliveira wrote:
[...]
> >> One more consideration I guess is, because the server is throttled
> >> till 0-laxity time, it is possible that if CFS sleeps even a bit
> >> (after the DL-server is unthrottled), then it will be pushed out to a
> >> full current deadline + period due to CBS. In such a situation,  if
> >> CFS-server is the only DL task running, it might starve RT for a bit
> >> more time.
> >>
> >> Example, say CFS runtime is 0.3s and period is 1s. At 0.7s, 0-laxity
> >> timer fires. CFS runs for 0.29s, then sleeps for 0.005s and wakes up
> >> at 0.295s (its remaining runtime is 0.01s at this point which is < the
> >> "time till deadline" of 0.005s). Now the runtime of the CFS-server
> >> will be replenished to the full 3s (due to CBS) and the deadline
> >> pushed out. The end result is the total runtime that the CFS-server
> >> actually gets is 0.0595s (though yes it did sleep for 5ms in between,
> >> still that's tiny -- say if it briefly blocked on a kernel mutex).
> > 
> > Blah, I got lost in decimal points. Here's the example again:
> > 
> > Say CFS-server runtime is 0.3s and period is 1s.
> > 
> > At 0.7s, 0-laxity timer fires. CFS runs for 0.29s, then sleeps for
> > 0.005s and wakes up at 0.295s (its remaining runtime is 0.01s at this
> > point which is < the "time till deadline" of 0.005s)
> > 
> > Now the runtime of the CFS-server will be replenished to the full 0.3s
> > (due to CBS) and the deadline
> > pushed out.
> > 
> > The end result is, the total runtime that the CFS-server actually gets
> > is 0.595s (though yes it did sleep for 5ms in between, still that's
> > tiny -- say if it briefly blocked on a kernel mutex). That's almost
> > double the allocated runtime.
> 
> I think I got what you mean, and I think I took for granted that we were
> doing overload control on the replenishment, but it seems that we are not..
> 
> I just got back from a doct appt, I will do a proper reply later today.

Ah ok! Thanks Daniel! And hope the appointment went well.

 - Joel
Re: [PATCH v5 6/7] sched/deadline: Deferrable dl server
Posted by Peter Zijlstra 2 years, 1 month ago
On Sat, Nov 04, 2023 at 11:59:23AM +0100, Daniel Bristot de Oliveira wrote:

> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index b15f7f376a67..399237cd9f59 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -1201,6 +1201,8 @@ static void update_curr(struct cfs_rq *cfs_rq)
>  		account_group_exec_runtime(curtask, delta_exec);
>  		if (curtask->server)
>  			dl_server_update(curtask->server, delta_exec);
> +		else
> +			dl_server_update(&rq_of(cfs_rq)->fair_server, delta_exec);
>  	}
>  
>  	account_cfs_rq_runtime(cfs_rq, delta_exec);

I've rewritten that something like so..

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1182,12 +1182,13 @@ s64 update_curr_common(struct rq *rq)
 static void update_curr(struct cfs_rq *cfs_rq)
 {
 	struct sched_entity *curr = cfs_rq->curr;
+	struct rq *rq = rq_of(cfs_rq);
 	s64 delta_exec;
 
 	if (unlikely(!curr))
 		return;
 
-	delta_exec = update_curr_se(rq_of(cfs_rq), curr);
+	delta_exec = update_curr_se(rq, curr);
 	if (unlikely(delta_exec <= 0))
 		return;
 
@@ -1195,8 +1196,17 @@ static void update_curr(struct cfs_rq *c
 	update_deadline(cfs_rq, curr);
 	update_min_vruntime(cfs_rq);
 
-	if (entity_is_task(curr))
-		update_curr_task(task_of(curr), delta_exec);
+	if (entity_is_task(curr)) {
+		struct task_struct *p = task_of(curr);
+		update_curr_task(p, delta_exec);
+		/*
+		 * Any fair task that runs outside of fair_server should
+		 * account against fair_server such that it can account for
+		 * this time and possible avoid running this period.
+		 */
+		if (p->dl_server != &rq->fair_server)
+			dl_server_update(&rq->fair_server, delta_exec);
+	}
 
 	account_cfs_rq_runtime(cfs_rq, delta_exec);
 }
Re: [PATCH v5 6/7] sched/deadline: Deferrable dl server
Posted by Daniel Bristot de Oliveira 2 years, 1 month ago
On 11/6/23 15:55, Peter Zijlstra wrote:
> On Sat, Nov 04, 2023 at 11:59:23AM +0100, Daniel Bristot de Oliveira wrote:
> 
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index b15f7f376a67..399237cd9f59 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -1201,6 +1201,8 @@ static void update_curr(struct cfs_rq *cfs_rq)
>>  		account_group_exec_runtime(curtask, delta_exec);
>>  		if (curtask->server)
>>  			dl_server_update(curtask->server, delta_exec);
>> +		else
>> +			dl_server_update(&rq_of(cfs_rq)->fair_server, delta_exec);
>>  	}
>>  
>>  	account_cfs_rq_runtime(cfs_rq, delta_exec);
>  
> @@ -1195,8 +1196,17 @@ static void update_curr(struct cfs_rq *c
>  	update_deadline(cfs_rq, curr);
>  	update_min_vruntime(cfs_rq);
>  
> -	if (entity_is_task(curr))
> -		update_curr_task(task_of(curr), delta_exec);
> +	if (entity_is_task(curr)) {
> +		struct task_struct *p = task_of(curr);
> +		update_curr_task(p, delta_exec);
> +		/*
> +		 * Any fair task that runs outside of fair_server should
> +		 * account against fair_server such that it can account for
> +		 * this time and possible avoid running this period.
> +		 */
> +		if (p->dl_server != &rq->fair_server)
> +			dl_server_update(&rq->fair_server, delta_exec);
aren't we missing:
		   else
			dl_server_update(&rq_of(cfs_rq)->fair_server, delta_exec);

or am I missing something? :-)

> +	}
>  
>  	account_cfs_rq_runtime(cfs_rq, delta_exec);
>  }