Part of the reason to have shorter slices is to improve
responsiveness. Allow shorter slices to preempt longer slices on
wakeup.
Task | Runtime ms | Switches | Avg delay ms | Max delay ms | Sum delay ms |
100ms massive_intr 500us cyclictest NO_PREEMPT_SHORT
1 massive_intr:(5) | 846018.956 ms | 779188 | avg: 0.273 ms | max: 58.337 ms | sum:212545.245 ms |
2 massive_intr:(5) | 853450.693 ms | 792269 | avg: 0.275 ms | max: 71.193 ms | sum:218263.588 ms |
3 massive_intr:(5) | 843888.920 ms | 771456 | avg: 0.277 ms | max: 92.405 ms | sum:213353.221 ms |
1 chromium-browse:(8) | 53015.889 ms | 131766 | avg: 0.463 ms | max: 36.341 ms | sum:60959.230 ms |
2 chromium-browse:(8) | 53864.088 ms | 136962 | avg: 0.480 ms | max: 27.091 ms | sum:65687.681 ms |
3 chromium-browse:(9) | 53637.904 ms | 132637 | avg: 0.481 ms | max: 24.756 ms | sum:63781.673 ms |
1 cyclictest:(5) | 12615.604 ms | 639689 | avg: 0.471 ms | max: 32.272 ms | sum:301351.094 ms |
2 cyclictest:(5) | 12511.583 ms | 642578 | avg: 0.448 ms | max: 44.243 ms | sum:287632.830 ms |
3 cyclictest:(5) | 12545.867 ms | 635953 | avg: 0.475 ms | max: 25.530 ms | sum:302374.658 ms |
100ms massive_intr 500us cyclictest PREEMPT_SHORT
1 massive_intr:(5) | 839843.919 ms | 837384 | avg: 0.264 ms | max: 74.366 ms | sum:221476.885 ms |
2 massive_intr:(5) | 852449.913 ms | 845086 | avg: 0.252 ms | max: 68.162 ms | sum:212595.968 ms |
3 massive_intr:(5) | 839180.725 ms | 836883 | avg: 0.266 ms | max: 69.742 ms | sum:222812.038 ms |
1 chromium-browse:(11) | 54591.481 ms | 138388 | avg: 0.458 ms | max: 35.427 ms | sum:63401.508 ms |
2 chromium-browse:(8) | 52034.541 ms | 132276 | avg: 0.436 ms | max: 31.826 ms | sum:57732.958 ms |
3 chromium-browse:(8) | 55231.771 ms | 141892 | avg: 0.469 ms | max: 27.607 ms | sum:66538.697 ms |
1 cyclictest:(5) | 13156.391 ms | 667412 | avg: 0.373 ms | max: 38.247 ms | sum:249174.502 ms |
2 cyclictest:(5) | 12688.939 ms | 665144 | avg: 0.374 ms | max: 33.548 ms | sum:248509.392 ms |
3 cyclictest:(5) | 13475.623 ms | 669110 | avg: 0.370 ms | max: 37.819 ms | sum:247673.390 ms |
As per the numbers, this makes cyclictest's (short slice)
max-delay more consistent, and that consistency drops the sum-delay. The
trade-off is that massive_intr (long slice) gets more context
switches and a slight increase in sum-delay.
[mike: numbers]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Mike Galbraith <umgwanakikbuti@gmail.com>
---
kernel/sched/fair.c | 64 ++++++++++++++++++++++++++++++++++++++++++------
kernel/sched/features.h | 5 +++
2 files changed, 61 insertions(+), 8 deletions(-)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -973,10 +973,10 @@ static void clear_buddies(struct cfs_rq
* XXX: strictly: vd_i += N*r_i/w_i such that: vd_i > ve_i
* this is probably good enough.
*/
-static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static bool update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
if ((s64)(se->vruntime - se->deadline) < 0)
- return;
+ return false;
/*
* For EEVDF the virtual time slope is determined by w_i (iow.
@@ -993,10 +993,7 @@ static void update_deadline(struct cfs_r
/*
* The task has consumed its request, reschedule.
*/
- if (cfs_rq->nr_running > 1) {
- resched_curr(rq_of(cfs_rq));
- clear_buddies(cfs_rq, se);
- }
+ return true;
}
#include "pelt.h"
@@ -1134,6 +1131,38 @@ static inline void update_curr_task(stru
dl_server_update(p->dl_server, delta_exec);
}
+static inline bool did_preempt_short(struct cfs_rq *cfs_rq, struct sched_entity *curr)
+{
+ if (!sched_feat(PREEMPT_SHORT))
+ return false;
+
+ if (curr->vlag == curr->deadline)
+ return false;
+
+ return !entity_eligible(cfs_rq, curr);
+}
+
+static inline bool do_preempt_short(struct cfs_rq *cfs_rq,
+ struct sched_entity *pse, struct sched_entity *se)
+{
+ if (!sched_feat(PREEMPT_SHORT))
+ return false;
+
+ if (pse->slice >= se->slice)
+ return false;
+
+ if (!entity_eligible(cfs_rq, pse))
+ return false;
+
+ if (entity_before(pse, se))
+ return true;
+
+ if (!entity_eligible(cfs_rq, se))
+ return true;
+
+ return false;
+}
+
/*
* Used by other classes to account runtime.
*/
@@ -1157,6 +1186,7 @@ static void update_curr(struct cfs_rq *c
struct sched_entity *curr = cfs_rq->curr;
struct rq *rq = rq_of(cfs_rq);
s64 delta_exec;
+ bool resched;
if (unlikely(!curr))
return;
@@ -1166,7 +1196,7 @@ static void update_curr(struct cfs_rq *c
return;
curr->vruntime += calc_delta_fair(delta_exec, curr);
- update_deadline(cfs_rq, curr);
+ resched = update_deadline(cfs_rq, curr);
update_min_vruntime(cfs_rq);
if (entity_is_task(curr)) {
@@ -1184,6 +1214,14 @@ static void update_curr(struct cfs_rq *c
}
account_cfs_rq_runtime(cfs_rq, delta_exec);
+
+ if (rq->nr_running == 1)
+ return;
+
+ if (resched || did_preempt_short(cfs_rq, curr)) {
+ resched_curr(rq);
+ clear_buddies(cfs_rq, curr);
+ }
}
static void update_curr_fair(struct rq *rq)
@@ -8611,7 +8649,17 @@ static void check_preempt_wakeup_fair(st
cfs_rq = cfs_rq_of(se);
update_curr(cfs_rq);
/*
- * XXX pick_eevdf(cfs_rq) != se ?
+ * If @p has a shorter slice than current and @p is eligible, override
+ * current's slice protection in order to allow preemption.
+ *
+ * Note that even if @p does not turn out to be the most eligible
+ * task at this moment, current's slice protection will be lost.
+ */
+ if (do_preempt_short(cfs_rq, pse, se) && se->vlag == se->deadline)
+ se->vlag = se->deadline + 1;
+
+ /*
+ * If @p has become the most eligible task, force preemption.
*/
if (pick_eevdf(cfs_rq) == pse)
goto preempt;
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -18,6 +18,11 @@ SCHED_FEAT(PLACE_REL_DEADLINE, true)
* 0-lag point or until is has exhausted it's slice.
*/
SCHED_FEAT(RUN_TO_PARITY, true)
+/*
+ * Allow wakeup of tasks with a shorter slice to cancel RESPECT_SLICE for
+ * current.
+ */
+SCHED_FEAT(PREEMPT_SHORT, true)
/*
* Prefer to schedule the task we woke last (assuming it failed
Hi Peter,

On 2024-07-27 at 12:27:53 +0200, Peter Zijlstra wrote:
> Part of the reason to have shorter slices is to improve
> responsiveness. Allow shorter slices to preempt longer slices on
> wakeup.
>
> [...]
>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Tested-by: Mike Galbraith <umgwanakikbuti@gmail.com>

Besides this short preemption, it seems that an important patch is missing from
this patch set; it was originally from Prateek, and you refined it to fix the
current task's false-negative eligibility:
https://lore.kernel.org/lkml/20240424150721.GQ30852@noisy.programming.kicks-ass.net/

RESPECT_SLICE was introduced to honor the current task's slice during wakeup
preemption. Without it, we got reports that over-preemption and performance
regressions were observed when running SPECjbb on servers.
echo RESPECT_SLICE > /sys/kernel/debug/sched/features
echo RUN_TO_PARITY > /sys/kernel/debug/sched/features

@task_duration_usecs_before_preempted:
[2, 4)              8732 |@@@                                                 |
[4, 8)            109400 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@          |
[8, 16)            95815 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@               |
[16, 32)          110647 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@         |
[32, 64)          131298 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ |
[64, 128)         132566 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
[128, 256)         82095 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@                    |
[256, 512)         33771 |@@@@@@@@@@@@@                                        |
[512, 1K)          24180 |@@@@@@@@@                                            |
[1K, 2K)           31056 |@@@@@@@@@@@@                                         |
[2K, 4K)          117533 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@      |
[4K, 8K)            4472 |@                                                    |
[8K, 16K)           1149 |                                                     |
[16K, 32K)           289 |                                                     |
[32K, 64K)           110 |                                                     |
[64K, 128K)           20 |                                                     |
[128K, 256K)           4 |                                                     |

echo NO_RUN_TO_PARITY > /sys/kernel/debug/sched/features

@task_duration_usecs_before_preempted:
[4, 8)                 1 |                                                     |
[8, 16)               12 |                                                     |
[16, 32)              20 |                                                     |
[32, 64)              38 |                                                     |
[64, 128)             64 |                                                     |
[128, 256)            98 |                                                     |
[256, 512)           248 |                                                     |
[512, 1K)           1196 |                                                     |
[1K, 2K)            3456 |                                                     |
[2K, 4K)          417269 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
[4K, 8K)           22893 |@@                                                   |
[8K, 16K)           7818 |                                                     |
[16K, 32K)          1471 |                                                     |
[32K, 64K)           373 |                                                     |
[64K, 128K)           96 |                                                     |
[128K, 256K)           3 |                                                     |

We can see that without the fix, the task is preempted before it can reach its
time-slice budget (we enlarged its slice).

May I know if we can put that patch into this series, please?

thanks,
Chenyu
On Thu, Aug 08, 2024 at 06:15:50PM +0800, Chen Yu wrote:
> Besides this short preemption, it seems that an important patch is missing from
> this patch set; it was originally from Prateek, and you refined it to fix the
> current task's false-negative eligibility:
> https://lore.kernel.org/lkml/20240424150721.GQ30852@noisy.programming.kicks-ass.net/
>
> RESPECT_SLICE was introduced to honor the current task's slice during wakeup
> preemption. Without it, we got reports that over-preemption and performance
> regressions were observed when running SPECjbb on servers.

So I *think* that running as SCHED_BATCH gets you exactly that
behaviour, no?
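In case it helps with reproducing that suggestion: a minimal, hypothetical sketch of
what "running as SCHED_BATCH" means in practice (the pid and surrounding setup are
placeholders, not something given in the thread):

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <sys/types.h>

/* Move a task to SCHED_BATCH, which largely opts it out of wakeup preemption. */
static int make_batch(pid_t pid)
{
	struct sched_param sp = { .sched_priority = 0 };	/* must be 0 for SCHED_BATCH */

	if (sched_setscheduler(pid, SCHED_BATCH, &sp)) {
		perror("sched_setscheduler");
		return -1;
	}
	return 0;
}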
On 2024-08-08 at 12:22:07 +0200, Peter Zijlstra wrote:
> On Thu, Aug 08, 2024 at 06:15:50PM +0800, Chen Yu wrote:
> > RESPECT_SLICE was introduced to honor the current task's slice during wakeup
> > preemption. Without it, we got reports that over-preemption and performance
> > regressions were observed when running SPECjbb on servers.
>
> So I *think* that running as SCHED_BATCH gets you exactly that
> behaviour, no?

SCHED_BATCH should work, as it avoids wakeup preemption as much as possible.
Except that RESPECT_SLICE considers the cgroup hierarchy when checking whether
the current sched_entity has used up its slice, which seems to be less
aggressive.

thanks,
Chenyu
On Thu, Aug 08, 2024 at 08:31:53PM +0800, Chen Yu wrote:
> On 2024-08-08 at 12:22:07 +0200, Peter Zijlstra wrote:
> > So I *think* that running as SCHED_BATCH gets you exactly that
> > behaviour, no?
>
> SCHED_BATCH should work, as it avoids wakeup preemption as much as possible.
> Except that RESPECT_SLICE considers the cgroup hierarchy when checking whether
> the current sched_entity has used up its slice, which seems to be less
> aggressive.

Note that update_deadline() will trigger a resched at the end of a slice
regardless -- this is driven from update_curr(), which is also invoked from any
preemption.
> On Jul 27, 2024, at 18:27, Peter Zijlstra <peterz@infradead.org> wrote:
>
> Part of the reason to have shorter slices is to improve
> responsiveness. Allow shorter slices to preempt longer slices on
> wakeup.
>
> Task | Runtime ms | Switches | Avg delay ms | Max delay ms | Sum delay ms |
>
> 100ms massive_intr 500us cyclictest NO_PREEMPT_SHORT
>
> 1 massive_intr:(5) | 846018.956 ms | 779188 | avg: 0.273 ms | max: 58.337 ms | sum:212545.245 ms |
> 2 massive_intr:(5) | 853450.693 ms | 792269 | avg: 0.275 ms | max: 71.193 ms | sum:218263.588 ms |
> 3 massive_intr:(5) | 843888.920 ms | 771456 | avg: 0.277 ms | max: 92.405 ms | sum:213353.221 ms |
> 1 chromium-browse:(8) | 53015.889 ms | 131766 | avg: 0.463 ms | max: 36.341 ms | sum:60959.230 ms |
> 2 chromium-browse:(8) | 53864.088 ms | 136962 | avg: 0.480 ms | max: 27.091 ms | sum:65687.681 ms |
> 3 chromium-browse:(9) | 53637.904 ms | 132637 | avg: 0.481 ms | max: 24.756 ms | sum:63781.673 ms |
> 1 cyclictest:(5) | 12615.604 ms | 639689 | avg: 0.471 ms | max: 32.272 ms | sum:301351.094 ms |
> 2 cyclictest:(5) | 12511.583 ms | 642578 | avg: 0.448 ms | max: 44.243 ms | sum:287632.830 ms |
> 3 cyclictest:(5) | 12545.867 ms | 635953 | avg: 0.475 ms | max: 25.530 ms | sum:302374.658 ms |
>
> 100ms massive_intr 500us cyclictest PREEMPT_SHORT
>
> 1 massive_intr:(5) | 839843.919 ms | 837384 | avg: 0.264 ms | max: 74.366 ms | sum:221476.885 ms |
> 2 massive_intr:(5) | 852449.913 ms | 845086 | avg: 0.252 ms | max: 68.162 ms | sum:212595.968 ms |
> 3 massive_intr:(5) | 839180.725 ms | 836883 | avg: 0.266 ms | max: 69.742 ms | sum:222812.038 ms |
> 1 chromium-browse:(11) | 54591.481 ms | 138388 | avg: 0.458 ms | max: 35.427 ms | sum:63401.508 ms |
> 2 chromium-browse:(8) | 52034.541 ms | 132276 | avg: 0.436 ms | max: 31.826 ms | sum:57732.958 ms |
> 3 chromium-browse:(8) | 55231.771 ms | 141892 | avg: 0.469 ms | max: 27.607 ms | sum:66538.697 ms |
> 1 cyclictest:(5) | 13156.391 ms | 667412 | avg: 0.373 ms | max: 38.247 ms | sum:249174.502 ms |
> 2 cyclictest:(5) | 12688.939 ms | 665144 | avg: 0.374 ms | max: 33.548 ms | sum:248509.392 ms |
> 3 cyclictest:(5) | 13475.623 ms | 669110 | avg: 0.370 ms | max: 37.819 ms | sum:247673.390 ms |
>
> As per the numbers, this makes cyclictest's (short slice)
> max-delay more consistent, and that consistency drops the sum-delay. The
> trade-off is that massive_intr (long slice) gets more context
> switches and a slight increase in sum-delay.
>
> [mike: numbers]
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Tested-by: Mike Galbraith <umgwanakikbuti@gmail.com>
> ---
> kernel/sched/fair.c | 64 ++++++++++++++++++++++++++++++++++++++++++------
> kernel/sched/features.h | 5 +++
> 2 files changed, 61 insertions(+), 8 deletions(-)
>
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -973,10 +973,10 @@ static void clear_buddies(struct cfs_rq
> * XXX: strictly: vd_i += N*r_i/w_i such that: vd_i > ve_i
> * this is probably good enough.
> */
> -static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
> +static bool update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
> {
> if ((s64)(se->vruntime - se->deadline) < 0)
> - return;
> + return false;
>
> /*
> * For EEVDF the virtual time slope is determined by w_i (iow.
> @@ -993,10 +993,7 @@ static void update_deadline(struct cfs_r
> /*
> * The task has consumed its request, reschedule.
> */
> - if (cfs_rq->nr_running > 1) {
> - resched_curr(rq_of(cfs_rq));
> - clear_buddies(cfs_rq, se);
> - }
> + return true;
> }
>
> #include "pelt.h"
> @@ -1134,6 +1131,38 @@ static inline void update_curr_task(stru
> dl_server_update(p->dl_server, delta_exec);
> }
>
> +static inline bool did_preempt_short(struct cfs_rq *cfs_rq, struct sched_entity *curr)
> +{
> + if (!sched_feat(PREEMPT_SHORT))
> + return false;
> +
> + if (curr->vlag == curr->deadline)
> + return false;
> +
> + return !entity_eligible(cfs_rq, curr);
> +}
Hi Peter,

Can this be made more aggressive here? Something like the following: in the
PREEMPT_SHORT + NO_RUN_TO_PARITY combination, it could break the first deadline
of the current task. This can achieve better latency in certain embedded
scenarios, such as high-priority periodic tasks.
+static inline bool did_preempt_short(struct cfs_rq *cfs_rq, struct sched_entity *curr)
+{
+ if (!sched_feat(PREEMPT_SHORT))
+ return false;
+
+ if (sched_feat(RUN_TO_PARITY) && curr->vlag == curr->deadline)
+ return false;
+
+ return !entity_eligible(cfs_rq, curr);
+}
Additionally, if possible, could you please include my name in this patch? I spent over a
month finding this solution and conducting the tests, and I hope to leave some trace of
my efforts during that time. This is also one of the reasons why I love Linux and am eager
to contribute to open source. I would be extremely grateful.
https://lore.kernel.org/lkml/20240613131437.9555-1-spring.cxz@gmail.com/
thanks
Chunxin
> +
> +static inline bool do_preempt_short(struct cfs_rq *cfs_rq,
> + struct sched_entity *pse, struct sched_entity *se)
> +{
> + if (!sched_feat(PREEMPT_SHORT))
> + return false;
> +
> + if (pse->slice >= se->slice)
> + return false;
> +
> + if (!entity_eligible(cfs_rq, pse))
> + return false;
> +
> + if (entity_before(pse, se))
> + return true;
> +
> + if (!entity_eligible(cfs_rq, se))
> + return true;
> +
> + return false;
> +}
> +
> /*
> * Used by other classes to account runtime.
> */
> @@ -1157,6 +1186,7 @@ static void update_curr(struct cfs_rq *c
> struct sched_entity *curr = cfs_rq->curr;
> struct rq *rq = rq_of(cfs_rq);
> s64 delta_exec;
> + bool resched;
>
> if (unlikely(!curr))
> return;
> @@ -1166,7 +1196,7 @@ static void update_curr(struct cfs_rq *c
> return;
>
> curr->vruntime += calc_delta_fair(delta_exec, curr);
> - update_deadline(cfs_rq, curr);
> + resched = update_deadline(cfs_rq, curr);
> update_min_vruntime(cfs_rq);
>
> if (entity_is_task(curr)) {
> @@ -1184,6 +1214,14 @@ static void update_curr(struct cfs_rq *c
> }
>
> account_cfs_rq_runtime(cfs_rq, delta_exec);
> +
> + if (rq->nr_running == 1)
> + return;
> +
> + if (resched || did_preempt_short(cfs_rq, curr)) {
> + resched_curr(rq);
> + clear_buddies(cfs_rq, curr);
> + }
> }
>
> static void update_curr_fair(struct rq *rq)
> @@ -8611,7 +8649,17 @@ static void check_preempt_wakeup_fair(st
> cfs_rq = cfs_rq_of(se);
> update_curr(cfs_rq);
> /*
> - * XXX pick_eevdf(cfs_rq) != se ?
> + * If @p has a shorter slice than current and @p is eligible, override
> + * current's slice protection in order to allow preemption.
> + *
> + * Note that even if @p does not turn out to be the most eligible
> + * task at this moment, current's slice protection will be lost.
> + */
> + if (do_preempt_short(cfs_rq, pse, se) && se->vlag == se->deadline)
> + se->vlag = se->deadline + 1;
> +
> + /*
> + * If @p has become the most eligible task, force preemption.
> */
> if (pick_eevdf(cfs_rq) == pse)
> goto preempt;
> --- a/kernel/sched/features.h
> +++ b/kernel/sched/features.h
> @@ -18,6 +18,11 @@ SCHED_FEAT(PLACE_REL_DEADLINE, true)
> * 0-lag point or until is has exhausted it's slice.
> */
> SCHED_FEAT(RUN_TO_PARITY, true)
> +/*
> + * Allow wakeup of tasks with a shorter slice to cancel RESPECT_SLICE for
> + * current.
> + */
> +SCHED_FEAT(PREEMPT_SHORT, true)
>
> /*
> * Prefer to schedule the task we woke last (assuming it failed
>
>
>
On Mon, Aug 05, 2024 at 08:24:24PM +0800, Chunxin Zang wrote:
> > On Jul 27, 2024, at 18:27, Peter Zijlstra <peterz@infradead.org> wrote:
> > +static inline bool did_preempt_short(struct cfs_rq *cfs_rq, struct sched_entity *curr)
> > +{
> > + if (!sched_feat(PREEMPT_SHORT))
> > + return false;
> > +
> > + if (curr->vlag == curr->deadline)
> > + return false;
> > +
> > + return !entity_eligible(cfs_rq, curr);
> > +}
> Can this be made more aggressive here? Something like , in the PREEMPT_SHORT
> + NO_RUN_TO_PARITY combination, it could break the first deadline of the current
> task. This can achieve better latency benefits in certain embedded scenarios, such as
> high-priority periodic tasks.
You are aware we have SCHED_DEADLINE for those, right?
Why can't you use that? And what can we do to fix that?
> +static inline bool did_preempt_short(struct cfs_rq *cfs_rq, struct sched_entity *curr)
> +{
> + if (!sched_feat(PREEMPT_SHORT))
> + return false;
> +
> + if (sched_feat(RUN_TO_PARITY) && curr->vlag == curr->deadline)
> + return false;
> +
> + return !entity_eligible(cfs_rq, curr);
> +}
No, this will destroy the steady state schedule (where no tasks
join/leave) and make it so that all tasks hug the lag=0 state
arbitrarily close -- as allowed by the scheduling quanta.
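(For readers less familiar with the jargon: roughly speaking, in the EEVDF code a
task's lag is proportional to V - v_i, where V is the weight-averaged vruntime of
the queue and v_i is the task's own vruntime, and entity_eligible() is essentially
the test v_i <= V. The "lag=0 state" above is every task sitting at v_i ~= V; if
current is rescheduled as soon as it becomes ineligible, rather than only after a
short-slice wakeup has cancelled its protection, the scheduler keeps switching
between tasks hugging that point and nobody accumulates a full slice of service.)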
Yes, it will get you better latency, because nobody gets to actually run
its requested slice.
The goal really is for tasks to get their request -- and yes that means
you get to wait. PREEMPT_SHORT is already an exception to this rule, and
it is very specifically limited to wake-ups so as to retain as much of
the intended behaviour as possible.
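The "request" here is the per-task slice. As a rough, untested sketch -- assuming
the sched_attr::sched_runtime slice knob from elsewhere in this series (or an
equivalent interface) -- a latency-sensitive task would ask for a short slice like
so, which is then what do_preempt_short() compares against current's slice on
wakeup:

#define _GNU_SOURCE
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

/* Local copy of the UAPI sched_attr layout; only the fields used here. */
struct sched_attr {
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;
	uint32_t sched_priority;
	uint64_t sched_runtime;		/* slice request, in ns */
	uint64_t sched_deadline;
	uint64_t sched_period;
};

static int request_short_slice(void)
{
	struct sched_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size          = sizeof(attr);
	attr.sched_policy  = 0;			/* SCHED_OTHER */
	attr.sched_runtime = 500 * 1000ULL;	/* 500us, as in the cyclictest runs above */

	return syscall(SYS_sched_setattr, 0 /* self */, &attr, 0);
}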
> Additionally, if possible, could you please include my name in this patch? I spent over a
> month finding this solution and conducting the tests, and I hope to leave some trace of
> my efforts during that time. This is also one of the reasons why I love Linux and am eager
> to contribute to open source. I would be extremely grateful.
I've made it the below. Does that work for you?
---
Subject: sched/eevdf: Allow shorter slices to wakeup-preempt
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue Sep 26 14:32:32 CEST 2023
Part of the reason to have shorter slices is to improve
responsiveness. Allow shorter slices to preempt longer slices on
wakeup.
Task | Runtime ms | Switches | Avg delay ms | Max delay ms | Sum delay ms |
100ms massive_intr 500us cyclictest NO_PREEMPT_SHORT
1 massive_intr:(5) | 846018.956 ms | 779188 | avg: 0.273 ms | max: 58.337 ms | sum:212545.245 ms |
2 massive_intr:(5) | 853450.693 ms | 792269 | avg: 0.275 ms | max: 71.193 ms | sum:218263.588 ms |
3 massive_intr:(5) | 843888.920 ms | 771456 | avg: 0.277 ms | max: 92.405 ms | sum:213353.221 ms |
1 chromium-browse:(8) | 53015.889 ms | 131766 | avg: 0.463 ms | max: 36.341 ms | sum:60959.230 ms |
2 chromium-browse:(8) | 53864.088 ms | 136962 | avg: 0.480 ms | max: 27.091 ms | sum:65687.681 ms |
3 chromium-browse:(9) | 53637.904 ms | 132637 | avg: 0.481 ms | max: 24.756 ms | sum:63781.673 ms |
1 cyclictest:(5) | 12615.604 ms | 639689 | avg: 0.471 ms | max: 32.272 ms | sum:301351.094 ms |
2 cyclictest:(5) | 12511.583 ms | 642578 | avg: 0.448 ms | max: 44.243 ms | sum:287632.830 ms |
3 cyclictest:(5) | 12545.867 ms | 635953 | avg: 0.475 ms | max: 25.530 ms | sum:302374.658 ms |
100ms massive_intr 500us cyclictest PREEMPT_SHORT
1 massive_intr:(5) | 839843.919 ms | 837384 | avg: 0.264 ms | max: 74.366 ms | sum:221476.885 ms |
2 massive_intr:(5) | 852449.913 ms | 845086 | avg: 0.252 ms | max: 68.162 ms | sum:212595.968 ms |
3 massive_intr:(5) | 839180.725 ms | 836883 | avg: 0.266 ms | max: 69.742 ms | sum:222812.038 ms |
1 chromium-browse:(11) | 54591.481 ms | 138388 | avg: 0.458 ms | max: 35.427 ms | sum:63401.508 ms |
2 chromium-browse:(8) | 52034.541 ms | 132276 | avg: 0.436 ms | max: 31.826 ms | sum:57732.958 ms |
3 chromium-browse:(8) | 55231.771 ms | 141892 | avg: 0.469 ms | max: 27.607 ms | sum:66538.697 ms |
1 cyclictest:(5) | 13156.391 ms | 667412 | avg: 0.373 ms | max: 38.247 ms | sum:249174.502 ms |
2 cyclictest:(5) | 12688.939 ms | 665144 | avg: 0.374 ms | max: 33.548 ms | sum:248509.392 ms |
3 cyclictest:(5) | 13475.623 ms | 669110 | avg: 0.370 ms | max: 37.819 ms | sum:247673.390 ms |
As per the numbers, this makes cyclictest's (short slice)
max-delay more consistent, and that consistency drops the sum-delay. The
trade-off is that massive_intr (long slice) gets more context
switches and a slight increase in sum-delay.
Chunxin contributed did_preempt_short() where a task that lost slice
protection from PREEMPT_SHORT gets rescheduled once it becomes
in-eligible.
[mike: numbers]
Co-Developed-by: Chunxin Zang <zangchunxin@lixiang.com>
Signed-off-by: Chunxin Zang <zangchunxin@lixiang.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Mike Galbraith <umgwanakikbuti@gmail.com>
Link: https://lkml.kernel.org/r/20240727105030.735459544@infradead.org
---
kernel/sched/fair.c | 64 ++++++++++++++++++++++++++++++++++++++++++------
kernel/sched/features.h | 5 +++
2 files changed, 61 insertions(+), 8 deletions(-)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -973,10 +973,10 @@ static void clear_buddies(struct cfs_rq
* XXX: strictly: vd_i += N*r_i/w_i such that: vd_i > ve_i
* this is probably good enough.
*/
-static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static bool update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
if ((s64)(se->vruntime - se->deadline) < 0)
- return;
+ return false;
/*
* For EEVDF the virtual time slope is determined by w_i (iow.
@@ -993,10 +993,7 @@ static void update_deadline(struct cfs_r
/*
* The task has consumed its request, reschedule.
*/
- if (cfs_rq->nr_running > 1) {
- resched_curr(rq_of(cfs_rq));
- clear_buddies(cfs_rq, se);
- }
+ return true;
}
#include "pelt.h"
@@ -1134,6 +1131,38 @@ static inline void update_curr_task(stru
dl_server_update(p->dl_server, delta_exec);
}
+static inline bool did_preempt_short(struct cfs_rq *cfs_rq, struct sched_entity *curr)
+{
+ if (!sched_feat(PREEMPT_SHORT))
+ return false;
+
+ if (curr->vlag == curr->deadline)
+ return false;
+
+ return !entity_eligible(cfs_rq, curr);
+}
+
+static inline bool do_preempt_short(struct cfs_rq *cfs_rq,
+ struct sched_entity *pse, struct sched_entity *se)
+{
+ if (!sched_feat(PREEMPT_SHORT))
+ return false;
+
+ if (pse->slice >= se->slice)
+ return false;
+
+ if (!entity_eligible(cfs_rq, pse))
+ return false;
+
+ if (entity_before(pse, se))
+ return true;
+
+ if (!entity_eligible(cfs_rq, se))
+ return true;
+
+ return false;
+}
+
/*
* Used by other classes to account runtime.
*/
@@ -1157,6 +1186,7 @@ static void update_curr(struct cfs_rq *c
struct sched_entity *curr = cfs_rq->curr;
struct rq *rq = rq_of(cfs_rq);
s64 delta_exec;
+ bool resched;
if (unlikely(!curr))
return;
@@ -1166,7 +1196,7 @@ static void update_curr(struct cfs_rq *c
return;
curr->vruntime += calc_delta_fair(delta_exec, curr);
- update_deadline(cfs_rq, curr);
+ resched = update_deadline(cfs_rq, curr);
update_min_vruntime(cfs_rq);
if (entity_is_task(curr)) {
@@ -1184,6 +1214,14 @@ static void update_curr(struct cfs_rq *c
}
account_cfs_rq_runtime(cfs_rq, delta_exec);
+
+ if (rq->nr_running == 1)
+ return;
+
+ if (resched || did_preempt_short(cfs_rq, curr)) {
+ resched_curr(rq);
+ clear_buddies(cfs_rq, curr);
+ }
}
static void update_curr_fair(struct rq *rq)
@@ -8611,7 +8649,17 @@ static void check_preempt_wakeup_fair(st
cfs_rq = cfs_rq_of(se);
update_curr(cfs_rq);
/*
- * XXX pick_eevdf(cfs_rq) != se ?
+ * If @p has a shorter slice than current and @p is eligible, override
+ * current's slice protection in order to allow preemption.
+ *
+ * Note that even if @p does not turn out to be the most eligible
+ * task at this moment, current's slice protection will be lost.
+ */
+ if (do_preempt_short(cfs_rq, pse, se) && se->vlag == se->deadline)
+ se->vlag = se->deadline + 1;
+
+ /*
+ * If @p has become the most eligible task, force preemption.
*/
if (pick_eevdf(cfs_rq) == pse)
goto preempt;
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -18,6 +18,11 @@ SCHED_FEAT(PLACE_REL_DEADLINE, true)
* 0-lag point or until is has exhausted it's slice.
*/
SCHED_FEAT(RUN_TO_PARITY, true)
+/*
+ * Allow wakeup of tasks with a shorter slice to cancel RESPECT_SLICE for
+ * current.
+ */
+SCHED_FEAT(PREEMPT_SHORT, true)
/*
* Prefer to schedule the task we woke last (assuming it failed
> On Aug 8, 2024, at 01:54, Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Mon, Aug 05, 2024 at 08:24:24PM +0800, Chunxin Zang wrote:
>>> On Jul 27, 2024, at 18:27, Peter Zijlstra <peterz@infradead.org> wrote:
>
>>> +static inline bool did_preempt_short(struct cfs_rq *cfs_rq, struct sched_entity *curr)
>>> +{
>>> + if (!sched_feat(PREEMPT_SHORT))
>>> + return false;
>>> +
>>> + if (curr->vlag == curr->deadline)
>>> + return false;
>>> +
>>> + return !entity_eligible(cfs_rq, curr);
>>> +}
>
>> Can this be made more aggressive here? Something like , in the PREEMPT_SHORT
>> + NO_RUN_TO_PARITY combination, it could break the first deadline of the current
>> task. This can achieve better latency benefits in certain embedded scenarios, such as
>> high-priority periodic tasks.
>
> You are aware we have SCHED_DEADLINE for those, right?
>
> Why can't you use that? And what can we do to fix that?
>
>> +static inline bool did_preempt_short(struct cfs_rq *cfs_rq, struct sched_entity *curr)
>> +{
>> + if (!sched_feat(PREEMPT_SHORT))
>> + return false;
>> +
>> + if (sched_feat(RUN_TO_PARITY) && curr->vlag == curr->deadline)
>> + return false;
>> +
>> + return !entity_eligible(cfs_rq, curr);
>> +}
>
> No, this will destroy the steady state schedule (where no tasks
> join/leave) and make it so that all tasks hug the lag=0 state
> arbitrarily close -- as allowed by the scheduling quanta.
>
> Yes, it will get you better latency, because nobody gets to actually run
> its requested slice.
>
> The goal really is for tasks to get their request -- and yes that means
> you get to wait. PREEMPT_SHORT is already an exception to this rule, and
> it is very specifically limited to wake-ups so as to retain as much of
> the intended behaviour as possible.
I think I understand your point now. That approach does indeed seem more reasonable.
Next, I will try using 'request slice' to conduct more tests in my scenario.
>
>> Additionally, if possible, could you please include my name in this patch? I spent over a
>> month finding this solution and conducting the tests, and I hope to leave some trace of
>> my efforts during that time. This is also one of the reasons why I love Linux and am eager
>> to contribute to open source. I would be extremely grateful.
>
> I've made it the below. Does that work for you?
Heartfelt thanks :)
Chunxin
>
> ---
> Subject: sched/eevdf: Allow shorter slices to wakeup-preempt
> From: Peter Zijlstra <peterz@infradead.org>
> Date: Tue Sep 26 14:32:32 CEST 2023
>
> Part of the reason to have shorter slices is to improve
> responsiveness. Allow shorter slices to preempt longer slices on
> wakeup.
>
> Task | Runtime ms | Switches | Avg delay ms | Max delay ms | Sum delay ms |
>
> 100ms massive_intr 500us cyclictest NO_PREEMPT_SHORT
>
> 1 massive_intr:(5) | 846018.956 ms | 779188 | avg: 0.273 ms | max: 58.337 ms | sum:212545.245 ms |
> 2 massive_intr:(5) | 853450.693 ms | 792269 | avg: 0.275 ms | max: 71.193 ms | sum:218263.588 ms |
> 3 massive_intr:(5) | 843888.920 ms | 771456 | avg: 0.277 ms | max: 92.405 ms | sum:213353.221 ms |
> 1 chromium-browse:(8) | 53015.889 ms | 131766 | avg: 0.463 ms | max: 36.341 ms | sum:60959.230 ms |
> 2 chromium-browse:(8) | 53864.088 ms | 136962 | avg: 0.480 ms | max: 27.091 ms | sum:65687.681 ms |
> 3 chromium-browse:(9) | 53637.904 ms | 132637 | avg: 0.481 ms | max: 24.756 ms | sum:63781.673 ms |
> 1 cyclictest:(5) | 12615.604 ms | 639689 | avg: 0.471 ms | max: 32.272 ms | sum:301351.094 ms |
> 2 cyclictest:(5) | 12511.583 ms | 642578 | avg: 0.448 ms | max: 44.243 ms | sum:287632.830 ms |
> 3 cyclictest:(5) | 12545.867 ms | 635953 | avg: 0.475 ms | max: 25.530 ms | sum:302374.658 ms |
>
> 100ms massive_intr 500us cyclictest PREEMPT_SHORT
>
> 1 massive_intr:(5) | 839843.919 ms | 837384 | avg: 0.264 ms | max: 74.366 ms | sum:221476.885 ms |
> 2 massive_intr:(5) | 852449.913 ms | 845086 | avg: 0.252 ms | max: 68.162 ms | sum:212595.968 ms |
> 3 massive_intr:(5) | 839180.725 ms | 836883 | avg: 0.266 ms | max: 69.742 ms | sum:222812.038 ms |
> 1 chromium-browse:(11) | 54591.481 ms | 138388 | avg: 0.458 ms | max: 35.427 ms | sum:63401.508 ms |
> 2 chromium-browse:(8) | 52034.541 ms | 132276 | avg: 0.436 ms | max: 31.826 ms | sum:57732.958 ms |
> 3 chromium-browse:(8) | 55231.771 ms | 141892 | avg: 0.469 ms | max: 27.607 ms | sum:66538.697 ms |
> 1 cyclictest:(5) | 13156.391 ms | 667412 | avg: 0.373 ms | max: 38.247 ms | sum:249174.502 ms |
> 2 cyclictest:(5) | 12688.939 ms | 665144 | avg: 0.374 ms | max: 33.548 ms | sum:248509.392 ms |
> 3 cyclictest:(5) | 13475.623 ms | 669110 | avg: 0.370 ms | max: 37.819 ms | sum:247673.390 ms |
>
> As per the numbers, this makes cyclictest's (short slice)
> max-delay more consistent, and that consistency drops the sum-delay. The
> trade-off is that massive_intr (long slice) gets more context
> switches and a slight increase in sum-delay.
>
> Chunxin contributed did_preempt_short() where a task that lost slice
> protection from PREEMPT_SHORT gets rescheduled once it becomes
> in-eligible.
>
> [mike: numbers]
> Co-Developed-by: Chunxin Zang <zangchunxin@lixiang.com>
> Signed-off-by: Chunxin Zang <zangchunxin@lixiang.com>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Tested-by: Mike Galbraith <umgwanakikbuti@gmail.com>
> Link: https://lkml.kernel.org/r/20240727105030.735459544@infradead.org
> ---
> kernel/sched/fair.c | 64 ++++++++++++++++++++++++++++++++++++++++++------
> kernel/sched/features.h | 5 +++
> 2 files changed, 61 insertions(+), 8 deletions(-)
>
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -973,10 +973,10 @@ static void clear_buddies(struct cfs_rq
> * XXX: strictly: vd_i += N*r_i/w_i such that: vd_i > ve_i
> * this is probably good enough.
> */
> -static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
> +static bool update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
> {
> if ((s64)(se->vruntime - se->deadline) < 0)
> - return;
> + return false;
>
> /*
> * For EEVDF the virtual time slope is determined by w_i (iow.
> @@ -993,10 +993,7 @@ static void update_deadline(struct cfs_r
> /*
> * The task has consumed its request, reschedule.
> */
> - if (cfs_rq->nr_running > 1) {
> - resched_curr(rq_of(cfs_rq));
> - clear_buddies(cfs_rq, se);
> - }
> + return true;
> }
>
> #include "pelt.h"
> @@ -1134,6 +1131,38 @@ static inline void update_curr_task(stru
> dl_server_update(p->dl_server, delta_exec);
> }
>
> +static inline bool did_preempt_short(struct cfs_rq *cfs_rq, struct sched_entity *curr)
> +{
> + if (!sched_feat(PREEMPT_SHORT))
> + return false;
> +
> + if (curr->vlag == curr->deadline)
> + return false;
> +
> + return !entity_eligible(cfs_rq, curr);
> +}
> +
> +static inline bool do_preempt_short(struct cfs_rq *cfs_rq,
> + struct sched_entity *pse, struct sched_entity *se)
> +{
> + if (!sched_feat(PREEMPT_SHORT))
> + return false;
> +
> + if (pse->slice >= se->slice)
> + return false;
> +
> + if (!entity_eligible(cfs_rq, pse))
> + return false;
> +
> + if (entity_before(pse, se))
> + return true;
> +
> + if (!entity_eligible(cfs_rq, se))
> + return true;
> +
> + return false;
> +}
> +
> /*
> * Used by other classes to account runtime.
> */
> @@ -1157,6 +1186,7 @@ static void update_curr(struct cfs_rq *c
> struct sched_entity *curr = cfs_rq->curr;
> struct rq *rq = rq_of(cfs_rq);
> s64 delta_exec;
> + bool resched;
>
> if (unlikely(!curr))
> return;
> @@ -1166,7 +1196,7 @@ static void update_curr(struct cfs_rq *c
> return;
>
> curr->vruntime += calc_delta_fair(delta_exec, curr);
> - update_deadline(cfs_rq, curr);
> + resched = update_deadline(cfs_rq, curr);
> update_min_vruntime(cfs_rq);
>
> if (entity_is_task(curr)) {
> @@ -1184,6 +1214,14 @@ static void update_curr(struct cfs_rq *c
> }
>
> account_cfs_rq_runtime(cfs_rq, delta_exec);
> +
> + if (rq->nr_running == 1)
> + return;
> +
> + if (resched || did_preempt_short(cfs_rq, curr)) {
> + resched_curr(rq);
> + clear_buddies(cfs_rq, curr);
> + }
> }
>
> static void update_curr_fair(struct rq *rq)
> @@ -8611,7 +8649,17 @@ static void check_preempt_wakeup_fair(st
> cfs_rq = cfs_rq_of(se);
> update_curr(cfs_rq);
> /*
> - * XXX pick_eevdf(cfs_rq) != se ?
> + * If @p has a shorter slice than current and @p is eligible, override
> + * current's slice protection in order to allow preemption.
> + *
> + * Note that even if @p does not turn out to be the most eligible
> + * task at this moment, current's slice protection will be lost.
> + */
> + if (do_preempt_short(cfs_rq, pse, se) && se->vlag == se->deadline)
> + se->vlag = se->deadline + 1;
> +
> + /*
> + * If @p has become the most eligible task, force preemption.
> */
> if (pick_eevdf(cfs_rq) == pse)
> goto preempt;
> --- a/kernel/sched/features.h
> +++ b/kernel/sched/features.h
> @@ -18,6 +18,11 @@ SCHED_FEAT(PLACE_REL_DEADLINE, true)
> * 0-lag point or until is has exhausted it's slice.
> */
> SCHED_FEAT(RUN_TO_PARITY, true)
> +/*
> + * Allow wakeup of tasks with a shorter slice to cancel RESPECT_SLICE for
> + * current.
> + */
> +SCHED_FEAT(PREEMPT_SHORT, true)
>
> /*
> * Prefer to schedule the task we woke last (assuming it failed
The following commit has been merged into the sched/core branch of tip:
Commit-ID: 85e511df3cec46021024176672a748008ed135bf
Gitweb: https://git.kernel.org/tip/85e511df3cec46021024176672a748008ed135bf
Author: Peter Zijlstra <peterz@infradead.org>
AuthorDate: Tue, 26 Sep 2023 14:32:32 +02:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Sat, 17 Aug 2024 11:06:45 +02:00
sched/eevdf: Allow shorter slices to wakeup-preempt
Part of the reason to have shorter slices is to improve
responsiveness. Allow shorter slices to preempt longer slices on
wakeup.
Task | Runtime ms | Switches | Avg delay ms | Max delay ms | Sum delay ms |
100ms massive_intr 500us cyclictest NO_PREEMPT_SHORT
1 massive_intr:(5) | 846018.956 ms | 779188 | avg: 0.273 ms | max: 58.337 ms | sum:212545.245 ms |
2 massive_intr:(5) | 853450.693 ms | 792269 | avg: 0.275 ms | max: 71.193 ms | sum:218263.588 ms |
3 massive_intr:(5) | 843888.920 ms | 771456 | avg: 0.277 ms | max: 92.405 ms | sum:213353.221 ms |
1 chromium-browse:(8) | 53015.889 ms | 131766 | avg: 0.463 ms | max: 36.341 ms | sum:60959.230 ms |
2 chromium-browse:(8) | 53864.088 ms | 136962 | avg: 0.480 ms | max: 27.091 ms | sum:65687.681 ms |
3 chromium-browse:(9) | 53637.904 ms | 132637 | avg: 0.481 ms | max: 24.756 ms | sum:63781.673 ms |
1 cyclictest:(5) | 12615.604 ms | 639689 | avg: 0.471 ms | max: 32.272 ms | sum:301351.094 ms |
2 cyclictest:(5) | 12511.583 ms | 642578 | avg: 0.448 ms | max: 44.243 ms | sum:287632.830 ms |
3 cyclictest:(5) | 12545.867 ms | 635953 | avg: 0.475 ms | max: 25.530 ms | sum:302374.658 ms |
100ms massive_intr 500us cyclictest PREEMPT_SHORT
1 massive_intr:(5) | 839843.919 ms | 837384 | avg: 0.264 ms | max: 74.366 ms | sum:221476.885 ms |
2 massive_intr:(5) | 852449.913 ms | 845086 | avg: 0.252 ms | max: 68.162 ms | sum:212595.968 ms |
3 massive_intr:(5) | 839180.725 ms | 836883 | avg: 0.266 ms | max: 69.742 ms | sum:222812.038 ms |
1 chromium-browse:(11) | 54591.481 ms | 138388 | avg: 0.458 ms | max: 35.427 ms | sum:63401.508 ms |
2 chromium-browse:(8) | 52034.541 ms | 132276 | avg: 0.436 ms | max: 31.826 ms | sum:57732.958 ms |
3 chromium-browse:(8) | 55231.771 ms | 141892 | avg: 0.469 ms | max: 27.607 ms | sum:66538.697 ms |
1 cyclictest:(5) | 13156.391 ms | 667412 | avg: 0.373 ms | max: 38.247 ms | sum:249174.502 ms |
2 cyclictest:(5) | 12688.939 ms | 665144 | avg: 0.374 ms | max: 33.548 ms | sum:248509.392 ms |
3 cyclictest:(5) | 13475.623 ms | 669110 | avg: 0.370 ms | max: 37.819 ms | sum:247673.390 ms |
As per the numbers, this makes cyclictest's (short slice)
max-delay more consistent, and that consistency drops the sum-delay. The
trade-off is that massive_intr (long slice) gets more context
switches and a slight increase in sum-delay.
Chunxin contributed did_preempt_short() where a task that lost slice
protection from PREEMPT_SHORT gets rescheduled once it becomes
in-eligible.
[mike: numbers]
Co-Developed-by: Chunxin Zang <zangchunxin@lixiang.com>
Signed-off-by: Chunxin Zang <zangchunxin@lixiang.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Valentin Schneider <vschneid@redhat.com>
Tested-by: Mike Galbraith <umgwanakikbuti@gmail.com>
Link: https://lkml.kernel.org/r/20240727105030.735459544@infradead.org
---
kernel/sched/fair.c | 64 +++++++++++++++++++++++++++++++++++-----
kernel/sched/features.h | 5 +++-
2 files changed, 61 insertions(+), 8 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fef0e1f..cc30ea3 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -973,10 +973,10 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se);
* XXX: strictly: vd_i += N*r_i/w_i such that: vd_i > ve_i
* this is probably good enough.
*/
-static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static bool update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
if ((s64)(se->vruntime - se->deadline) < 0)
- return;
+ return false;
/*
* For EEVDF the virtual time slope is determined by w_i (iow.
@@ -993,10 +993,7 @@ static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
/*
* The task has consumed its request, reschedule.
*/
- if (cfs_rq->nr_running > 1) {
- resched_curr(rq_of(cfs_rq));
- clear_buddies(cfs_rq, se);
- }
+ return true;
}
#include "pelt.h"
@@ -1134,6 +1131,38 @@ static inline void update_curr_task(struct task_struct *p, s64 delta_exec)
dl_server_update(p->dl_server, delta_exec);
}
+static inline bool did_preempt_short(struct cfs_rq *cfs_rq, struct sched_entity *curr)
+{
+ if (!sched_feat(PREEMPT_SHORT))
+ return false;
+
+ if (curr->vlag == curr->deadline)
+ return false;
+
+ return !entity_eligible(cfs_rq, curr);
+}
+
+static inline bool do_preempt_short(struct cfs_rq *cfs_rq,
+ struct sched_entity *pse, struct sched_entity *se)
+{
+ if (!sched_feat(PREEMPT_SHORT))
+ return false;
+
+ if (pse->slice >= se->slice)
+ return false;
+
+ if (!entity_eligible(cfs_rq, pse))
+ return false;
+
+ if (entity_before(pse, se))
+ return true;
+
+ if (!entity_eligible(cfs_rq, se))
+ return true;
+
+ return false;
+}
+
/*
* Used by other classes to account runtime.
*/
@@ -1157,6 +1186,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
struct sched_entity *curr = cfs_rq->curr;
struct rq *rq = rq_of(cfs_rq);
s64 delta_exec;
+ bool resched;
if (unlikely(!curr))
return;
@@ -1166,7 +1196,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
return;
curr->vruntime += calc_delta_fair(delta_exec, curr);
- update_deadline(cfs_rq, curr);
+ resched = update_deadline(cfs_rq, curr);
update_min_vruntime(cfs_rq);
if (entity_is_task(curr)) {
@@ -1184,6 +1214,14 @@ static void update_curr(struct cfs_rq *cfs_rq)
}
account_cfs_rq_runtime(cfs_rq, delta_exec);
+
+ if (rq->nr_running == 1)
+ return;
+
+ if (resched || did_preempt_short(cfs_rq, curr)) {
+ resched_curr(rq);
+ clear_buddies(cfs_rq, curr);
+ }
}
static void update_curr_fair(struct rq *rq)
@@ -8605,7 +8643,17 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
cfs_rq = cfs_rq_of(se);
update_curr(cfs_rq);
/*
- * XXX pick_eevdf(cfs_rq) != se ?
+ * If @p has a shorter slice than current and @p is eligible, override
+ * current's slice protection in order to allow preemption.
+ *
+ * Note that even if @p does not turn out to be the most eligible
+ * task at this moment, current's slice protection will be lost.
+ */
+ if (do_preempt_short(cfs_rq, pse, se) && se->vlag == se->deadline)
+ se->vlag = se->deadline + 1;
+
+ /*
+ * If @p has become the most eligible task, force preemption.
*/
if (pick_eevdf(cfs_rq) == pse)
goto preempt;
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index caa4d72..2908740 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -18,6 +18,11 @@ SCHED_FEAT(PLACE_REL_DEADLINE, true)
* 0-lag point or until is has exhausted it's slice.
*/
SCHED_FEAT(RUN_TO_PARITY, true)
+/*
+ * Allow wakeup of tasks with a shorter slice to cancel RESPECT_SLICE for
+ * current.
+ */
+SCHED_FEAT(PREEMPT_SHORT, true)
/*
* Prefer to schedule the task we woke last (assuming it failed