[PATCH 13/24] sched/fair: Prepare pick_next_task() for delayed dequeue

Peter Zijlstra posted 24 patches 1 year, 4 months ago
[PATCH 13/24] sched/fair: Prepare pick_next_task() for delayed dequeue
Posted by Peter Zijlstra 1 year, 4 months ago
Delayed dequeue's natural end is when it gets picked again. Ensure
pick_next_task() knows what to do with delayed tasks.

Note, this relies on the earlier patch that made pick_next_task()
state invariant -- it will restart the pick on dequeue, because
obviously the just dequeued task is no longer eligible.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/fair.c |   23 +++++++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5453,6 +5453,8 @@ set_next_entity(struct cfs_rq *cfs_rq, s
 	se->prev_sum_exec_runtime = se->sum_exec_runtime;
 }
 
+static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags);
+
 /*
  * Pick the next process, keeping these things in mind, in this order:
  * 1) keep things fair between processes/task groups
@@ -5461,16 +5463,27 @@ set_next_entity(struct cfs_rq *cfs_rq, s
  * 4) do not run the "skip" process, if something else is available
  */
 static struct sched_entity *
-pick_next_entity(struct cfs_rq *cfs_rq)
+pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq)
 {
 	/*
 	 * Enabling NEXT_BUDDY will affect latency but not fairness.
 	 */
 	if (sched_feat(NEXT_BUDDY) &&
-	    cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next))
+	    cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) {
+		/* ->next will never be delayed */
+		SCHED_WARN_ON(cfs_rq->next->sched_delayed);
 		return cfs_rq->next;
+	}
+
+	struct sched_entity *se = pick_eevdf(cfs_rq);
+	if (se->sched_delayed) {
+		dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
+		SCHED_WARN_ON(se->sched_delayed);
+		SCHED_WARN_ON(se->on_rq);
 
-	return pick_eevdf(cfs_rq);
+		return NULL;
+	}
+	return se;
 }
 
 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
@@ -8478,7 +8491,9 @@ static struct task_struct *pick_task_fai
 		if (unlikely(check_cfs_rq_runtime(cfs_rq)))
 			goto again;
 
-		se = pick_next_entity(cfs_rq);
+		se = pick_next_entity(rq, cfs_rq);
+		if (!se)
+			goto again;
 		cfs_rq = group_cfs_rq(se);
 	} while (cfs_rq);
Re: [PATCH 13/24] sched/fair: Prepare pick_next_task() for delayed dequeue
Posted by Luis Machado 1 year, 3 months ago
Hi Peter,

On 7/27/24 11:27, Peter Zijlstra wrote:
> Delayed dequeue's natural end is when it gets picked again. Ensure
> pick_next_task() knows what to do with delayed tasks.
> 
> Note, this relies on the earlier patch that made pick_next_task()
> state invariant -- it will restart the pick on dequeue, because
> obviously the just dequeued task is no longer eligible.
> 
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>  kernel/sched/fair.c |   23 +++++++++++++++++++----
>  1 file changed, 19 insertions(+), 4 deletions(-)
> 
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5453,6 +5453,8 @@ set_next_entity(struct cfs_rq *cfs_rq, s
>  	se->prev_sum_exec_runtime = se->sum_exec_runtime;
>  }
>  
> +static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags);
> +
>  /*
>   * Pick the next process, keeping these things in mind, in this order:
>   * 1) keep things fair between processes/task groups
> @@ -5461,16 +5463,27 @@ set_next_entity(struct cfs_rq *cfs_rq, s
>   * 4) do not run the "skip" process, if something else is available
>   */
>  static struct sched_entity *
> -pick_next_entity(struct cfs_rq *cfs_rq)
> +pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq)
>  {
>  	/*
>  	 * Enabling NEXT_BUDDY will affect latency but not fairness.
>  	 */
>  	if (sched_feat(NEXT_BUDDY) &&
> -	    cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next))
> +	    cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) {
> +		/* ->next will never be delayed */
> +		SCHED_WARN_ON(cfs_rq->next->sched_delayed);
>  		return cfs_rq->next;
> +	}
> +
> +	struct sched_entity *se = pick_eevdf(cfs_rq);
> +	if (se->sched_delayed) {
> +		dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
> +		SCHED_WARN_ON(se->sched_delayed);
> +		SCHED_WARN_ON(se->on_rq);

While exercising the h_nr_delayed changes on Android/Pixel 6 (6.8-based), I ran into
a situation where pick_eevdf seems to be returning NULL, and then we proceed to try to
dereference it and crash during boot.

I can fix it by guarding against a NULL se after the call to pick_eevdf, and then the code
runs OK from there as pick_task_fair will have another go at trying to pick the next entity.

I haven't checked exactly why we return NULL from pick_eevdf, but I recall seeing similar
reports of pick_eevdf sometimes failing to pick any task. Anyway, I thought I'd point this
out in case others see a similar situation.

Back to testing the h_nr_delayed changes.
[tip: sched/core] sched/fair: Prepare pick_next_task() for delayed dequeue
Posted by tip-bot2 for Peter Zijlstra 1 year, 3 months ago
The following commit has been merged into the sched/core branch of tip:

Commit-ID:     f12e148892ede8d9ee82bcd3e469e6d01fc077ac
Gitweb:        https://git.kernel.org/tip/f12e148892ede8d9ee82bcd3e469e6d01fc077ac
Author:        Peter Zijlstra <peterz@infradead.org>
AuthorDate:    Thu, 23 May 2024 11:26:25 +02:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Sat, 17 Aug 2024 11:06:43 +02:00

sched/fair: Prepare pick_next_task() for delayed dequeue

Delayed dequeue's natural end is when it gets picked again. Ensure
pick_next_task() knows what to do with delayed tasks.

Note, this relies on the earlier patch that made pick_next_task()
state invariant -- it will restart the pick on dequeue, because
obviously the just dequeued task is no longer eligible.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Valentin Schneider <vschneid@redhat.com>
Tested-by: Valentin Schneider <vschneid@redhat.com>
Link: https://lkml.kernel.org/r/20240727105029.747330118@infradead.org
---
 kernel/sched/fair.c | 23 +++++++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 9a84903..a4f1f79 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5473,6 +5473,8 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	se->prev_sum_exec_runtime = se->sum_exec_runtime;
 }
 
+static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags);
+
 /*
  * Pick the next process, keeping these things in mind, in this order:
  * 1) keep things fair between processes/task groups
@@ -5481,16 +5483,27 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
  * 4) do not run the "skip" process, if something else is available
  */
 static struct sched_entity *
-pick_next_entity(struct cfs_rq *cfs_rq)
+pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq)
 {
 	/*
 	 * Enabling NEXT_BUDDY will affect latency but not fairness.
 	 */
 	if (sched_feat(NEXT_BUDDY) &&
-	    cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next))
+	    cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) {
+		/* ->next will never be delayed */
+		SCHED_WARN_ON(cfs_rq->next->sched_delayed);
 		return cfs_rq->next;
+	}
+
+	struct sched_entity *se = pick_eevdf(cfs_rq);
+	if (se->sched_delayed) {
+		dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
+		SCHED_WARN_ON(se->sched_delayed);
+		SCHED_WARN_ON(se->on_rq);
 
-	return pick_eevdf(cfs_rq);
+		return NULL;
+	}
+	return se;
 }
 
 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
@@ -8507,7 +8520,9 @@ again:
 		if (unlikely(check_cfs_rq_runtime(cfs_rq)))
 			goto again;
 
-		se = pick_next_entity(cfs_rq);
+		se = pick_next_entity(rq, cfs_rq);
+		if (!se)
+			goto again;
 		cfs_rq = group_cfs_rq(se);
 	} while (cfs_rq);