[PATCH 07/24] sched/fair: Re-organize dequeue_task_fair()

Posted by Peter Zijlstra 1 year, 4 months ago
Working towards delaying dequeue, notably also inside the hierarchy,
rework dequeue_task_fair() such that it can 'resume' an interrupted
hierarchy walk.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/fair.c |   61 ++++++++++++++++++++++++++++++++++------------------
 1 file changed, 40 insertions(+), 21 deletions(-)

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6861,34 +6861,43 @@ enqueue_task_fair(struct rq *rq, struct
 static void set_next_buddy(struct sched_entity *se);
 
 /*
- * The dequeue_task method is called before nr_running is
- * decreased. We remove the task from the rbtree and
- * update the fair scheduling stats:
+ * Basically dequeue_task_fair(), except it can deal with dequeue_entity()
+ * failing half-way through and resume the dequeue later.
+ *
+ * Returns:
+ * -1 - dequeue delayed
+ *  0 - dequeue throttled
+ *  1 - dequeue complete
  */
-static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
+static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
 {
-	struct cfs_rq *cfs_rq;
-	struct sched_entity *se = &p->se;
-	int task_sleep = flags & DEQUEUE_SLEEP;
-	int idle_h_nr_running = task_has_idle_policy(p);
 	bool was_sched_idle = sched_idle_rq(rq);
 	int rq_h_nr_running = rq->cfs.h_nr_running;
+	bool task_sleep = flags & DEQUEUE_SLEEP;
+	struct task_struct *p = NULL;
+	int idle_h_nr_running = 0;
+	int h_nr_running = 0;
+	struct cfs_rq *cfs_rq;
 
-	util_est_dequeue(&rq->cfs, p);
+	if (entity_is_task(se)) {
+		p = task_of(se);
+		h_nr_running = 1;
+		idle_h_nr_running = task_has_idle_policy(p);
+	}
 
 	for_each_sched_entity(se) {
 		cfs_rq = cfs_rq_of(se);
 		dequeue_entity(cfs_rq, se, flags);
 
-		cfs_rq->h_nr_running--;
+		cfs_rq->h_nr_running -= h_nr_running;
 		cfs_rq->idle_h_nr_running -= idle_h_nr_running;
 
 		if (cfs_rq_is_idle(cfs_rq))
-			idle_h_nr_running = 1;
+			idle_h_nr_running = h_nr_running;
 
 		/* end evaluation on encountering a throttled cfs_rq */
 		if (cfs_rq_throttled(cfs_rq))
-			goto dequeue_throttle;
+			return 0;
 
 		/* Don't dequeue parent if it has other entities besides us */
 		if (cfs_rq->load.weight) {
@@ -6912,20 +6921,18 @@ static bool dequeue_task_fair(struct rq
 		se_update_runnable(se);
 		update_cfs_group(se);
 
-		cfs_rq->h_nr_running--;
+		cfs_rq->h_nr_running -= h_nr_running;
 		cfs_rq->idle_h_nr_running -= idle_h_nr_running;
 
 		if (cfs_rq_is_idle(cfs_rq))
-			idle_h_nr_running = 1;
+			idle_h_nr_running = h_nr_running;
 
 		/* end evaluation on encountering a throttled cfs_rq */
 		if (cfs_rq_throttled(cfs_rq))
-			goto dequeue_throttle;
-
+			return 0;
 	}
 
-	/* At this point se is NULL and we are at root level*/
-	sub_nr_running(rq, 1);
+	sub_nr_running(rq, h_nr_running);
 
 	if (rq_h_nr_running && !rq->cfs.h_nr_running)
 		dl_server_stop(&rq->fair_server);
@@ -6934,10 +6941,22 @@ static bool dequeue_task_fair(struct rq
 	if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
 		rq->next_balance = jiffies;
 
-dequeue_throttle:
-	util_est_update(&rq->cfs, p, task_sleep);
-	hrtick_update(rq);
+	return 1;
+}
+/*
+ * The dequeue_task method is called before nr_running is
+ * decreased. We remove the task from the rbtree and
+ * update the fair scheduling stats:
+ */
+static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
+{
+	util_est_dequeue(&rq->cfs, p);
 
+	if (dequeue_entities(rq, &p->se, flags) < 0)
+		return false;
+
+	util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP);
+	hrtick_update(rq);
 	return true;
 }
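
To make the return contract concrete, here is a minimal caller sketch (editorial, not part of the patch; example_dequeue() is a hypothetical name mirroring the new dequeue_task_fair() above). Only the delayed case (-1) is reported as a failed dequeue; a throttled walk (0) and a complete walk (1) both count as success from the task's point of view, which is why dequeue_task_fair() only tests for a negative return.

	/* Hypothetical caller, illustrating how the tri-state return is consumed. */
	static bool example_dequeue(struct rq *rq, struct task_struct *p, int flags)
	{
		switch (dequeue_entities(rq, &p->se, flags)) {
		case -1:	/* dequeue delayed: @p stays queued for now */
			return false;
		case  0:	/* walk stopped early at a throttled cfs_rq */
		case  1:	/* walked and accounted all the way to the root */
		default:
			return true;
		}
	}
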
Re: [PATCH 07/24] sched/fair: Re-organize dequeue_task_fair()
Posted by Valentin Schneider 1 year, 4 months ago
On 27/07/24 12:27, Peter Zijlstra wrote:
> Working towards delaying dequeue, notably also inside the hierarchy,
> rework dequeue_task_fair() such that it can 'resume' an interrupted
> hierarchy walk.
>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>  kernel/sched/fair.c |   61 ++++++++++++++++++++++++++++++++++------------------
>  1 file changed, 40 insertions(+), 21 deletions(-)
>
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -6861,34 +6861,43 @@ enqueue_task_fair(struct rq *rq, struct
>  static void set_next_buddy(struct sched_entity *se);
>
>  /*
> - * The dequeue_task method is called before nr_running is
> - * decreased. We remove the task from the rbtree and
> - * update the fair scheduling stats:
> + * Basically dequeue_task_fair(), except it can deal with dequeue_entity()
> + * failing half-way through and resume the dequeue later.
> + *
> + * Returns:
> + * -1 - dequeue delayed
> + *  0 - dequeue throttled
> + *  1 - dequeue complete
>   */
> -static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> +static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
>  {
> -	struct cfs_rq *cfs_rq;
> -	struct sched_entity *se = &p->se;
> -	int task_sleep = flags & DEQUEUE_SLEEP;
> -	int idle_h_nr_running = task_has_idle_policy(p);
>       bool was_sched_idle = sched_idle_rq(rq);
>       int rq_h_nr_running = rq->cfs.h_nr_running;
> +	bool task_sleep = flags & DEQUEUE_SLEEP;
> +	struct task_struct *p = NULL;
> +	int idle_h_nr_running = 0;
> +	int h_nr_running = 0;
> +	struct cfs_rq *cfs_rq;
>
> -	util_est_dequeue(&rq->cfs, p);
> +	if (entity_is_task(se)) {
> +		p = task_of(se);
> +		h_nr_running = 1;
> +		idle_h_nr_running = task_has_idle_policy(p);
> +	}
>

This leaves the *h_nr_running counts at 0 for non-task entities. IIUC this makes
sense for ->sched_delayed entities (they should be empty of tasks), not so
sure for the other case. However, this only ends up being used for non-task
entities in:
- pick_next_entity(), if se->sched_delayed
- unregister_fair_sched_group()

IIRC unregister_fair_sched_group() can only happen after the group has been
drained, so it would then indeed be empty of tasks, but I reckon this could
do with a comment/assert in dequeue_entities(), no? Or did I get too
confused by cgroups again?
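
As a rough sketch of the suggested comment/assert (illustrative only, not part of the posted patch; the exact condition assumes a non-task entity reaching dequeue_entities() carries no runnable tasks, which is precisely the property discussed above):

	if (entity_is_task(se)) {
		p = task_of(se);
		h_nr_running = 1;
		idle_h_nr_running = task_has_idle_policy(p);
	} else {
		/*
		 * Hypothetical assert: a group entity should only get here
		 * once it is empty of runnable tasks (a ->sched_delayed
		 * entity, or a group being torn down after being drained),
		 * so leaving h_nr_running at 0 is correct.
		 */
		SCHED_WARN_ON(group_cfs_rq(se)->h_nr_running);
	}
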
Re: [PATCH 07/24] sched/fair: Re-organize dequeue_task_fair()
Posted by Peter Zijlstra 1 year, 4 months ago
On Fri, Aug 09, 2024 at 06:53:30PM +0200, Valentin Schneider wrote:
> On 27/07/24 12:27, Peter Zijlstra wrote:
> > Working towards delaying dequeue, notably also inside the hierarchy,
> > rework dequeue_task_fair() such that it can 'resume' an interrupted
> > hierarchy walk.
> >
> > Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> > ---
> >  kernel/sched/fair.c |   61 ++++++++++++++++++++++++++++++++++------------------
> >  1 file changed, 40 insertions(+), 21 deletions(-)
> >
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -6861,34 +6861,43 @@ enqueue_task_fair(struct rq *rq, struct
> >  static void set_next_buddy(struct sched_entity *se);
> >
> >  /*
> > - * The dequeue_task method is called before nr_running is
> > - * decreased. We remove the task from the rbtree and
> > - * update the fair scheduling stats:
> > + * Basically dequeue_task_fair(), except it can deal with dequeue_entity()
> > + * failing half-way through and resume the dequeue later.
> > + *
> > + * Returns:
> > + * -1 - dequeue delayed
> > + *  0 - dequeue throttled
> > + *  1 - dequeue complete
> >   */
> > -static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> > +static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
> >  {
> > -	struct cfs_rq *cfs_rq;
> > -	struct sched_entity *se = &p->se;
> > -	int task_sleep = flags & DEQUEUE_SLEEP;
> > -	int idle_h_nr_running = task_has_idle_policy(p);
> >       bool was_sched_idle = sched_idle_rq(rq);
> >       int rq_h_nr_running = rq->cfs.h_nr_running;
> > +	bool task_sleep = flags & DEQUEUE_SLEEP;
> > +	struct task_struct *p = NULL;
> > +	int idle_h_nr_running = 0;
> > +	int h_nr_running = 0;
> > +	struct cfs_rq *cfs_rq;
> >
> > -	util_est_dequeue(&rq->cfs, p);
> > +	if (entity_is_task(se)) {
> > +		p = task_of(se);
> > +		h_nr_running = 1;
> > +		idle_h_nr_running = task_has_idle_policy(p);
> > +	}
> >
> 
> This leaves the *h_nr_running to 0 for non-task entities. IIUC this makes
> sense for ->sched_delayed entities (they should be empty of tasks), not so
> sure for the other case. However, this only ends up being used for non-task
> entities in:
> - pick_next_entity(), if se->sched_delayed
> - unregister_fair_sched_group()
> 
> IIRC unregister_fair_sched_group() can only happen after the group has been
> drained, so it would then indeed be empty of tasks, but I reckon this could
> do with a comment/assert in dequeue_entities(), no? Or did I get too
> confused by cgroups again?
> 

Yeah, so I did have me a patch that made all this work for cfs bandwidth
control as well. And then we need all this for throttled cgroup entries
as well.

Anyway... I had the patch, it worked, but then I remembered you were
going to rewrite all that anyway and I was making a terrible mess of
things, so I made it go away again.
Re: [PATCH 07/24] sched/fair: Re-organize dequeue_task_fair()
Posted by Valentin Schneider 1 year, 4 months ago
On 11/08/24 00:17, Peter Zijlstra wrote:
> On Fri, Aug 09, 2024 at 06:53:30PM +0200, Valentin Schneider wrote:
>> On 27/07/24 12:27, Peter Zijlstra wrote:
>> > -static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>> > +static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
>> >  {
>> > -	struct cfs_rq *cfs_rq;
>> > -	struct sched_entity *se = &p->se;
>> > -	int task_sleep = flags & DEQUEUE_SLEEP;
>> > -	int idle_h_nr_running = task_has_idle_policy(p);
>> >       bool was_sched_idle = sched_idle_rq(rq);
>> >       int rq_h_nr_running = rq->cfs.h_nr_running;
>> > +	bool task_sleep = flags & DEQUEUE_SLEEP;
>> > +	struct task_struct *p = NULL;
>> > +	int idle_h_nr_running = 0;
>> > +	int h_nr_running = 0;
>> > +	struct cfs_rq *cfs_rq;
>> >
>> > -	util_est_dequeue(&rq->cfs, p);
>> > +	if (entity_is_task(se)) {
>> > +		p = task_of(se);
>> > +		h_nr_running = 1;
>> > +		idle_h_nr_running = task_has_idle_policy(p);
>> > +	}
>> >
>>
>> This leaves the *h_nr_running to 0 for non-task entities. IIUC this makes
>> sense for ->sched_delayed entities (they should be empty of tasks), not so
>> sure for the other case. However, this only ends up being used for non-task
>> entities in:
>> - pick_next_entity(), if se->sched_delayed
>> - unregister_fair_sched_group()
>>
>> IIRC unregister_fair_sched_group() can only happen after the group has been
>> drained, so it would then indeed be empty of tasks, but I reckon this could
>> do with a comment/assert in dequeue_entities(), no? Or did I get too
>> confused by cgroups again?
>>
>
> Yeah, so I did have me a patch that made all this work for cfs bandwidth
> control as well. And then we need all this for throttled cgroup entries
> as well.
>
> Anyway... I had the patch, it worked, but then I remembered you were
> going to rewrite all that anyway and I was making a terrible mess of
> things, so I made it go away again.

Heh, sounds like someone needs to get back to it then :-)
[tip: sched/core] sched/fair: Re-organize dequeue_task_fair()
Posted by tip-bot2 for Peter Zijlstra 1 year, 3 months ago
The following commit has been merged into the sched/core branch of tip:

Commit-ID:     fab4a808ba9fb59b691d7096eed9b1494812ffd6
Gitweb:        https://git.kernel.org/tip/fab4a808ba9fb59b691d7096eed9b1494812ffd6
Author:        Peter Zijlstra <peterz@infradead.org>
AuthorDate:    Wed, 03 Apr 2024 09:50:41 +02:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Sat, 17 Aug 2024 11:06:41 +02:00

sched/fair: Re-organize dequeue_task_fair()

Working towards delaying dequeue, notably also inside the hierarchy,
rework dequeue_task_fair() such that it can 'resume' an interrupted
hierarchy walk.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Valentin Schneider <vschneid@redhat.com>
Tested-by: Valentin Schneider <vschneid@redhat.com>
Link: https://lkml.kernel.org/r/20240727105028.977256873@infradead.org
---
 kernel/sched/fair.c | 62 +++++++++++++++++++++++++++++---------------
 1 file changed, 41 insertions(+), 21 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 03f76b3..59b00d7 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6861,34 +6861,43 @@ enqueue_throttle:
 static void set_next_buddy(struct sched_entity *se);
 
 /*
- * The dequeue_task method is called before nr_running is
- * decreased. We remove the task from the rbtree and
- * update the fair scheduling stats:
+ * Basically dequeue_task_fair(), except it can deal with dequeue_entity()
+ * failing half-way through and resume the dequeue later.
+ *
+ * Returns:
+ * -1 - dequeue delayed
+ *  0 - dequeue throttled
+ *  1 - dequeue complete
  */
-static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
+static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
 {
-	struct cfs_rq *cfs_rq;
-	struct sched_entity *se = &p->se;
-	int task_sleep = flags & DEQUEUE_SLEEP;
-	int idle_h_nr_running = task_has_idle_policy(p);
 	bool was_sched_idle = sched_idle_rq(rq);
 	int rq_h_nr_running = rq->cfs.h_nr_running;
+	bool task_sleep = flags & DEQUEUE_SLEEP;
+	struct task_struct *p = NULL;
+	int idle_h_nr_running = 0;
+	int h_nr_running = 0;
+	struct cfs_rq *cfs_rq;
 
-	util_est_dequeue(&rq->cfs, p);
+	if (entity_is_task(se)) {
+		p = task_of(se);
+		h_nr_running = 1;
+		idle_h_nr_running = task_has_idle_policy(p);
+	}
 
 	for_each_sched_entity(se) {
 		cfs_rq = cfs_rq_of(se);
 		dequeue_entity(cfs_rq, se, flags);
 
-		cfs_rq->h_nr_running--;
+		cfs_rq->h_nr_running -= h_nr_running;
 		cfs_rq->idle_h_nr_running -= idle_h_nr_running;
 
 		if (cfs_rq_is_idle(cfs_rq))
-			idle_h_nr_running = 1;
+			idle_h_nr_running = h_nr_running;
 
 		/* end evaluation on encountering a throttled cfs_rq */
 		if (cfs_rq_throttled(cfs_rq))
-			goto dequeue_throttle;
+			return 0;
 
 		/* Don't dequeue parent if it has other entities besides us */
 		if (cfs_rq->load.weight) {
@@ -6912,20 +6921,18 @@ static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		se_update_runnable(se);
 		update_cfs_group(se);
 
-		cfs_rq->h_nr_running--;
+		cfs_rq->h_nr_running -= h_nr_running;
 		cfs_rq->idle_h_nr_running -= idle_h_nr_running;
 
 		if (cfs_rq_is_idle(cfs_rq))
-			idle_h_nr_running = 1;
+			idle_h_nr_running = h_nr_running;
 
 		/* end evaluation on encountering a throttled cfs_rq */
 		if (cfs_rq_throttled(cfs_rq))
-			goto dequeue_throttle;
-
+			return 0;
 	}
 
-	/* At this point se is NULL and we are at root level*/
-	sub_nr_running(rq, 1);
+	sub_nr_running(rq, h_nr_running);
 
 	if (rq_h_nr_running && !rq->cfs.h_nr_running)
 		dl_server_stop(&rq->fair_server);
@@ -6934,10 +6941,23 @@ static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
 		rq->next_balance = jiffies;
 
-dequeue_throttle:
-	util_est_update(&rq->cfs, p, task_sleep);
-	hrtick_update(rq);
+	return 1;
+}
+
+/*
+ * The dequeue_task method is called before nr_running is
+ * decreased. We remove the task from the rbtree and
+ * update the fair scheduling stats:
+ */
+static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
+{
+	util_est_dequeue(&rq->cfs, p);
+
+	if (dequeue_entities(rq, &p->se, flags) < 0)
+		return false;
 
+	util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP);
+	hrtick_update(rq);
 	return true;
 }