Working towards delaying dequeue, notably also inside the hierarchy,
rework dequeue_task_fair() such that it can 'resume' an interrupted
hierarchy walk.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
kernel/sched/fair.c | 61 ++++++++++++++++++++++++++++++++++------------------
1 file changed, 40 insertions(+), 21 deletions(-)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6861,34 +6861,43 @@ enqueue_task_fair(struct rq *rq, struct
static void set_next_buddy(struct sched_entity *se);
/*
- * The dequeue_task method is called before nr_running is
- * decreased. We remove the task from the rbtree and
- * update the fair scheduling stats:
+ * Basically dequeue_task_fair(), except it can deal with dequeue_entity()
+ * failing half-way through and resume the dequeue later.
+ *
+ * Returns:
+ * -1 - dequeue delayed
+ * 0 - dequeue throttled
+ * 1 - dequeue complete
*/
-static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
+static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
{
- struct cfs_rq *cfs_rq;
- struct sched_entity *se = &p->se;
- int task_sleep = flags & DEQUEUE_SLEEP;
- int idle_h_nr_running = task_has_idle_policy(p);
bool was_sched_idle = sched_idle_rq(rq);
int rq_h_nr_running = rq->cfs.h_nr_running;
+ bool task_sleep = flags & DEQUEUE_SLEEP;
+ struct task_struct *p = NULL;
+ int idle_h_nr_running = 0;
+ int h_nr_running = 0;
+ struct cfs_rq *cfs_rq;
- util_est_dequeue(&rq->cfs, p);
+ if (entity_is_task(se)) {
+ p = task_of(se);
+ h_nr_running = 1;
+ idle_h_nr_running = task_has_idle_policy(p);
+ }
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
dequeue_entity(cfs_rq, se, flags);
- cfs_rq->h_nr_running--;
+ cfs_rq->h_nr_running -= h_nr_running;
cfs_rq->idle_h_nr_running -= idle_h_nr_running;
if (cfs_rq_is_idle(cfs_rq))
- idle_h_nr_running = 1;
+ idle_h_nr_running = h_nr_running;
/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq))
- goto dequeue_throttle;
+ return 0;
/* Don't dequeue parent if it has other entities besides us */
if (cfs_rq->load.weight) {
@@ -6912,20 +6921,18 @@ static bool dequeue_task_fair(struct rq
se_update_runnable(se);
update_cfs_group(se);
- cfs_rq->h_nr_running--;
+ cfs_rq->h_nr_running -= h_nr_running;
cfs_rq->idle_h_nr_running -= idle_h_nr_running;
if (cfs_rq_is_idle(cfs_rq))
- idle_h_nr_running = 1;
+ idle_h_nr_running = h_nr_running;
/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq))
- goto dequeue_throttle;
-
+ return 0;
}
- /* At this point se is NULL and we are at root level*/
- sub_nr_running(rq, 1);
+ sub_nr_running(rq, h_nr_running);
if (rq_h_nr_running && !rq->cfs.h_nr_running)
dl_server_stop(&rq->fair_server);
@@ -6934,10 +6941,22 @@ static bool dequeue_task_fair(struct rq
if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
rq->next_balance = jiffies;
-dequeue_throttle:
- util_est_update(&rq->cfs, p, task_sleep);
- hrtick_update(rq);
+ return 1;
+}
+/*
+ * The dequeue_task method is called before nr_running is
+ * decreased. We remove the task from the rbtree and
+ * update the fair scheduling stats:
+ */
+static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
+{
+ util_est_dequeue(&rq->cfs, p);
+ if (dequeue_entities(rq, &p->se, flags) < 0)
+ return false;
+
+ util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP);
+ hrtick_update(rq);
return true;
}
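To make the new calling convention concrete, here is an illustrative sketch (not part of the patch) of how a caller is expected to consume the three return values documented above. Only the 'dequeue delayed' case makes the dequeue visibly fail, and because dequeue_entities() takes a sched_entity rather than a task_struct, a later call can restart the hierarchy walk from whatever entity a delayed dequeue stopped at. Identifiers follow the hunks above; the _sketch suffix only marks this as illustration:

	/* Illustrative only: caller-side view of the dequeue_entities() contract. */
	static bool dequeue_task_fair_sketch(struct rq *rq, struct task_struct *p, int flags)
	{
		util_est_dequeue(&rq->cfs, p);

		switch (dequeue_entities(rq, &p->se, flags)) {
		case -1:
			/* dequeue delayed: the task stays queued for now */
			return false;
		case 0:		/* walk stopped at a throttled cfs_rq */
		case 1:		/* full hierarchy walk completed */
			break;
		}

		util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP);
		hrtick_update(rq);
		return true;
	}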
On 27/07/24 12:27, Peter Zijlstra wrote:
> Working towards delaying dequeue, notably also inside the hierarchy,
> rework dequeue_task_fair() such that it can 'resume' an interrupted
> hierarchy walk.
>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
> kernel/sched/fair.c | 61 ++++++++++++++++++++++++++++++++++------------------
> 1 file changed, 40 insertions(+), 21 deletions(-)
>
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -6861,34 +6861,43 @@ enqueue_task_fair(struct rq *rq, struct
> static void set_next_buddy(struct sched_entity *se);
>
> /*
> - * The dequeue_task method is called before nr_running is
> - * decreased. We remove the task from the rbtree and
> - * update the fair scheduling stats:
> + * Basically dequeue_task_fair(), except it can deal with dequeue_entity()
> + * failing half-way through and resume the dequeue later.
> + *
> + * Returns:
> + * -1 - dequeue delayed
> + * 0 - dequeue throttled
> + * 1 - dequeue complete
> */
> -static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> +static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
> {
> - struct cfs_rq *cfs_rq;
> - struct sched_entity *se = &p->se;
> - int task_sleep = flags & DEQUEUE_SLEEP;
> - int idle_h_nr_running = task_has_idle_policy(p);
> bool was_sched_idle = sched_idle_rq(rq);
> int rq_h_nr_running = rq->cfs.h_nr_running;
> + bool task_sleep = flags & DEQUEUE_SLEEP;
> + struct task_struct *p = NULL;
> + int idle_h_nr_running = 0;
> + int h_nr_running = 0;
> + struct cfs_rq *cfs_rq;
>
> - util_est_dequeue(&rq->cfs, p);
> + if (entity_is_task(se)) {
> + p = task_of(se);
> + h_nr_running = 1;
> + idle_h_nr_running = task_has_idle_policy(p);
> + }
>
This leaves *h_nr_running at 0 for non-task entities. IIUC this makes
sense for ->sched_delayed entities (they should be empty of tasks), not so
sure for the other case. However, this only ends up being used for non-task
entities in:
- pick_next_entity(), if se->sched_delayed
- unregister_fair_sched_group()
IIRC unregister_fair_sched_group() can only happen after the group has been
drained, so it would then indeed be empty of tasks, but I reckon this could
do with a comment/assert in dequeue_entities(), no? Or did I get too
confused by cgroups again?
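For the record, a minimal sketch of the kind of comment/assert being suggested here, assuming the SCHED_WARN_ON() helper used elsewhere in fair.c and the ->sched_delayed flag introduced later in this series; this is not part of the posted patch:

	if (entity_is_task(se)) {
		p = task_of(se);
		h_nr_running = 1;
		idle_h_nr_running = task_has_idle_policy(p);
	} else {
		/*
		 * Group entities should only get here once their group is
		 * empty of tasks (->sched_delayed dequeue, or teardown via
		 * unregister_fair_sched_group()), so contributing 0 to the
		 * h_nr_running counts is intentional.
		 */
		SCHED_WARN_ON(group_cfs_rq(se)->h_nr_running);
	}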
On Fri, Aug 09, 2024 at 06:53:30PM +0200, Valentin Schneider wrote:
> On 27/07/24 12:27, Peter Zijlstra wrote:
> > Working towards delaying dequeue, notably also inside the hierarchy,
> > rework dequeue_task_fair() such that it can 'resume' an interrupted
> > hierarchy walk.
> >
> > Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> > ---
> > kernel/sched/fair.c | 61 ++++++++++++++++++++++++++++++++++------------------
> > 1 file changed, 40 insertions(+), 21 deletions(-)
> >
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -6861,34 +6861,43 @@ enqueue_task_fair(struct rq *rq, struct
> > static void set_next_buddy(struct sched_entity *se);
> >
> > /*
> > - * The dequeue_task method is called before nr_running is
> > - * decreased. We remove the task from the rbtree and
> > - * update the fair scheduling stats:
> > + * Basically dequeue_task_fair(), except it can deal with dequeue_entity()
> > + * failing half-way through and resume the dequeue later.
> > + *
> > + * Returns:
> > + * -1 - dequeue delayed
> > + * 0 - dequeue throttled
> > + * 1 - dequeue complete
> > */
> > -static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> > +static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
> > {
> > - struct cfs_rq *cfs_rq;
> > - struct sched_entity *se = &p->se;
> > - int task_sleep = flags & DEQUEUE_SLEEP;
> > - int idle_h_nr_running = task_has_idle_policy(p);
> > bool was_sched_idle = sched_idle_rq(rq);
> > int rq_h_nr_running = rq->cfs.h_nr_running;
> > + bool task_sleep = flags & DEQUEUE_SLEEP;
> > + struct task_struct *p = NULL;
> > + int idle_h_nr_running = 0;
> > + int h_nr_running = 0;
> > + struct cfs_rq *cfs_rq;
> >
> > - util_est_dequeue(&rq->cfs, p);
> > + if (entity_is_task(se)) {
> > + p = task_of(se);
> > + h_nr_running = 1;
> > + idle_h_nr_running = task_has_idle_policy(p);
> > + }
> >
>
> This leaves *h_nr_running at 0 for non-task entities. IIUC this makes
> sense for ->sched_delayed entities (they should be empty of tasks), not so
> sure for the other case. However, this only ends up being used for non-task
> entities in:
> - pick_next_entity(), if se->sched_delayed
> - unregister_fair_sched_group()
>
> IIRC unregister_fair_sched_group() can only happen after the group has been
> drained, so it would then indeed be empty of tasks, but I reckon this could
> do with a comment/assert in dequeue_entities(), no? Or did I get too
> confused by cgroups again?
>
Yeah, so I did have me a patch that made all this work for cfs bandwidth
control as well. And then we need all this for throttled cgroup entries
as well.
Anyway... I had the patch, it worked, but then I remembered you were
going to rewrite all that anyway and I was making a terrible mess of
things, so I made it go away again.
On 11/08/24 00:17, Peter Zijlstra wrote:
> On Fri, Aug 09, 2024 at 06:53:30PM +0200, Valentin Schneider wrote:
>> On 27/07/24 12:27, Peter Zijlstra wrote:
>> > -static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>> > +static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
>> > {
>> > - struct cfs_rq *cfs_rq;
>> > - struct sched_entity *se = &p->se;
>> > - int task_sleep = flags & DEQUEUE_SLEEP;
>> > - int idle_h_nr_running = task_has_idle_policy(p);
>> > bool was_sched_idle = sched_idle_rq(rq);
>> > int rq_h_nr_running = rq->cfs.h_nr_running;
>> > + bool task_sleep = flags & DEQUEUE_SLEEP;
>> > + struct task_struct *p = NULL;
>> > + int idle_h_nr_running = 0;
>> > + int h_nr_running = 0;
>> > + struct cfs_rq *cfs_rq;
>> >
>> > - util_est_dequeue(&rq->cfs, p);
>> > + if (entity_is_task(se)) {
>> > + p = task_of(se);
>> > + h_nr_running = 1;
>> > + idle_h_nr_running = task_has_idle_policy(p);
>> > + }
>> >
>>
>> This leaves *h_nr_running at 0 for non-task entities. IIUC this makes
>> sense for ->sched_delayed entities (they should be empty of tasks), not so
>> sure for the other case. However, this only ends up being used for non-task
>> entities in:
>> - pick_next_entity(), if se->sched_delayed
>> - unregister_fair_sched_group()
>>
>> IIRC unregister_fair_sched_group() can only happen after the group has been
>> drained, so it would then indeed be empty of tasks, but I reckon this could
>> do with a comment/assert in dequeue_entities(), no? Or did I get too
>> confused by cgroups again?
>>
>
> Yeah, so I did have me a patch that made all this work for cfs bandwidth
> control as well. And then we need all this for throttled cgroup entries
> as well.
>
> Anyway... I had the patch, it worked, but then I remembered you were
> going to rewrite all that anyway and I was making a terrible mess of
> things, so I made it go away again.
Heh, sounds like someone needs to get back to it then :-)
The following commit has been merged into the sched/core branch of tip:
Commit-ID: fab4a808ba9fb59b691d7096eed9b1494812ffd6
Gitweb: https://git.kernel.org/tip/fab4a808ba9fb59b691d7096eed9b1494812ffd6
Author: Peter Zijlstra <peterz@infradead.org>
AuthorDate: Wed, 03 Apr 2024 09:50:41 +02:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Sat, 17 Aug 2024 11:06:41 +02:00
sched/fair: Re-organize dequeue_task_fair()
Working towards delaying dequeue, notably also inside the hierarchy,
rework dequeue_task_fair() such that it can 'resume' an interrupted
hierarchy walk.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Valentin Schneider <vschneid@redhat.com>
Tested-by: Valentin Schneider <vschneid@redhat.com>
Link: https://lkml.kernel.org/r/20240727105028.977256873@infradead.org
---
kernel/sched/fair.c | 62 +++++++++++++++++++++++++++++---------------
1 file changed, 41 insertions(+), 21 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 03f76b3..59b00d7 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6861,34 +6861,43 @@ enqueue_throttle:
static void set_next_buddy(struct sched_entity *se);
/*
- * The dequeue_task method is called before nr_running is
- * decreased. We remove the task from the rbtree and
- * update the fair scheduling stats:
+ * Basically dequeue_task_fair(), except it can deal with dequeue_entity()
+ * failing half-way through and resume the dequeue later.
+ *
+ * Returns:
+ * -1 - dequeue delayed
+ * 0 - dequeue throttled
+ * 1 - dequeue complete
*/
-static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
+static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
{
- struct cfs_rq *cfs_rq;
- struct sched_entity *se = &p->se;
- int task_sleep = flags & DEQUEUE_SLEEP;
- int idle_h_nr_running = task_has_idle_policy(p);
bool was_sched_idle = sched_idle_rq(rq);
int rq_h_nr_running = rq->cfs.h_nr_running;
+ bool task_sleep = flags & DEQUEUE_SLEEP;
+ struct task_struct *p = NULL;
+ int idle_h_nr_running = 0;
+ int h_nr_running = 0;
+ struct cfs_rq *cfs_rq;
- util_est_dequeue(&rq->cfs, p);
+ if (entity_is_task(se)) {
+ p = task_of(se);
+ h_nr_running = 1;
+ idle_h_nr_running = task_has_idle_policy(p);
+ }
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
dequeue_entity(cfs_rq, se, flags);
- cfs_rq->h_nr_running--;
+ cfs_rq->h_nr_running -= h_nr_running;
cfs_rq->idle_h_nr_running -= idle_h_nr_running;
if (cfs_rq_is_idle(cfs_rq))
- idle_h_nr_running = 1;
+ idle_h_nr_running = h_nr_running;
/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq))
- goto dequeue_throttle;
+ return 0;
/* Don't dequeue parent if it has other entities besides us */
if (cfs_rq->load.weight) {
@@ -6912,20 +6921,18 @@ static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
se_update_runnable(se);
update_cfs_group(se);
- cfs_rq->h_nr_running--;
+ cfs_rq->h_nr_running -= h_nr_running;
cfs_rq->idle_h_nr_running -= idle_h_nr_running;
if (cfs_rq_is_idle(cfs_rq))
- idle_h_nr_running = 1;
+ idle_h_nr_running = h_nr_running;
/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq))
- goto dequeue_throttle;
-
+ return 0;
}
- /* At this point se is NULL and we are at root level*/
- sub_nr_running(rq, 1);
+ sub_nr_running(rq, h_nr_running);
if (rq_h_nr_running && !rq->cfs.h_nr_running)
dl_server_stop(&rq->fair_server);
@@ -6934,10 +6941,23 @@ static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
rq->next_balance = jiffies;
-dequeue_throttle:
- util_est_update(&rq->cfs, p, task_sleep);
- hrtick_update(rq);
+ return 1;
+}
+
+/*
+ * The dequeue_task method is called before nr_running is
+ * decreased. We remove the task from the rbtree and
+ * update the fair scheduling stats:
+ */
+static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
+{
+ util_est_dequeue(&rq->cfs, p);
+
+ if (dequeue_entities(rq, &p->se, flags) < 0)
+ return false;
+ util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP);
+ hrtick_update(rq);
return true;
}