With the delayed dequeue feature, a sleeping sched_entity remains queued
in the rq until its lag has elapsed. As a result, it also stays visible
in the statistics that are used to balance the system, in particular in
the field cfs.h_nr_queued when the sched_entity is associated with a
task.

Create a new field, h_nr_runnable, that tracks only the tasks that are
queued and runnable, and use it when balancing load on the system.

h_nr_runnable will be used in several places to make load-balancing
decisions:
- PELT runnable_avg
- deciding if a group is overloaded or has spare capacity
- numa stats
- reduced capacity management
- load balance
- nohz kick

Note that rq->nr_running still counts the delayed dequeued tasks,
because delayed dequeue is a fair-class feature that is meaningless at
the core level.
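
A minimal sketch of the invariant the new counter is meant to keep,
using an illustrative stand-in struct rather than the real kernel
cfs_rq (the struct and helper names below exist only for the example):

  /*
   * Illustrative only: on every cfs_rq in the hierarchy the new
   * counter is expected to satisfy
   *
   *     h_nr_runnable == h_nr_queued - h_nr_delayed
   *
   * i.e. delayed-dequeue tasks stay queued but are no longer
   * counted as runnable when balancing load.
   */
  struct cfs_rq_counts {
          unsigned int h_nr_queued;       /* all queued tasks, incl. delayed */
          unsigned int h_nr_delayed;      /* tasks waiting out their lag */
          unsigned int h_nr_runnable;     /* queued and actually runnable */
  };

  static inline int cfs_rq_counts_consistent(const struct cfs_rq_counts *c)
  {
          return c->h_nr_runnable == c->h_nr_queued - c->h_nr_delayed;
  }

This mirrors the pelt.c and se_update_runnable() changes below, which
replace "h_nr_queued - h_nr_delayed" with the new h_nr_runnable field.
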
Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
---
kernel/sched/debug.c | 1 +
kernel/sched/fair.c | 45 ++++++++++++++++++++++++++++++--------------
kernel/sched/pelt.c | 4 ++--
kernel/sched/sched.h | 10 ++++------
4 files changed, 38 insertions(+), 22 deletions(-)
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 08d6c2b7caa3..fd711cc4d44c 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -844,6 +844,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
spread = right_vruntime - left_vruntime;
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", SPLIT_NS(spread));
SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
+ SEQ_printf(m, " .%-30s: %d\n", "h_nr_runnable", cfs_rq->h_nr_runnable);
SEQ_printf(m, " .%-30s: %d\n", "h_nr_queued", cfs_rq->h_nr_queued);
SEQ_printf(m, " .%-30s: %d\n", "h_nr_delayed", cfs_rq->h_nr_delayed);
SEQ_printf(m, " .%-30s: %d\n", "idle_nr_running",
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c3cc9f784afe..d5736bde3682 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2128,7 +2128,7 @@ static void update_numa_stats(struct task_numa_env *env,
ns->load += cpu_load(rq);
ns->runnable += cpu_runnable(rq);
ns->util += cpu_util_cfs(cpu);
- ns->nr_running += rq->cfs.h_nr_queued;
+ ns->nr_running += rq->cfs.h_nr_runnable;
ns->compute_capacity += capacity_of(cpu);
if (find_idle && idle_core < 0 && !rq->nr_running && idle_cpu(cpu)) {
@@ -5396,7 +5396,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* When enqueuing a sched_entity, we must:
* - Update loads to have both entity and cfs_rq synced with now.
* - For group_entity, update its runnable_weight to reflect the new
- * h_nr_queued of its group cfs_rq.
+ * h_nr_runnable of its group cfs_rq.
* - For group_entity, update its weight to reflect the new share of
* its group cfs_rq
* - Add its new weight to cfs_rq->load.weight
@@ -5471,18 +5471,21 @@ static void set_delayed(struct sched_entity *se)
for_each_sched_entity(se) {
struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ cfs_rq->h_nr_runnable--;
cfs_rq->h_nr_delayed++;
if (cfs_rq_throttled(cfs_rq))
break;
}
}
-static void clear_delayed(struct sched_entity *se)
+static void clear_delayed(struct sched_entity *se, bool running)
{
se->sched_delayed = 0;
for_each_sched_entity(se) {
struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ if (running)
+ cfs_rq->h_nr_runnable++;
cfs_rq->h_nr_delayed--;
if (cfs_rq_throttled(cfs_rq))
break;
@@ -5491,7 +5494,7 @@ static void clear_delayed(struct sched_entity *se)
static inline void finish_delayed_dequeue_entity(struct sched_entity *se)
{
- clear_delayed(se);
+ clear_delayed(se, false);
if (sched_feat(DELAY_ZERO) && se->vlag > 0)
se->vlag = 0;
}
@@ -5534,7 +5537,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* When dequeuing a sched_entity, we must:
* - Update loads to have both entity and cfs_rq synced with now.
* - For group_entity, update its runnable_weight to reflect the new
- * h_nr_queued of its group cfs_rq.
+ * h_nr_runnable of its group cfs_rq.
* - Subtract its previous weight from cfs_rq->load.weight.
* - For group entity, update its weight to reflect the new share
* of its group cfs_rq.
@@ -5934,7 +5937,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
struct sched_entity *se;
- long queued_delta, idle_task_delta, delayed_delta, dequeue = 1;
+ long queued_delta, runnable_delta, idle_task_delta, delayed_delta, dequeue = 1;
long rq_h_nr_queued = rq->cfs.h_nr_queued;
raw_spin_lock(&cfs_b->lock);
@@ -5966,6 +5969,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
rcu_read_unlock();
queued_delta = cfs_rq->h_nr_queued;
+ runnable_delta = cfs_rq->h_nr_runnable;
idle_task_delta = cfs_rq->idle_h_nr_running;
delayed_delta = cfs_rq->h_nr_delayed;
for_each_sched_entity(se) {
@@ -5990,6 +5994,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
idle_task_delta = cfs_rq->h_nr_queued;
qcfs_rq->h_nr_queued -= queued_delta;
+ qcfs_rq->h_nr_runnable -= runnable_delta;
qcfs_rq->idle_h_nr_running -= idle_task_delta;
qcfs_rq->h_nr_delayed -= delayed_delta;
@@ -6013,6 +6018,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
idle_task_delta = cfs_rq->h_nr_queued;
qcfs_rq->h_nr_queued -= queued_delta;
+ qcfs_rq->h_nr_runnable -= runnable_delta;
qcfs_rq->idle_h_nr_running -= idle_task_delta;
qcfs_rq->h_nr_delayed -= delayed_delta;
}
@@ -6040,7 +6046,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
struct sched_entity *se;
- long queued_delta, idle_task_delta, delayed_delta;
+ long queued_delta, runnable_delta, idle_task_delta, delayed_delta;
long rq_h_nr_queued = rq->cfs.h_nr_queued;
se = cfs_rq->tg->se[cpu_of(rq)];
@@ -6075,6 +6081,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
}
queued_delta = cfs_rq->h_nr_queued;
+ runnable_delta = cfs_rq->h_nr_runnable;
idle_task_delta = cfs_rq->idle_h_nr_running;
delayed_delta = cfs_rq->h_nr_delayed;
for_each_sched_entity(se) {
@@ -6093,6 +6100,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
idle_task_delta = cfs_rq->h_nr_queued;
qcfs_rq->h_nr_queued += queued_delta;
+ qcfs_rq->h_nr_runnable += runnable_delta;
qcfs_rq->idle_h_nr_running += idle_task_delta;
qcfs_rq->h_nr_delayed += delayed_delta;
@@ -6111,6 +6119,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
idle_task_delta = cfs_rq->h_nr_queued;
qcfs_rq->h_nr_queued += queued_delta;
+ qcfs_rq->h_nr_runnable += runnable_delta;
qcfs_rq->idle_h_nr_running += idle_task_delta;
qcfs_rq->h_nr_delayed += delayed_delta;
@@ -6966,7 +6975,7 @@ requeue_delayed_entity(struct sched_entity *se)
}
update_load_avg(cfs_rq, se, 0);
- clear_delayed(se);
+ clear_delayed(se, true);
}
/*
@@ -7030,6 +7039,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
enqueue_entity(cfs_rq, se, flags);
slice = cfs_rq_min_slice(cfs_rq);
+ if (!h_nr_delayed)
+ cfs_rq->h_nr_runnable++;
cfs_rq->h_nr_queued++;
cfs_rq->idle_h_nr_running += idle_h_nr_running;
cfs_rq->h_nr_delayed += h_nr_delayed;
@@ -7054,6 +7065,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
se->slice = slice;
slice = cfs_rq_min_slice(cfs_rq);
+ if (!h_nr_delayed)
+ cfs_rq->h_nr_runnable++;
cfs_rq->h_nr_queued++;
cfs_rq->idle_h_nr_running += idle_h_nr_running;
cfs_rq->h_nr_delayed += h_nr_delayed;
@@ -7144,6 +7157,8 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
break;
}
+ if (!h_nr_delayed)
+ cfs_rq->h_nr_runnable -= h_nr_queued;
cfs_rq->h_nr_queued -= h_nr_queued;
cfs_rq->idle_h_nr_running -= idle_h_nr_running;
cfs_rq->h_nr_delayed -= h_nr_delayed;
@@ -7183,6 +7198,8 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
se->slice = slice;
slice = cfs_rq_min_slice(cfs_rq);
+ if (!h_nr_delayed)
+ cfs_rq->h_nr_runnable -= h_nr_queued;
cfs_rq->h_nr_queued -= h_nr_queued;
cfs_rq->idle_h_nr_running -= idle_h_nr_running;
cfs_rq->h_nr_delayed -= h_nr_delayed;
@@ -10319,7 +10336,7 @@ sched_reduced_capacity(struct rq *rq, struct sched_domain *sd)
* When there is more than 1 task, the group_overloaded case already
* takes care of cpu with reduced capacity
*/
- if (rq->cfs.h_nr_queued != 1)
+ if (rq->cfs.h_nr_runnable != 1)
return false;
return check_cpu_capacity(rq, sd);
@@ -10354,7 +10371,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->group_load += load;
sgs->group_util += cpu_util_cfs(i);
sgs->group_runnable += cpu_runnable(rq);
- sgs->sum_h_nr_running += rq->cfs.h_nr_queued;
+ sgs->sum_h_nr_running += rq->cfs.h_nr_runnable;
nr_running = rq->nr_running;
sgs->sum_nr_running += nr_running;
@@ -10669,7 +10686,7 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd,
sgs->group_util += cpu_util_without(i, p);
sgs->group_runnable += cpu_runnable_without(rq, p);
local = task_running_on_cpu(i, p);
- sgs->sum_h_nr_running += rq->cfs.h_nr_queued - local;
+ sgs->sum_h_nr_running += rq->cfs.h_nr_runnable - local;
nr_running = rq->nr_running - local;
sgs->sum_nr_running += nr_running;
@@ -11451,7 +11468,7 @@ static struct rq *sched_balance_find_src_rq(struct lb_env *env,
if (rt > env->fbq_type)
continue;
- nr_running = rq->cfs.h_nr_queued;
+ nr_running = rq->cfs.h_nr_runnable;
if (!nr_running)
continue;
@@ -11610,7 +11627,7 @@ static int need_active_balance(struct lb_env *env)
* available on dst_cpu.
*/
if (env->idle &&
- (env->src_rq->cfs.h_nr_queued == 1)) {
+ (env->src_rq->cfs.h_nr_runnable == 1)) {
if ((check_cpu_capacity(env->src_rq, sd)) &&
(capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100))
return 1;
@@ -12353,7 +12370,7 @@ static void nohz_balancer_kick(struct rq *rq)
* If there's a runnable CFS task and the current CPU has reduced
* capacity, kick the ILB to see if there's a better CPU to run on:
*/
- if (rq->cfs.h_nr_queued >= 1 && check_cpu_capacity(rq, sd)) {
+ if (rq->cfs.h_nr_runnable >= 1 && check_cpu_capacity(rq, sd)) {
flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
goto unlock;
}
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index 2bad0b508dfc..7a8534a2deff 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -275,7 +275,7 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load)
*
* group: [ see update_cfs_group() ]
* se_weight() = tg->weight * grq->load_avg / tg->load_avg
- * se_runnable() = grq->h_nr_queued
+ * se_runnable() = grq->h_nr_runnable
*
* runnable_sum = se_runnable() * runnable = grq->runnable_sum
* runnable_avg = runnable_sum
@@ -321,7 +321,7 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq)
{
if (___update_load_sum(now, &cfs_rq->avg,
scale_load_down(cfs_rq->load.weight),
- cfs_rq->h_nr_queued - cfs_rq->h_nr_delayed,
+ cfs_rq->h_nr_runnable,
cfs_rq->curr != NULL)) {
___update_load_avg(&cfs_rq->avg, 1);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 4677e5412c40..e0b05ab43abd 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -646,7 +646,8 @@ struct balance_callback {
struct cfs_rq {
struct load_weight load;
unsigned int nr_running;
- unsigned int h_nr_queued; /* SCHED_{NORMAL,BATCH,IDLE} */
+ unsigned int h_nr_runnable; /* SCHED_{NORMAL,BATCH,IDLE} */
+ unsigned int h_nr_queued;
unsigned int idle_nr_running; /* SCHED_IDLE */
unsigned int idle_h_nr_running; /* SCHED_IDLE */
unsigned int h_nr_delayed;
@@ -898,11 +899,8 @@ struct dl_rq {
static inline void se_update_runnable(struct sched_entity *se)
{
- if (!entity_is_task(se)) {
- struct cfs_rq *cfs_rq = se->my_q;
-
- se->runnable_weight = cfs_rq->h_nr_queued - cfs_rq->h_nr_delayed;
- }
+ if (!entity_is_task(se))
+ se->runnable_weight = se->my_q->h_nr_runnable;
}
static inline long se_runnable(struct sched_entity *se)
--
2.43.0

On Fri, Nov 29, 2024 at 05:17:49PM +0100, Vincent Guittot wrote:
> [...]

Since you're doing a v3, could you please split this into 2 patches, one
adding the accounting, and then a separate patch making use of it?

On Mon, 2 Dec 2024 at 10:54, Peter Zijlstra <peterz@infradead.org> wrote:
> On Fri, Nov 29, 2024 at 05:17:49PM +0100, Vincent Guittot wrote:
> > [...]
>
> Since you're doing a v3, could you please split this into 2 patches, one
> adding the accounting, and then a separate patch making use of it?

ok