[PATCH] sched/fair: Force idle aware load balancing

Posted by Fernand Sieber 4 days ago
Consider force idle wasted capacity when computing whether a group has
spare capacity or is overloaded. We use a rather crude mechanism based on
the current force idle state of the rq. It may be preferable to use a
decaying average, similar to other load metrics, to avoid jitter.

If the busiest group has force idle, make it a task migration. This way we
will try to move one task regardless of the load. There are still
subsequent checks later on to verify that this doesn't cause more force
idle on the destination.

===

Testing

Testing is aimed at measuring perceived guest noise on a hypervisor system
with time-shared scenarios.

The setup is a system where the load is nearing 100%, which should allow
no steal time. The system has 64 CPUs and runs 8 VMs, each VM using core
scheduling with 8 time-shared vCPUs.

7 VMs are running stressors (`stress-ng --cpu 0`) while the last VM is
running the hwlat tracer with a width of 100ms, a period of 300ms, and
a threshold of 100us. Each VM runs a cookied non-vCPU VMM process that
adds a light level of noise, which forces some load balancing.

The test scenario is run 10x60s and the average noise is measured.

At baseline, we measure about 1.20% of noise (computed from hwlat
breaches). With the proposed patch, the noise drops to 0.63%.

Signed-off-by: Fernand Sieber <sieberf@amazon.com>
---
 kernel/sched/fair.c  | 40 +++++++++++++++++++++++++++-------------
 kernel/sched/sched.h | 12 ++++++++++++
 2 files changed, 39 insertions(+), 13 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5b752324270b..ab8c9aa09107 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9932,6 +9932,7 @@ struct sg_lb_stats {
 	unsigned int nr_numa_running;
 	unsigned int nr_preferred_running;
 #endif
+	unsigned int forceidle_weight;
 };
 
 /*
@@ -10135,15 +10136,15 @@ static inline int sg_imbalanced(struct sched_group *group)
 static inline bool
 group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
 {
-	if (sgs->sum_nr_running < sgs->group_weight)
+	if (sgs->sum_nr_running < (sgs->group_weight - sgs->forceidle_weight))
 		return true;
 
-	if ((sgs->group_capacity * imbalance_pct) <
-			(sgs->group_runnable * 100))
+	if ((sgs->group_capacity * imbalance_pct * (sgs->group_weight - sgs->forceidle_weight)) <
+			(sgs->group_runnable * 100 * sgs->group_weight))
 		return false;
 
-	if ((sgs->group_capacity * 100) >
-			(sgs->group_util * imbalance_pct))
+	if ((sgs->group_capacity * 100 * (sgs->group_weight - sgs->forceidle_weight)) >
+			(sgs->group_util * imbalance_pct * sgs->group_weight))
 		return true;
 
 	return false;
@@ -10160,15 +10161,15 @@ group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
 static inline bool
 group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
 {
-	if (sgs->sum_nr_running <= sgs->group_weight)
+	if (sgs->sum_nr_running <= (sgs->group_weight - sgs->forceidle_weight))
 		return false;
 
-	if ((sgs->group_capacity * 100) <
-			(sgs->group_util * imbalance_pct))
+	if ((sgs->group_capacity * 100 * (sgs->group_weight - sgs->forceidle_weight)) <
+			(sgs->group_util * imbalance_pct * sgs->group_weight))
 		return true;
 
-	if ((sgs->group_capacity * imbalance_pct) <
-			(sgs->group_runnable * 100))
+	if ((sgs->group_capacity * imbalance_pct * (sgs->group_weight - sgs->forceidle_weight)) <
+			(sgs->group_runnable * 100 * sgs->group_weight))
 		return true;
 
 	return false;
@@ -10371,13 +10372,19 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 		nr_running = rq->nr_running;
 		sgs->sum_nr_running += nr_running;
 
+		/*
+		 * Ignore force idle if we are balancing within the SMT mask
+		 */
+		if (rq_in_forceidle(rq) && !(env->sd->flags & SD_SHARE_CPUCAPACITY))
+			sgs->forceidle_weight++;
+
 		if (cpu_overutilized(i))
 			*sg_overutilized = 1;
 
 		/*
 		 * No need to call idle_cpu() if nr_running is not 0
 		 */
-		if (!nr_running && idle_cpu(i)) {
+		if (!rq_in_forceidle(rq) && !nr_running && idle_cpu(i)) {
 			sgs->idle_cpus++;
 			/* Idle cpu can't have misfit task */
 			continue;
@@ -10691,10 +10698,16 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd,
 		nr_running = rq->nr_running - local;
 		sgs->sum_nr_running += nr_running;
 
+		/*
+		 * Ignore force idle if we are balancing within the SMT mask
+		 */
+		if (rq_in_forceidle(rq) && !(sd->flags & SD_SHARE_CPUCAPACITY))
+			sgs->forceidle_weight++;
+
 		/*
 		 * No need to call idle_cpu_without() if nr_running is not 0
 		 */
-		if (!nr_running && idle_cpu_without(i, p))
+		if (!rq_in_forceidle(rq) && !nr_running && idle_cpu_without(i, p))
 			sgs->idle_cpus++;
 
 		/* Check if task fits in the CPU */
@@ -11123,7 +11136,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
 		return;
 	}
 
-	if (busiest->group_type == group_smt_balance) {
+	if (busiest->group_type == group_smt_balance ||
+	    busiest->forceidle_weight) {
 		/* Reduce number of tasks sharing CPU capacity */
 		env->migration_type = migrate_task;
 		env->imbalance = 1;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index adfb6e3409d7..fdee101b1a66 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1468,6 +1468,13 @@ static inline bool sched_core_enqueued(struct task_struct *p)
 	return !RB_EMPTY_NODE(&p->core_node);
 }
 
+static inline bool rq_in_forceidle(struct rq *rq)
+{
+	return rq->core->core_forceidle_count > 0 &&
+		rq->nr_running &&
+		rq->curr == rq->idle;
+}
+
 extern void sched_core_enqueue(struct rq *rq, struct task_struct *p);
 extern void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags);
 
@@ -1513,6 +1520,11 @@ static inline bool sched_group_cookie_match(struct rq *rq,
 	return true;
 }
 
+static inline bool rq_in_forceidle(struct rq *rq)
+{
+	return false;
+}
+
 #endif /* !CONFIG_SCHED_CORE */
 
 #ifdef CONFIG_RT_GROUP_SCHED
-- 
2.43.0

Re: [PATCH] sched/fair: Force idle aware load balancing
Posted by Vincent Guittot 3 days, 7 hours ago
On Thu, 27 Nov 2025 at 21:28, Fernand Sieber <sieberf@amazon.com> wrote:
>
> Consider force idle wasted capacity when computing if a group is idle or
> overloaded. We use a rather crude mechanism based on the current force idle
> state of the rq. It may be preferable to use a decaying average, similar
> to other load metrics, to avoid jittering.
>
> If the busiest group has force idle, make it a task migration. This way we
> will try to move one task regardless of the load. There are still
> subsequent checks later on to verify that this doesn't cause more force
> idle on the destination.
>
> ===
>
> Testing
>
> Testing is aimed at measuring perceived guest noise on hypervisor system
> with time shared scenarios.
>
> Setup is on system where the load is nearing 100% which should allow no
> steal time. The system has 64 CPUs, with 8 VMs, each VM using core
> scheduling with 8 vCPUs per VM, time shared.
>
> 7 VMs are running stressors (`stress-ng --cpu 0`) while the last VM is
> running the hwlat tracer with a width of 100ms, a period of 300ms, and
> a threshold of 100us. Each VM runs a cookied non vCPU VMM process that
> adds a light level of noise which forces some level of load balancing.
>
> Signed-off-by: Fernand Sieber <sieberf@amazon.com>
>
> The test scenario is ran 10x60s and the average noise is measured.
>
> At baseline, we measure about 1.20% of noise (computed from hwlat
> breaches). With the proposed patch, the noise drops to 0.63%.
> ---
>  kernel/sched/fair.c  | 40 +++++++++++++++++++++++++++-------------
>  kernel/sched/sched.h | 12 ++++++++++++
>  2 files changed, 39 insertions(+), 13 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 5b752324270b..ab8c9aa09107 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -9932,6 +9932,7 @@ struct sg_lb_stats {
>         unsigned int nr_numa_running;
>         unsigned int nr_preferred_running;
>  #endif
> +       unsigned int forceidle_weight;
>  };
>
>  /*
> @@ -10135,15 +10136,15 @@ static inline int sg_imbalanced(struct sched_group *group)
>  static inline bool
>  group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
>  {
> -       if (sgs->sum_nr_running < sgs->group_weight)
> +       if (sgs->sum_nr_running < (sgs->group_weight - sgs->forceidle_weight))
>                 return true;
>
> -       if ((sgs->group_capacity * imbalance_pct) <
> -                       (sgs->group_runnable * 100))
> +       if ((sgs->group_capacity * imbalance_pct * (sgs->group_weight - sgs->forceidle_weight)) <
> +                       (sgs->group_runnable * 100 * sgs->group_weight))

So you apply a ratio on the group capacity based on the number of forced
idle CPUs, but what if you have heterogeneous systems?

>                 return false;
>
> -       if ((sgs->group_capacity * 100) >
> -                       (sgs->group_util * imbalance_pct))
> +       if ((sgs->group_capacity * 100 * (sgs->group_weight - sgs->forceidle_weight)) >
> +                       (sgs->group_util * imbalance_pct * sgs->group_weight))
>                 return true;
>
>         return false;
> @@ -10160,15 +10161,15 @@ group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
>  static inline bool
>  group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
>  {
> -       if (sgs->sum_nr_running <= sgs->group_weight)
> +       if (sgs->sum_nr_running <= (sgs->group_weight - sgs->forceidle_weight))
>                 return false;
>
> -       if ((sgs->group_capacity * 100) <
> -                       (sgs->group_util * imbalance_pct))
> +       if ((sgs->group_capacity * 100 * (sgs->group_weight - sgs->forceidle_weight)) <
> +                       (sgs->group_util * imbalance_pct * sgs->group_weight))
>                 return true;
>
> -       if ((sgs->group_capacity * imbalance_pct) <
> -                       (sgs->group_runnable * 100))
> +       if ((sgs->group_capacity * imbalance_pct * (sgs->group_weight - sgs->forceidle_weight)) <
> +                       (sgs->group_runnable * 100 * sgs->group_weight))
>                 return true;
>
>         return false;
> @@ -10371,13 +10372,19 @@ static inline void update_sg_lb_stats(struct lb_env *env,
>                 nr_running = rq->nr_running;
>                 sgs->sum_nr_running += nr_running;
>
> +               /*
> +                * Ignore force idle if we are balancing within the SMT mask
> +                */
> +               if (rq_in_forceidle(rq) && !(env->sd->flags & SD_SHARE_CPUCAPACITY))
> +                       sgs->forceidle_weight++;
> +
>                 if (cpu_overutilized(i))
>                         *sg_overutilized = 1;
>
>                 /*
>                  * No need to call idle_cpu() if nr_running is not 0
>                  */
> -               if (!nr_running && idle_cpu(i)) {
> +               if (!rq_in_forceidle(rq) && !nr_running && idle_cpu(i)) {
>                         sgs->idle_cpus++;
>                         /* Idle cpu can't have misfit task */
>                         continue;
> @@ -10691,10 +10698,16 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd,
>                 nr_running = rq->nr_running - local;
>                 sgs->sum_nr_running += nr_running;
>
> +               /*
> +                * Ignore force idle if we are balancing within the SMT mask
> +                */
> +               if (rq_in_forceidle(rq) && !(sd->flags & SD_SHARE_CPUCAPACITY))
> +                       sgs->forceidle_weight++;
> +
>                 /*
>                  * No need to call idle_cpu_without() if nr_running is not 0
>                  */
> -               if (!nr_running && idle_cpu_without(i, p))
> +               if (!rq_in_forceidle(rq) && !nr_running && idle_cpu_without(i, p))
>                         sgs->idle_cpus++;
>
>                 /* Check if task fits in the CPU */
> @@ -11123,7 +11136,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
>                 return;
>         }
>
> -       if (busiest->group_type == group_smt_balance) {
> +       if (busiest->group_type == group_smt_balance ||
> +           busiest->forceidle_weight) {
>                 /* Reduce number of tasks sharing CPU capacity */
>                 env->migration_type = migrate_task;
>                 env->imbalance = 1;
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index adfb6e3409d7..fdee101b1a66 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1468,6 +1468,13 @@ static inline bool sched_core_enqueued(struct task_struct *p)
>         return !RB_EMPTY_NODE(&p->core_node);
>  }
>
> +static inline bool rq_in_forceidle(struct rq *rq)
> +{
> +       return rq->core->core_forceidle_count > 0 &&
> +               rq->nr_running &&
> +               rq->curr == rq->idle;
> +}
> +
>  extern void sched_core_enqueue(struct rq *rq, struct task_struct *p);
>  extern void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags);
>
> @@ -1513,6 +1520,11 @@ static inline bool sched_group_cookie_match(struct rq *rq,
>         return true;
>  }
>
> +static inline bool rq_in_forceidle(struct rq *rq)
> +{
> +       return false;
> +}
> +
>  #endif /* !CONFIG_SCHED_CORE */
>
>  #ifdef CONFIG_RT_GROUP_SCHED
> --
> 2.43.0
Re: [PATCH] sched/fair: Force idle aware load balancing
Posted by Peter Zijlstra 3 days, 6 hours ago
On Fri, Nov 28, 2025 at 02:55:36PM +0100, Vincent Guittot wrote:
> On Thu, 27 Nov 2025 at 21:28, Fernand Sieber <sieberf@amazon.com> wrote:

> > @@ -10135,15 +10136,15 @@ static inline int sg_imbalanced(struct sched_group *group)
> >  static inline bool
> >  group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
> >  {
> > -       if (sgs->sum_nr_running < sgs->group_weight)
> > +       if (sgs->sum_nr_running < (sgs->group_weight - sgs->forceidle_weight))
> >                 return true;
> >
> > -       if ((sgs->group_capacity * imbalance_pct) <
> > -                       (sgs->group_runnable * 100))
> > +       if ((sgs->group_capacity * imbalance_pct * (sgs->group_weight - sgs->forceidle_weight)) <
> > +                       (sgs->group_runnable * 100 * sgs->group_weight))
> 
> so you apply a ratio on group capacity based on the number of forced
> idle but what if you have heterogeneous systems ?

Ah, good point. I suppose tracking force_idle_capacity in
update_sg_lb_stats() should be possible, and then subtract that from
group_capacity or so.
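
A minimal sketch of that direction, reusing the rq_in_forceidle() helper
from this patch (this is essentially what the v2 posted later in this
thread ends up doing; the field names are taken from that posting):

	/* While iterating the group's CPUs in update_sg_lb_stats(): */
	if (rq_in_forceidle(rq) && !(env->sd->flags & SD_SHARE_CPUCAPACITY)) {
		sgs->forceidle_weight++;
		sgs->forceidle_capacity += rq->cpu_capacity;
	}

	/* ...and the capacity checks then compare against what is usable: */
	unsigned long available = sgs->group_capacity - sgs->forceidle_capacity;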
Re: [PATCH] sched/fair: Force idle aware load balancing
Posted by Fernand Sieber 7 hours ago
On Fri, 28 Nov 2025 at 14:58, Peter Zijlstra <peterz@infradead.org> wrote:
> On Fri, Nov 28, 2025 at 02:55:36PM +0100, Vincent Guittot wrote:
> > On Thu, 27 Nov 2025 at 21:28, Fernand Sieber <sieberf@amazon.com> wrote:
>
> > > @@ -10135,15 +10136,15 @@ static inline int sg_imbalanced(struct sched_group *group)
> > >  static inline bool
> > >  group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
> > >  {
> > > -       if (sgs->sum_nr_running < sgs->group_weight)
> > > +       if (sgs->sum_nr_running < (sgs->group_weight - sgs->forceidle_weight))
> > >                 return true;
> > >
> > > -       if ((sgs->group_capacity * imbalance_pct) <
> > > -                       (sgs->group_runnable * 100))
> > > +       if ((sgs->group_capacity * imbalance_pct * (sgs->group_weight - sgs->forceidle_weight)) <
> > > +                       (sgs->group_runnable * 100 * sgs->group_weight))
> >
> > so you apply a ratio on group capacity based on the number of forced
> > idle but what if you have heterogeneous systems ?
>
> Ah, good point. I suppose tracking force_idle_capacity in
> update_sg_lb_stats() should be possible, and then subtract that from
> group_capacity or so.

Thanks. Addressed in rev2, by keeping track of both forceidle_weight and
forceidle_capacity:
https://lore.kernel.org/lkml/20251201124223.247107-1-sieberf@amazon.com/

Re: [PATCH] sched/fair: Force idle aware load balancing
Posted by Peter Zijlstra 3 days, 9 hours ago
On Thu, Nov 27, 2025 at 10:27:17PM +0200, Fernand Sieber wrote:

> @@ -11123,7 +11136,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
>  		return;
>  	}
>  
> -	if (busiest->group_type == group_smt_balance) {
> +	if (busiest->group_type == group_smt_balance ||
> +	    busiest->forceidle_weight) {

Should we not instead make it so that we select group_smt_balance in
this case?

Anyway, the patch doesn't seem horrible to me. Vincent?

>  		/* Reduce number of tasks sharing CPU capacity */
>  		env->migration_type = migrate_task;
>  		env->imbalance = 1;
Re: [PATCH] sched/fair: Force idle aware load balancing
Posted by Vincent Guittot 3 days, 7 hours ago
On Fri, 28 Nov 2025 at 12:14, Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Thu, Nov 27, 2025 at 10:27:17PM +0200, Fernand Sieber wrote:
>
> > @@ -11123,7 +11136,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
> >               return;
> >       }
> >
> > -     if (busiest->group_type == group_smt_balance) {
> > +     if (busiest->group_type == group_smt_balance ||
> > +         busiest->forceidle_weight) {
>
> Should we not instead make it so that we select group_smt_balance in
> this case?

Why do we need this test? We have already removed forced idle CPUs from
the statistics?

I suppose Fernand wants to cover cases where there is 1 task per CPU, so
we are balanced, but one CPU is forced idle and we want to force migrating
a task to then try to move back another one? In that case it should be
detected early and become the group_imbalanced type. Also, what happens if
we could migrate more than one task?


>
> Anyway, the patch doesn't seem horrible to me. Vincent?
>
> >               /* Reduce number of tasks sharing CPU capacity */
> >               env->migration_type = migrate_task;
> >               env->imbalance = 1;
Re: [PATCH] sched/fair: Force idle aware load balancing
Posted by Fernand Sieber 7 hours ago
On Fri, 28 Nov 2025 at 14:50, Vincent Guittot <vincent.guittot@linaro.org> wrote:
> On Fri, 28 Nov 2025 at 12:14, Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > On Thu, Nov 27, 2025 at 10:27:17PM +0200, Fernand Sieber wrote:
> >
> > > @@ -11123,7 +11136,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
> > >               return;
> > >       }
> > >
> > > -     if (busiest->group_type == group_smt_balance) {
> > > +     if (busiest->group_type == group_smt_balance ||
> > > +         busiest->forceidle_weight) {
> >
> > Should we not instead make it so that we select group_smt_balance in
> > this case?
>
> Why do we need this test ? We have already removed forced idle cpus
> from statistics ?
>
> I suppose Fernand wants to cover cases where there is 1 task per CPUs
> so we are balanced but one CPU is forced idle and we want to force
> migrating a task to then try to move back another one ? In this case
> it should be detected early and become group_imbalanced type
> Also what happens if we could migrate more than one task

I've removed this override in v2; it doesn't seem to make much of a
difference after doing more benchmarking.

When I traced LB inefficiencies, I noticed in some situations that a
large imbalance (overloaded vs spare capacity) was detected, but
remediation was delayed. So the intention of the override was to "nudge"
the LB to take a remediation action immediately, regardless of the load
to move, with the idea that it's better to migrate anything now rather
than waste capacity in force idle for longer.

This override was probably not the right tool for it. If I get a chance
I'll try to dive deeper and provide more details.

A different thing I noticed is that the task_hot check has a cookie check
which is more or less bound to fail on a busy, large system running lots
of different cookied tasks (e.g. a hypervisor on large servers with
cookied, time-shared vCPUs), because there's almost zero chance that the
target CPU happens to be running the same cookie as the migrating task.
This delays migrations unnecessarily if the run queues are short and
there are no valid spare candidates. I need to think more about that one,
but if you have any ideas let me know? Maybe, instead of this check, the
list of candidate tasks should be sorted to prioritize matching-cookie
tasks first, if any, similar to what is proposed in the cache aware
scheduling RFC?
https://lwn.net/ml/all/26e7bfa88163e13ba1ebefbb54ecf5f42d84f884.1760206683.git.tim.c.chen@linux.intel.com/
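
For reference, the cookie check being referred to is roughly the following
(paraphrased from mainline kernel/sched/fair.c and kernel/sched/sched.h;
exact placement and details may vary by kernel version):

	/* In task_hot(): a cookie mismatch with the destination counts as cache hot. */
	if (!sched_core_cookie_match(cpu_rq(env->dst_cpu), p))
		return 1;

	/* The helper compares the task's cookie with the destination core's cookie. */
	static inline bool sched_core_cookie_match(struct rq *rq, struct task_struct *p)
	{
		/* Ignore cookie match if core scheduling is not enabled on the CPU. */
		if (!sched_core_enabled(rq))
			return true;

		return rq->core->core_cookie == p->core_cookie;
	}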



[PATCH v2] sched/fair: Force idle aware load balancing
Posted by Fernand Sieber 8 hours ago
Consider force idle wasted capacity when computing whether a group has
spare capacity or is overloaded. We use a rather crude mechanism based on
the current force idle state of the rq. It may be preferable to use a
decaying average, similar to other load metrics, to avoid jitter.

If the busiest group has force idle, make it a task migration. This way we
will try to move one task regardless of the load. There are still
subsequent checks later on to verify that this doesn't cause more force
idle on the destination.

===

rev1->rev2:
* addressed feedback about asym scheduling
* removed redundant force idle check for idle cpus
* removed migrate_task override for LB with force idle (no perf gains)

===

Testing

Testing is aimed at measuring perceived guest noise on a hypervisor system
with time-shared scenarios.

The setup is a system where the load is nearing 100%, which should allow
no steal time. The system has 64 CPUs and runs 8 VMs, each VM using core
scheduling with 8 time-shared vCPUs.

7 VMs are running stressors (`stress-ng --cpu 0`) while the last VM is
running the hwlat tracer with a width of 100ms, a period of 300ms, and
a threshold of 100us. Each VM runs a cookied non-vCPU VMM process that
adds a light level of noise, which forces some load balancing.

The test scenario is run 10x60s and the average noise is measured.

At baseline, we measure about 1.20% of noise (computed from hwlat
breaches). With the proposed patch, the noise drops to 0.63%.

Signed-off-by: Fernand Sieber <sieberf@amazon.com>
---
 kernel/sched/fair.c  | 67 ++++++++++++++++++++++++++++++++++++++++----
 kernel/sched/sched.h | 12 ++++++++
 2 files changed, 73 insertions(+), 6 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5b752324270b..c4ef8aaf1142 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9932,6 +9932,10 @@ struct sg_lb_stats {
 	unsigned int nr_numa_running;
 	unsigned int nr_preferred_running;
 #endif
+#ifdef CONFIG_SCHED_CORE
+	unsigned int forceidle_weight;
+	unsigned long forceidle_capacity;
+#endif
 };
 
 /*
@@ -10120,6 +10124,29 @@ static inline int sg_imbalanced(struct sched_group *group)
 	return group->sgc->imbalance;
 }
 
+
+#ifdef CONFIG_SCHED_CORE
+static inline unsigned int sgs_available_weight(struct sg_lb_stats *sgs)
+{
+	return sgs->group_weight - sgs->forceidle_weight;
+}
+
+static inline unsigned long sgs_available_capacity(struct sg_lb_stats *sgs)
+{
+	return sgs->group_capacity - sgs->forceidle_capacity;
+}
+#else
+static inline unsigned int sgs_available_weight(struct sg_lb_stats *sgs)
+{
+	return sgs->group_weight;
+}
+
+static inline unsigned long sgs_available_capacity(struct sg_lb_stats *sgs)
+{
+	return sgs->group_capacity;
+}
+#endif /* CONFIG_SCHED_CORE */
+
 /*
  * group_has_capacity returns true if the group has spare capacity that could
  * be used by some tasks.
@@ -10135,14 +10162,14 @@ static inline int sg_imbalanced(struct sched_group *group)
 static inline bool
 group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
 {
-	if (sgs->sum_nr_running < sgs->group_weight)
+	if (sgs->sum_nr_running < sgs_available_weight(sgs))
 		return true;
 
-	if ((sgs->group_capacity * imbalance_pct) <
+	if ((sgs_available_capacity(sgs) * imbalance_pct) <
 			(sgs->group_runnable * 100))
 		return false;
 
-	if ((sgs->group_capacity * 100) >
+	if ((sgs_available_capacity(sgs) * 100) >
 			(sgs->group_util * imbalance_pct))
 		return true;
 
@@ -10160,14 +10187,14 @@ group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
 static inline bool
 group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
 {
-	if (sgs->sum_nr_running <= sgs->group_weight)
+	if (sgs->sum_nr_running <= sgs_available_weight(sgs))
 		return false;
 
-	if ((sgs->group_capacity * 100) <
+	if ((sgs_available_capacity(sgs) * 100) <
 			(sgs->group_util * imbalance_pct))
 		return true;
 
-	if ((sgs->group_capacity * imbalance_pct) <
+	if ((sgs_available_capacity(sgs) * imbalance_pct) <
 			(sgs->group_runnable * 100))
 		return true;
 
@@ -10336,6 +10363,30 @@ sched_reduced_capacity(struct rq *rq, struct sched_domain *sd)
 	return check_cpu_capacity(rq, sd);
 }
 
+#ifdef CONFIG_SCHED_CORE
+static inline void
+update_forceidle_capacity(struct sched_domain *sd,
+			  struct sg_lb_stats *sgs,
+			  struct rq *rq)
+{
+	/*
+	 * Ignore force idle if we are balancing within the SMT mask
+	 */
+	if (sd->flags & SD_SHARE_CPUCAPACITY)
+		return;
+
+	if (rq_in_forceidle(rq)) {
+		sgs->forceidle_weight++;
+		sgs->forceidle_capacity += rq->cpu_capacity;
+	}
+}
+#else
+static inline void
+update_forceidle_capacity(struct sched_domain *sd,
+			  struct sg_lb_stats *sgs,
+			  struct rq *rq) {}
+#endif /* !CONFIG_SCHED_CORE */
+
 /**
  * update_sg_lb_stats - Update sched_group's statistics for load balancing.
  * @env: The load balancing environment.
@@ -10371,6 +10422,8 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 		nr_running = rq->nr_running;
 		sgs->sum_nr_running += nr_running;
 
+		update_forceidle_capacity(env->sd, sgs, rq);
+
 		if (cpu_overutilized(i))
 			*sg_overutilized = 1;
 
@@ -10691,6 +10744,8 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd,
 		nr_running = rq->nr_running - local;
 		sgs->sum_nr_running += nr_running;
 
+		update_forceidle_capacity(sd, sgs, rq);
+
 		/*
 		 * No need to call idle_cpu_without() if nr_running is not 0
 		 */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index adfb6e3409d7..fdee101b1a66 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1468,6 +1468,13 @@ static inline bool sched_core_enqueued(struct task_struct *p)
 	return !RB_EMPTY_NODE(&p->core_node);
 }
 
+static inline bool rq_in_forceidle(struct rq *rq)
+{
+	return rq->core->core_forceidle_count > 0 &&
+		rq->nr_running &&
+		rq->curr == rq->idle;
+}
+
 extern void sched_core_enqueue(struct rq *rq, struct task_struct *p);
 extern void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags);
 
@@ -1513,6 +1520,11 @@ static inline bool sched_group_cookie_match(struct rq *rq,
 	return true;
 }
 
+static inline bool rq_in_forceidle(struct rq *rq)
+{
+	return false;
+}
+
 #endif /* !CONFIG_SCHED_CORE */
 
 #ifdef CONFIG_RT_GROUP_SCHED
-- 
2.43.0