[RFC PATCH 1/2] Revert "sched/fair: Make sched-idle CPU selection consistent throughout"

Posted by Abel Wu 11 months ago
This reverts commit 17346452b25b98acfb395d2a82ec2e4ad0cb7a01.

The above commit tried to unify the selection policy between idle cpus
and SCHED_IDLE ones in the fast- and slow-path of select_task_rq_fair()
by treating them equally (although the SCHED_IDLE cpus turn out to be
given more preference in the slowpath). The test results seemed solid,
but the setup didn't take the cgroup hierarchy into account, which in
practice caused some of our important services to be affected.

The cgroup hierarchy in our production environment looks like the one
below, which is probably common in modern containerized setups:

			  root
			/	\
		kubepods	system.slice
		/	\\		\
	guaranteed	besteffort	containerd

	(where a double edge 'X=A' means A is a SCHED_IDLE cgroup)

A cpu is treated as SCHED_IDLE if only besteffort is running on it, and
such a cpu is given at least equal preference to the idle cpus when
deciding where to run a newly woken task. But a SCHED_IDLE cpu cannot
necessarily be preempted soon enough to start serving the wakee:
containerd and the other services under system.slice have to wait in
the runqueue since they cannot preempt kubepods, while idle cpus may
sit out there untouched.

So prioritize idle cpus over SCHED_IDLE ones to avoid, as much as
possible, undesired delays to things like orchestration operations.

Signed-off-by: Abel Wu <wuyun.abel@bytedance.com>
---
 kernel/sched/fair.c | 49 +++++++++++++++++++++++++++------------------
 1 file changed, 30 insertions(+), 19 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ae0350088ac1..379764bd2795 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7446,7 +7446,7 @@ sched_balance_find_dst_group_cpu(struct sched_group *group, struct task_struct *
 	unsigned int min_exit_latency = UINT_MAX;
 	u64 latest_idle_timestamp = 0;
 	int least_loaded_cpu = this_cpu;
-	int shallowest_idle_cpu = -1;
+	int shallowest_idle_cpu = -1, si_cpu = -1;
 	int i;
 
 	/* Check if we have any choice: */
@@ -7460,9 +7460,6 @@ sched_balance_find_dst_group_cpu(struct sched_group *group, struct task_struct *
 		if (!sched_core_cookie_match(rq, p))
 			continue;
 
-		if (sched_idle_cpu(i))
-			return i;
-
 		if (available_idle_cpu(i)) {
 			struct cpuidle_state *idle = idle_get_state(rq);
 			if (idle && idle->exit_latency < min_exit_latency) {
@@ -7484,7 +7481,12 @@ sched_balance_find_dst_group_cpu(struct sched_group *group, struct task_struct *
 				latest_idle_timestamp = rq->idle_stamp;
 				shallowest_idle_cpu = i;
 			}
-		} else if (shallowest_idle_cpu == -1) {
+		} else if (shallowest_idle_cpu == -1 && si_cpu == -1) {
+			if (sched_idle_cpu(i)) {
+				si_cpu = i;
+				continue;
+			}
+
 			load = cpu_load(cpu_rq(i));
 			if (load < min_load) {
 				min_load = load;
@@ -7493,7 +7495,11 @@ sched_balance_find_dst_group_cpu(struct sched_group *group, struct task_struct *
 		}
 	}
 
-	return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
+	if (shallowest_idle_cpu != -1)
+		return shallowest_idle_cpu;
+	if (si_cpu != -1)
+		return si_cpu;
+	return least_loaded_cpu;
 }
 
 static inline int sched_balance_find_dst_cpu(struct sched_domain *sd, struct task_struct *p,
@@ -7549,11 +7555,14 @@ static inline int sched_balance_find_dst_cpu(struct sched_domain *sd, struct tas
 	return new_cpu;
 }
 
-static inline int __select_idle_cpu(int cpu, struct task_struct *p)
+static inline int __select_idle_cpu(int cpu, struct task_struct *p, int *si_cpu)
 {
-	if ((available_idle_cpu(cpu) || sched_idle_cpu(cpu)) &&
-	    sched_cpu_cookie_match(cpu_rq(cpu), p))
+	if (!sched_cpu_cookie_match(cpu_rq(cpu), p))
+		return -1;
+	if (available_idle_cpu(cpu))
 		return cpu;
+	if (*si_cpu == -1 && sched_idle_cpu(cpu))
+		*si_cpu = cpu;
 
 	return -1;
 }
@@ -7649,7 +7658,7 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu
  */
 static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
 {
-	int cpu;
+	int cpu, si_cpu = -1;
 
 	for_each_cpu_and(cpu, cpu_smt_mask(target), p->cpus_ptr) {
 		if (cpu == target)
@@ -7660,11 +7669,13 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t
 		 */
 		if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
 			continue;
-		if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
+		if (available_idle_cpu(cpu))
 			return cpu;
+		if (si_cpu == -1 && sched_idle_cpu(cpu))
+			si_cpu = cpu;
 	}
 
-	return -1;
+	return si_cpu;
 }
 
 #else /* CONFIG_SCHED_SMT */
@@ -7680,7 +7691,7 @@ static inline bool test_idle_cores(int cpu)
 
 static inline int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu)
 {
-	return __select_idle_cpu(core, p);
+	return __select_idle_cpu(core, p, idle_cpu);
 }
 
 static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
@@ -7728,10 +7739,10 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
 						return i;
 				} else {
 					if (--nr <= 0)
-						return -1;
-					idle_cpu = __select_idle_cpu(cpu, p);
-					if ((unsigned int)idle_cpu < nr_cpumask_bits)
 						return idle_cpu;
+					i = __select_idle_cpu(cpu, p, &idle_cpu);
+					if ((unsigned int)i < nr_cpumask_bits)
+						return i;
 				}
 			}
 			cpumask_andnot(cpus, cpus, sched_group_span(sg));
@@ -7746,9 +7757,9 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
 
 		} else {
 			if (--nr <= 0)
-				return -1;
-			idle_cpu = __select_idle_cpu(cpu, p);
-			if ((unsigned int)idle_cpu < nr_cpumask_bits)
+				return idle_cpu;
+			i = __select_idle_cpu(cpu, p, &idle_cpu);
+			if ((unsigned int)i < nr_cpumask_bits)
 				break;
 		}
 	}
-- 
2.37.3
Re: [RFC PATCH 1/2] Revert "sched/fair: Make sched-idle CPU selection consistent throughout"
Posted by Vincent Guittot 11 months ago
On Mon, 10 Mar 2025 at 08:41, Abel Wu <wuyun.abel@bytedance.com> wrote:
>
> This reverts commit 17346452b25b98acfb395d2a82ec2e4ad0cb7a01.
>
> The above commit tried to unify selection policy between idle cpus
> and SCHED_IDLE ones in fast- and slow-path of select_task_rq_fair()
> by treating them equally (although the SCHED_IDLE cpus are turned
> to be given more preference in slowpath). The test results seemed
> solid, but the setup didn't take cgroup hierarchy into account,
> which actually made some of our important services get affected.
>
> The cgroup hierarchy in our production environment looks like below,
> which might be common in modern containerized setup:
>
>                           root
>                         /       \
>                 kubepods        system.slice
>                 /       \\              \
>         guaranteed      besteffort      containerd
>
>         (where 'X=A' means A is SCHED_IDLE cgroup)
>
> The cpu is treated as SCHED_IDLE if only besteffort is running, which
> is given at least equal preference as the idle cpus when deciding where
> to run a newly woken task. But the SCHED_IDLE cpus do not necessarily
> mean they can be preempted soon enough to start serving the wakee, and

Could you give us more details on why a SCHED_IDLE cpu which runs only
besteffort can't be preempted soon enough?

Is it because kubepods vs system.slice is not sched_idle when comparing
the entities? Then maybe the definition of sched_idle_cpu should be
fixed instead.

A sched_idle_cpu should be able to be preempted immediately, otherwise
it's not a sched idle cpu and the definition is meaningless.
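
For reference, sched_idle_cpu() currently reduces to roughly the
following (quoting fair.c from memory, so double-check against the
tree); the point being that everything queued on the rq must be
hierarchically idle:

static int sched_idle_rq(struct rq *rq)
{
	return unlikely(rq->nr_running == rq->cfs.h_nr_idle &&
			rq->nr_running);
}

static int sched_idle_cpu(int cpu)
{
	return sched_idle_rq(cpu_rq(cpu));
}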

> containerd and other services under system.slice are the case that have
> to wait in runqueue since they can not preempt kubepods, while idle cpus
> are possible out there untouched.
>
> So prioritize idle cpus over SCHED_IDLE ones to avoid undesired delay
> like orchestration operations as much as possible.
>
> Signed-off-by: Abel Wu <wuyun.abel@bytedance.com>
> ---
>  kernel/sched/fair.c | 49 +++++++++++++++++++++++++++------------------
>  1 file changed, 30 insertions(+), 19 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index ae0350088ac1..379764bd2795 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -7446,7 +7446,7 @@ sched_balance_find_dst_group_cpu(struct sched_group *group, struct task_struct *
>         unsigned int min_exit_latency = UINT_MAX;
>         u64 latest_idle_timestamp = 0;
>         int least_loaded_cpu = this_cpu;
> -       int shallowest_idle_cpu = -1;
> +       int shallowest_idle_cpu = -1, si_cpu = -1;
>         int i;
>
>         /* Check if we have any choice: */
> @@ -7460,9 +7460,6 @@ sched_balance_find_dst_group_cpu(struct sched_group *group, struct task_struct *
>                 if (!sched_core_cookie_match(rq, p))
>                         continue;
>
> -               if (sched_idle_cpu(i))
> -                       return i;
> -
>                 if (available_idle_cpu(i)) {
>                         struct cpuidle_state *idle = idle_get_state(rq);
>                         if (idle && idle->exit_latency < min_exit_latency) {
> @@ -7484,7 +7481,12 @@ sched_balance_find_dst_group_cpu(struct sched_group *group, struct task_struct *
>                                 latest_idle_timestamp = rq->idle_stamp;
>                                 shallowest_idle_cpu = i;
>                         }
> -               } else if (shallowest_idle_cpu == -1) {
> +               } else if (shallowest_idle_cpu == -1 && si_cpu == -1) {
> +                       if (sched_idle_cpu(i)) {
> +                               si_cpu = i;
> +                               continue;
> +                       }
> +
>                         load = cpu_load(cpu_rq(i));
>                         if (load < min_load) {
>                                 min_load = load;
> @@ -7493,7 +7495,11 @@ sched_balance_find_dst_group_cpu(struct sched_group *group, struct task_struct *
>                 }
>         }
>
> -       return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
> +       if (shallowest_idle_cpu != -1)
> +               return shallowest_idle_cpu;
> +       if (si_cpu != -1)
> +               return si_cpu;
> +       return least_loaded_cpu;
>  }
>
>  static inline int sched_balance_find_dst_cpu(struct sched_domain *sd, struct task_struct *p,
> @@ -7549,11 +7555,14 @@ static inline int sched_balance_find_dst_cpu(struct sched_domain *sd, struct tas
>         return new_cpu;
>  }
>
> -static inline int __select_idle_cpu(int cpu, struct task_struct *p)
> +static inline int __select_idle_cpu(int cpu, struct task_struct *p, int *si_cpu)
>  {
> -       if ((available_idle_cpu(cpu) || sched_idle_cpu(cpu)) &&
> -           sched_cpu_cookie_match(cpu_rq(cpu), p))
> +       if (!sched_cpu_cookie_match(cpu_rq(cpu), p))
> +               return -1;
> +       if (available_idle_cpu(cpu))
>                 return cpu;
> +       if (*si_cpu == -1 && sched_idle_cpu(cpu))
> +               *si_cpu = cpu;
>
>         return -1;
>  }
> @@ -7649,7 +7658,7 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu
>   */
>  static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
>  {
> -       int cpu;
> +       int cpu, si_cpu = -1;
>
>         for_each_cpu_and(cpu, cpu_smt_mask(target), p->cpus_ptr) {
>                 if (cpu == target)
> @@ -7660,11 +7669,13 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t
>                  */
>                 if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
>                         continue;
> -               if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
> +               if (available_idle_cpu(cpu))
>                         return cpu;
> +               if (si_cpu == -1 && sched_idle_cpu(cpu))
> +                       si_cpu = cpu;
>         }
>
> -       return -1;
> +       return si_cpu;
>  }
>
>  #else /* CONFIG_SCHED_SMT */
> @@ -7680,7 +7691,7 @@ static inline bool test_idle_cores(int cpu)
>
>  static inline int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu)
>  {
> -       return __select_idle_cpu(core, p);
> +       return __select_idle_cpu(core, p, idle_cpu);
>  }
>
>  static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
> @@ -7728,10 +7739,10 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
>                                                 return i;
>                                 } else {
>                                         if (--nr <= 0)
> -                                               return -1;
> -                                       idle_cpu = __select_idle_cpu(cpu, p);
> -                                       if ((unsigned int)idle_cpu < nr_cpumask_bits)
>                                                 return idle_cpu;
> +                                       i = __select_idle_cpu(cpu, p, &idle_cpu);
> +                                       if ((unsigned int)i < nr_cpumask_bits)
> +                                               return i;
>                                 }
>                         }
>                         cpumask_andnot(cpus, cpus, sched_group_span(sg));
> @@ -7746,9 +7757,9 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
>
>                 } else {
>                         if (--nr <= 0)
> -                               return -1;
> -                       idle_cpu = __select_idle_cpu(cpu, p);
> -                       if ((unsigned int)idle_cpu < nr_cpumask_bits)
> +                               return idle_cpu;
> +                       i = __select_idle_cpu(cpu, p, &idle_cpu);
> +                       if ((unsigned int)i < nr_cpumask_bits)
>                                 break;
>                 }
>         }
> --
> 2.37.3
>
Re: Re: [RFC PATCH 1/2] Revert "sched/fair: Make sched-idle CPU selection consistent throughout"
Posted by Abel Wu 11 months ago
Hi Vincent,

On 3/12/25 12:58 AM, Vincent Guittot wrote:
> On Mon, 10 Mar 2025 at 08:41, Abel Wu <wuyun.abel@bytedance.com> wrote:
>>
>> This reverts commit 17346452b25b98acfb395d2a82ec2e4ad0cb7a01.
>>
>> The above commit tried to unify selection policy between idle cpus
>> and SCHED_IDLE ones in fast- and slow-path of select_task_rq_fair()
>> by treating them equally (although the SCHED_IDLE cpus are turned
>> to be given more preference in slowpath). The test results seemed
>> solid, but the setup didn't take cgroup hierarchy into account,
>> which actually made some of our important services get affected.
>>
>> The cgroup hierarchy in our production environment looks like below,
>> which might be common in modern containerized setup:
>>
>>                            root
>>                          /       \
>>                  kubepods        system.slice
>>                  /       \\              \
>>          guaranteed      besteffort      containerd
>>
>>          (where 'X=A' means A is SCHED_IDLE cgroup)
>>
>> The cpu is treated as SCHED_IDLE if only besteffort is running, which
>> is given at least equal preference as the idle cpus when deciding where
>> to run a newly woken task. But the SCHED_IDLE cpus do not necessarily
>> mean they can be preempted soon enough to start serving the wakee, and
> 
> Could you give us more details why the SCHED_IDLE cpu which runs only
> besteffort can't be preempted soon enough ?
> 
> because kubepods vs system.slice is not sched_idle when comparing the

Yes, exactly. What's worse is that kubepods.weight is the sum of all its
children's weights and can easily reach ~3000, while system.slice's
weight defaults to 100. The weight configuration can be optimized, but
that's another topic.
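
For illustration, a back-of-the-envelope sketch of the root-level weight
arithmetic with the example numbers above (nothing below is read from a
real kernel; it just shows the CPU share system.slice is entitled to
while kubepods is busy, and no idle-vs-non-idle shortcut applies because
kubepods itself is not SCHED_IDLE):

#include <stdio.h>

int main(void)
{
	unsigned int kubepods_weight = 3000;	/* sum of its children, see above */
	unsigned int system_weight = 100;	/* cgroup default weight */

	/* root-level CFS share of system.slice while both are runnable */
	printf("system.slice share: %.1f%%\n",
	       100.0 * system_weight / (system_weight + kubepods_weight));
	return 0;
}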

> entities ? some maybe the definition of sched_idle_cpu should be fixed
> instead

Yes, there are at least two ways to fix it:

  1) Let sched_idle_cpu() depend on a specific task, just like Josh
     mentioned in his reply to the 2nd patch. So if sched_idle_cpu(cpu, p)
     returns true, then

	a) this rq only contains hierarchically idle tasks, and
	b) p can preempt current immediately

     Please see my reply to Josh for the details. (A rough sketch of
     this idea follows at the end of the list below.)

  2) Or get rid of sched_idle_cpu() entirely. This needs some careful
     rework. The users of cfs_rq::h_nr_idle currently fall into two
     parts:

	a) select_task_rq, including sched_balance_find_dst_group_cpu()
	   and select_idle_*(). The former is handled by this series by
	   simply ignoring sched_idle cpus, which should be safe since
	   sched_idle cpus do not always match the goal of the slowpath,
	   which is to find a least loaded cpu to help load balancing.
	   The latter is roughly designed as follows:

	   - Must search within the target LLC domain, since an L3$ miss
	     is much more costly than an L1/L2$ one
	   - To use the caches more wisely, start searching from the
	     L1/L2$ cache hot cpus like prev/recent_used/..
	   - Cheer if an idle cpu is found that the wakee can preempt
	     immediately. This helps maximize cpu bandwidth usage, hence
	     improving total throughput
	   - (?)
	   - Return @target anyway; at least it might be cache hot

	   It could be less optimal if we simply remove sched_idle_cpu
	   and return @target when no idle cpu is available, because
	   @target can be heavily loaded and the cache may no longer be
	   hot when the wakee finally hits the cpu. So in order not to
	   fight with load balancing, shall we tiebreak on cpu_load()
	   for the non-idle cpus?

	b) load balance: sched_balance_domains() and dequeue_entities().
	   We could leave this as-is, but I would propose using h_idle
	   instead: if the on_cpu task is hierarchically idle when normal
	   load balancing triggers, then we guess it's a less loaded cpu
	   and can reduce the balance interval. The rationale behind this
	   is that idle entities usually get very limited bandwidth if
	   any hierarchically non-idle tasks are available. The heuristic
	   may have false positives, which can generally be divided into
	   3 cases:

		(The numbers represent hierarchical shares in %)

						   C
		   A		   B		  / \
		 /  \\		 /  \\		 80  20
		99    1		50   50		// \
					      100  (X)

	   - Case A) The hierarchical share of the h_idle tasks is indeed
	     small. In this case they just get scheduled to take a
	     breath, and the possibility of a false positive is low
	     enough to be safely ignored.

	   - Case B) The h_idle & !h_idle tasks share bandwidth equally,
	     which usually means the !h_idle part becomes less loaded
	     and pulling some load might be preferred.

	   - Case C) The hierarchical share of the h_idle tasks dominates,
	     which usually means their !h_idle parents are allowed to
	     use a big portion of the bandwidth. In this case speeding
	     up the balance is still fine because we could pull some
	     !h_idle tasks for the most 'important' cgroup.

	   So the heuristic of using rq->curr's h_idle to judge the need
	   for pulling (load balancing) seems fine.

     And as a result cfs_rq::h_nr_idle can be removed and its maintenance
     cost in the hotpath can be saved.
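
The sketch promised under 1), purely illustrative: neither the function
nor the two helpers below exist in the tree, they only name the two
conditions (and the second one is where the costly cgroup-hierarchy
walk would hide):

static bool sched_idle_cpu_for(int cpu, struct task_struct *p)
{
	struct rq *rq = cpu_rq(cpu);

	/* a) the rq only contains hierarchically idle tasks ... */
	if (!rq_has_only_h_idle_tasks(rq))
		return false;

	/*
	 * b) ... and p would preempt the current task immediately,
	 *    judged at the level where p's and curr's cgroup
	 *    ancestries first meet.
	 */
	return wakee_preempts_at_common_level(p, rq->curr);
}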

Which way do you prefer? It would be much appreciated if you could shed
some light on this.

> 
> a sched_idle_cpu should be preempted immediately otherwise it's not a
> sched idle cpu and the definition is meaningless

Agree.

Thanks!
	Abel
Re: Re: [RFC PATCH 1/2] Revert "sched/fair: Make sched-idle CPU selection consistent throughout"
Posted by Vincent Guittot 10 months, 3 weeks ago
On Thu, 13 Mar 2025 at 08:18, Abel Wu <wuyun.abel@bytedance.com> wrote:
>
> Hi Vincent,
>
> On 3/12/25 12:58 AM, Vincent Guittot wrote:
> > On Mon, 10 Mar 2025 at 08:41, Abel Wu <wuyun.abel@bytedance.com> wrote:
> >>
> >> This reverts commit 17346452b25b98acfb395d2a82ec2e4ad0cb7a01.
> >>
> >> The above commit tried to unify selection policy between idle cpus
> >> and SCHED_IDLE ones in fast- and slow-path of select_task_rq_fair()
> >> by treating them equally (although the SCHED_IDLE cpus are turned
> >> to be given more preference in slowpath). The test results seemed
> >> solid, but the setup didn't take cgroup hierarchy into account,
> >> which actually made some of our important services get affected.
> >>
> >> The cgroup hierarchy in our production environment looks like below,
> >> which might be common in modern containerized setup:
> >>
> >>                            root
> >>                          /       \
> >>                  kubepods        system.slice
> >>                  /       \\              \
> >>          guaranteed      besteffort      containerd
> >>
> >>          (where 'X=A' means A is SCHED_IDLE cgroup)
> >>
> >> The cpu is treated as SCHED_IDLE if only besteffort is running, which
> >> is given at least equal preference as the idle cpus when deciding where
> >> to run a newly woken task. But the SCHED_IDLE cpus do not necessarily
> >> mean they can be preempted soon enough to start serving the wakee, and
> >
> > Could you give us more details why the SCHED_IDLE cpu which runs only
> > besteffort can't be preempted soon enough ?
> >
> > because kubepods vs system.slice is not sched_idle when comparing the
>
> Yes, exactly. What's worse is that kubepods.weight is the sum of all its
> children and can easily reach ~3000, while system.weight is default to
> 100. The weight configuration can be optimized, but it's another topic.
>
> > entities ? some maybe the definition of sched_idle_cpu should be fixed
> > instead
>
> Yes, there are at least two ways to fix it:
>
>   1) Let sched_idle_cpu() depends on a specific task, just like Josh
>      mentioned in the reply of 2nd patch. So if sched_idle_cpu(cpu, p)
>      returns true, then
>
>         a) this rq only contains hierarchical idle tasks, and
>         b) p can preempt current immediately
>
>      Please see my reply to Josh to check the details.

Yeah, that should be the solution which covers all cases, but at the
cost of walking the cgroup hierarchy, which is far from ideal.

Could we change h_nr_idle to only track fully sched idle tasks, I mean
tasks with a fully sched_idle cgroup hierarchy? Then we would be sure
to preempt those sched idle cpus.

Then, for the other cases of a sched idle task or a sched idle group
child of a non sched idle group, we don't consider those cpus as sched
idle cpus.
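
A rough sketch of that, not a patch (the helper name is made up, and
this is only one possible reading of "fully sched idle"): a task would
only be accounted in h_nr_idle if every level of its hierarchy, from
its own policy up through each ancestor group, is sched idle, so that
a sched_idle_cpu() hit really does imply immediate preemptibility:

static bool task_fully_sched_idle(struct task_struct *p)
{
	struct sched_entity *se = &p->se;

	/* the task itself and every ancestor group must be idle */
	for_each_sched_entity(se) {
		if (!se_is_idle(se))
			return false;
	}
	return true;
}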


>
>   2) Or get rid of sched_idle_cpu() entirely. This needs some careful
>      rework. Now the users of cfs_rq::h_nr_idle are two parts:
>
>         a) select_task_rq, including sched_balance_find_dst_group_cpu()
>            and select_idle_*(). The former is handled by this series by
>            simply ignoring sched_idle_cpus, which should be safe since
>            sched_idle_cpus do not always follow the goal of the slowpath
>            to find a least loaded cpu to help load balancing. While the
>            latter is roughly designed as following:
>
>            - Must search within target LLC domain, since L3$ miss is
>              much more costly than L1/L2$
>            - To use cache more wisely, start searching from the L1/L2$
>              cache hot cpus like prev/recent_used/..
>            - Cheers if found an idle cpu that can preempt immediately.
>              This helps maximize using cpu bandwidth, hence improving
>              total throughput
>            - (?)
>            - Return target anyway, at least it might be cache hot
>
>            It could be less optimal if simply remove sched_idle_cpu and
>            return @target if no idle cpu available, because @target can
>            be heavy loaded and the cache may not hot any longer when the
>            wakee finally hit cpu. So in order not to fight with the load
>            balancing, shall we tiebreak on cpu_load() for the non-idle
>            cpus?
>
>         b) load balance: sched_balance_domains() and dequeue_entities().
>            We could leave this as-is, but I would propose using h_idle
>            instead: if the on_cpu task is hierarchically idle when
>            triggering normal load balancing, then we guess it's a less
>            loaded cpu and can reduce balance interval. The rationale
>            behind is that the idle entities usually get very limited
>            bandwidth if any hierarchically non-idle tasks available.
>            The heuristics may have false positives which can generally
>            be divided into 3 cases:
>
>                 (The numbers represents hierarchical shares%)
>
>                                                    C
>                    A               B              / \
>                  /  \\           /  \\           80  20
>                 99    1         50   50         // \
>                                               100  (X)


How the sched_idle group in B can have the same share/weight as the
not sched idle one in case B ?


>
>            - Case A) The hierarchical share of h_idle tasks is indeed
>              small. So in this case they are just get scheduled to take
>              some breath, and the possibility of false positive is low
>              enough to be safely ignored.
>
>            - Case B) The h_idle & !h_idle tasks equally share bandwidth
>              which usually means the !h_idle part becomes less loaded
>              and pulling some load might be preferred.
>
>            - Case C) The hierarchical share of h_idle tasks dominates
>              which usually means their !h_idle parents are allowed to
>              use a big portion of bandwidth. In this case speedup the
>              balance is still fine because we could pull some !h_idle
>              tasks for the most 'important' cgroup.
>
>            So the heuristics of using rq->curr's h_idle to judge the need
>            of pulling (load balancing) seems fine.
>
>      And as a result cfs_rq::h_nr_idle can be removed and its maintenance
>      cost in hotpath can be saved.
>
> Which way do you prefer? It would be much appreciated if you can shed some
> light on this.
>
> >
> > a sched_idle_cpu should be preempted immediately otherwise it's not a
> > sched idle cpu and the definition is meaningless
>
> Agree.
>
> Thanks!
>         Abel
>
Re: Re: [RFC PATCH 1/2] Revert "sched/fair: Make sched-idle CPU selection consistent throughout"
Posted by Abel Wu 10 months, 3 weeks ago
On 3/19/25 5:17 PM, Vincent Guittot wrote:
> On Thu, 13 Mar 2025 at 08:18, Abel Wu <wuyun.abel@bytedance.com> wrote:
>>
>> Hi Vincent,
>>
>> On 3/12/25 12:58 AM, Vincent Guittot wrote:
>>> On Mon, 10 Mar 2025 at 08:41, Abel Wu <wuyun.abel@bytedance.com> wrote:
>>>>
>>>> This reverts commit 17346452b25b98acfb395d2a82ec2e4ad0cb7a01.
>>>>
>>>> The above commit tried to unify selection policy between idle cpus
>>>> and SCHED_IDLE ones in fast- and slow-path of select_task_rq_fair()
>>>> by treating them equally (although the SCHED_IDLE cpus are turned
>>>> to be given more preference in slowpath). The test results seemed
>>>> solid, but the setup didn't take cgroup hierarchy into account,
>>>> which actually made some of our important services get affected.
>>>>
>>>> The cgroup hierarchy in our production environment looks like below,
>>>> which might be common in modern containerized setup:
>>>>
>>>>                             root
>>>>                           /       \
>>>>                   kubepods        system.slice
>>>>                   /       \\              \
>>>>           guaranteed      besteffort      containerd
>>>>
>>>>           (where 'X=A' means A is SCHED_IDLE cgroup)
>>>>
>>>> The cpu is treated as SCHED_IDLE if only besteffort is running, which
>>>> is given at least equal preference as the idle cpus when deciding where
>>>> to run a newly woken task. But the SCHED_IDLE cpus do not necessarily
>>>> mean they can be preempted soon enough to start serving the wakee, and
>>>
>>> Could you give us more details why the SCHED_IDLE cpu which runs only
>>> besteffort can't be preempted soon enough ?
>>>
>>> because kubepods vs system.slice is not sched_idle when comparing the
>>
>> Yes, exactly. What's worse is that kubepods.weight is the sum of all its
>> children and can easily reach ~3000, while system.weight is default to
>> 100. The weight configuration can be optimized, but it's another topic.
>>
>>> entities ? some maybe the definition of sched_idle_cpu should be fixed
>>> instead
>>
>> Yes, there are at least two ways to fix it:
>>
>>    1) Let sched_idle_cpu() depends on a specific task, just like Josh
>>       mentioned in the reply of 2nd patch. So if sched_idle_cpu(cpu, p)
>>       returns true, then
>>
>>          a) this rq only contains hierarchical idle tasks, and
>>          b) p can preempt current immediately
>>
>>       Please see my reply to Josh to check the details.
> 
> yeah, that should be the solution which covers all cases but at the
> cost of walking cgroup hierarchy which is far from being ideal

Only comparing curr vs wakee doesn't solve the problem. A cpu can be
treated as SCHED_IDLE iff *all* its SCHED_IDLE entities can be preempted
by the wakee.

> 
> Could we change h_nr_idle to only track fully sched idle tasks; I mean
> tasks with a full sched_idle cgroup hierarchy ? so we would be sure to
> preempt those sched idle cpus
> 
> Then, for other case of sched idle task or sched idle group childs of
> a non sched idle group then we don't consider those cpu as sched idle
> cpu

Although this is correct, I think it would be too much, since this kind
of setup is rare to the best of my knowledge.

> 
> 
>>
>>    2) Or get rid of sched_idle_cpu() entirely. This needs some careful
>>       rework. Now the users of cfs_rq::h_nr_idle are two parts:
>>
>>          a) select_task_rq, including sched_balance_find_dst_group_cpu()
>>             and select_idle_*(). The former is handled by this series by
>>             simply ignoring sched_idle_cpus, which should be safe since
>>             sched_idle_cpus do not always follow the goal of the slowpath
>>             to find a least loaded cpu to help load balancing. While the
>>             latter is roughly designed as following:
>>
>>             - Must search within target LLC domain, since L3$ miss is
>>               much more costly than L1/L2$
>>             - To use cache more wisely, start searching from the L1/L2$
>>               cache hot cpus like prev/recent_used/..
>>             - Cheers if found an idle cpu that can preempt immediately.
>>               This helps maximize using cpu bandwidth, hence improving
>>               total throughput
>>             - (?)
>>             - Return target anyway, at least it might be cache hot
>>
>>             It could be less optimal if simply remove sched_idle_cpu and
>>             return @target if no idle cpu available, because @target can
>>             be heavy loaded and the cache may not hot any longer when the
>>             wakee finally hit cpu. So in order not to fight with the load
>>             balancing, shall we tiebreak on cpu_load() for the non-idle
>>             cpus?

What do you think about choosing a less loaded cpu if no idle ones are
available? Then the wakee will probably get better served, and it also
helps load balancing.
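
Something along these lines, as a sketch only (the function name and
where it would be called from are made up; cpu_load(), cpu_rq(),
available_idle_cpu() and the cookie check are the existing helpers):

static int select_least_loaded(struct task_struct *p, struct cpumask *cpus, int target)
{
	unsigned long load, min_load = ULONG_MAX;
	int cpu, least_loaded_cpu = target;

	for_each_cpu_wrap(cpu, cpus, target + 1) {
		if (!sched_cpu_cookie_match(cpu_rq(cpu), p))
			continue;
		if (available_idle_cpu(cpu))
			return cpu;
		load = cpu_load(cpu_rq(cpu));
		if (load < min_load) {
			min_load = load;
			least_loaded_cpu = cpu;
		}
	}
	return least_loaded_cpu;
}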

>>
>>          b) load balance: sched_balance_domains() and dequeue_entities().
>>             We could leave this as-is, but I would propose using h_idle
>>             instead: if the on_cpu task is hierarchically idle when
>>             triggering normal load balancing, then we guess it's a less
>>             loaded cpu and can reduce balance interval. The rationale
>>             behind is that the idle entities usually get very limited
>>             bandwidth if any hierarchically non-idle tasks available.
>>             The heuristics may have false positives which can generally
>>             be divided into 3 cases:
>>
>>                  (The numbers represents hierarchical shares%)
>>
>>                                                     C
>>                     A               B              / \
>>                   /  \\           /  \\           80  20
>>                  99    1         50   50         // \
>>                                                100  (X)
> 
> 
> How the sched_idle group in B can have the same share/weight as the
> not sched idle one in case B ?

It can't, but theoretically several SCHED_IDLE siblings can be summed up
to match a niced SCHED_NORMAL entity.

				    B
				    |
		 ---------------------------------------
		 |     ||      ||      ||      ||      ||
		15	3	3	3	3	3
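
(Concretely, assuming I remember the constants right: WEIGHT_IDLEPRIO is
3 and a nice-19 SCHED_NORMAL entity weighs 15 in sched_prio_to_weight[],
so the five SCHED_IDLE siblings above sum to 15 and split the bandwidth
50/50 with their non-idle sibling.)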

> 
> 
>>
>>             - Case A) The hierarchical share of h_idle tasks is indeed
>>               small. So in this case they are just get scheduled to take
>>               some breath, and the possibility of false positive is low
>>               enough to be safely ignored.
>>
>>             - Case B) The h_idle & !h_idle tasks equally share bandwidth
>>               which usually means the !h_idle part becomes less loaded
>>               and pulling some load might be preferred.
>>
>>             - Case C) The hierarchical share of h_idle tasks dominates
>>               which usually means their !h_idle parents are allowed to
>>               use a big portion of bandwidth. In this case speedup the
>>               balance is still fine because we could pull some !h_idle
>>               tasks for the most 'important' cgroup.
>>
>>             So the heuristics of using rq->curr's h_idle to judge the need
>>             of pulling (load balancing) seems fine.
>>
>>       And as a result cfs_rq::h_nr_idle can be removed and its maintenance
>>       cost in hotpath can be saved.
>>
>> Which way do you prefer? It would be much appreciated if you can shed some
>> light on this.
>>
>>>
>>> a sched_idle_cpu should be preempted immediately otherwise it's not a
>>> sched idle cpu and the definition is meaningless
>>
>> Agree.
>>
>> Thanks!
>>          Abel
>>
Re: Re: [RFC PATCH 1/2] Revert "sched/fair: Make sched-idle CPU selection consistent throughout"
Posted by Vincent Guittot 10 months, 3 weeks ago
On Wed, 19 Mar 2025 at 11:36, Abel Wu <wuyun.abel@bytedance.com> wrote:
>
> On 3/19/25 5:17 PM, Vincent Guittot wrote:
> > On Thu, 13 Mar 2025 at 08:18, Abel Wu <wuyun.abel@bytedance.com> wrote:
> >>
> >> Hi Vincent,
> >>
> >> On 3/12/25 12:58 AM, Vincent Guittot wrote:
> >>> On Mon, 10 Mar 2025 at 08:41, Abel Wu <wuyun.abel@bytedance.com> wrote:
> >>>>
> >>>> This reverts commit 17346452b25b98acfb395d2a82ec2e4ad0cb7a01.
> >>>>
> >>>> The above commit tried to unify selection policy between idle cpus
> >>>> and SCHED_IDLE ones in fast- and slow-path of select_task_rq_fair()
> >>>> by treating them equally (although the SCHED_IDLE cpus are turned
> >>>> to be given more preference in slowpath). The test results seemed
> >>>> solid, but the setup didn't take cgroup hierarchy into account,
> >>>> which actually made some of our important services get affected.
> >>>>
> >>>> The cgroup hierarchy in our production environment looks like below,
> >>>> which might be common in modern containerized setup:
> >>>>
> >>>>                             root
> >>>>                           /       \
> >>>>                   kubepods        system.slice
> >>>>                   /       \\              \
> >>>>           guaranteed      besteffort      containerd
> >>>>
> >>>>           (where 'X=A' means A is SCHED_IDLE cgroup)
> >>>>
> >>>> The cpu is treated as SCHED_IDLE if only besteffort is running, which
> >>>> is given at least equal preference as the idle cpus when deciding where
> >>>> to run a newly woken task. But the SCHED_IDLE cpus do not necessarily
> >>>> mean they can be preempted soon enough to start serving the wakee, and
> >>>
> >>> Could you give us more details why the SCHED_IDLE cpu which runs only
> >>> besteffort can't be preempted soon enough ?
> >>>
> >>> because kubepods vs system.slice is not sched_idle when comparing the
> >>
> >> Yes, exactly. What's worse is that kubepods.weight is the sum of all its
> >> children and can easily reach ~3000, while system.weight is default to
> >> 100. The weight configuration can be optimized, but it's another topic.
> >>
> >>> entities ? some maybe the definition of sched_idle_cpu should be fixed
> >>> instead
> >>
> >> Yes, there are at least two ways to fix it:
> >>
> >>    1) Let sched_idle_cpu() depends on a specific task, just like Josh
> >>       mentioned in the reply of 2nd patch. So if sched_idle_cpu(cpu, p)
> >>       returns true, then
> >>
> >>          a) this rq only contains hierarchical idle tasks, and
> >>          b) p can preempt current immediately
> >>
> >>       Please see my reply to Josh to check the details.
> >
> > yeah, that should be the solution which covers all cases but at the
> > cost of walking cgroup hierarchy which is far from being ideal
>
> Only comparing curr vs wakee doesn't solve the problem. A cpu can be
> treated as SCHED_IDLE iff *all* its SCHED_IDLE entities can be preempted
> by the wakee.
>
> >
> > Could we change h_nr_idle to only track fully sched idle tasks; I mean
> > tasks with a full sched_idle cgroup hierarchy ? so we would be sure to
> > preempt those sched idle cpus
> >
> > Then, for other case of sched idle task or sched idle group childs of
> > a non sched idle group then we don't consider those cpu as sched idle
> > cpu
>
> Ahthough this is correct, I think it would be too much since this kind
> of setup is rare to the best of my knowledge.

But that's the only case where we can be sure.

>
> >
> >
> >>
> >>    2) Or get rid of sched_idle_cpu() entirely. This needs some careful
> >>       rework. Now the users of cfs_rq::h_nr_idle are two parts:
> >>
> >>          a) select_task_rq, including sched_balance_find_dst_group_cpu()
> >>             and select_idle_*(). The former is handled by this series by
> >>             simply ignoring sched_idle_cpus, which should be safe since
> >>             sched_idle_cpus do not always follow the goal of the slowpath
> >>             to find a least loaded cpu to help load balancing. While the
> >>             latter is roughly designed as following:
> >>
> >>             - Must search within target LLC domain, since L3$ miss is
> >>               much more costly than L1/L2$
> >>             - To use cache more wisely, start searching from the L1/L2$
> >>               cache hot cpus like prev/recent_used/..
> >>             - Cheers if found an idle cpu that can preempt immediately.
> >>               This helps maximize using cpu bandwidth, hence improving
> >>               total throughput
> >>             - (?)
> >>             - Return target anyway, at least it might be cache hot
> >>
> >>             It could be less optimal if simply remove sched_idle_cpu and
> >>             return @target if no idle cpu available, because @target can
> >>             be heavy loaded and the cache may not hot any longer when the
> >>             wakee finally hit cpu. So in order not to fight with the load
> >>             balancing, shall we tiebreak on cpu_load() for the non-idle
> >>             cpus?
>
> What do you think to choose a less loaded cpu if no idle ones available?
> So the wakee will probably get better served, and also helps load balance.

I'm not a fan of adding more than idle selection, and we already
compare load in wake_affine().
So we should only look at cpus that we can preempt immediately: idle
cpus, or cpus running a full hierarchy of sched idle entities.

>
> >>
> >>          b) load balance: sched_balance_domains() and dequeue_entities().
> >>             We could leave this as-is, but I would propose using h_idle
> >>             instead: if the on_cpu task is hierarchically idle when
> >>             triggering normal load balancing, then we guess it's a less
> >>             loaded cpu and can reduce balance interval. The rationale
> >>             behind is that the idle entities usually get very limited
> >>             bandwidth if any hierarchically non-idle tasks available.
> >>             The heuristics may have false positives which can generally
> >>             be divided into 3 cases:
> >>
> >>                  (The numbers represents hierarchical shares%)
> >>
> >>                                                     C
> >>                     A               B              / \
> >>                   /  \\           /  \\           80  20
> >>                  99    1         50   50         // \
> >>                                                100  (X)
> >
> >
> > How the sched_idle group in B can have the same share/weight as the
> > not sched idle one in case B ?
>
> It can't, but theoretically several SCHED_IDLE siblings can be summed up
> to match a niced SCHED_NORMAL entity.
>
>                                     B
>                                     |
>                  ---------------------------------------
>                  |     ||      ||      ||      ||      ||
>                 15      3       3       3       3       3
>
> >
> >
> >>
> >>             - Case A) The hierarchical share of h_idle tasks is indeed
> >>               small. So in this case they are just get scheduled to take
> >>               some breath, and the possibility of false positive is low
> >>               enough to be safely ignored.
> >>
> >>             - Case B) The h_idle & !h_idle tasks equally share bandwidth
> >>               which usually means the !h_idle part becomes less loaded
> >>               and pulling some load might be preferred.
> >>
> >>             - Case C) The hierarchical share of h_idle tasks dominates
> >>               which usually means their !h_idle parents are allowed to
> >>               use a big portion of bandwidth. In this case speedup the
> >>               balance is still fine because we could pull some !h_idle
> >>               tasks for the most 'important' cgroup.
> >>
> >>             So the heuristics of using rq->curr's h_idle to judge the need
> >>             of pulling (load balancing) seems fine.
> >>
> >>       And as a result cfs_rq::h_nr_idle can be removed and its maintenance
> >>       cost in hotpath can be saved.
> >>
> >> Which way do you prefer? It would be much appreciated if you can shed some
> >> light on this.
> >>
> >>>
> >>> a sched_idle_cpu should be preempted immediately otherwise it's not a
> >>> sched idle cpu and the definition is meaningless
> >>
> >> Agree.
> >>
> >> Thanks!
> >>          Abel
> >>
>
Re: Re: [RFC PATCH 1/2] Revert "sched/fair: Make sched-idle CPU selection consistent throughout"
Posted by Abel Wu 10 months, 3 weeks ago
On 3/19/25 6:58 PM, Vincent Guittot wrote:
> On Wed, 19 Mar 2025 at 11:36, Abel Wu <wuyun.abel@bytedance.com> wrote:
>>
>> What do you think to choose a less loaded cpu if no idle ones available?
>> So the wakee will probably get better served, and also helps load balance.
> 
> I'm not a fan of adding more than idle selection and we already
> compared load in wake_affine.
> So we should only look at a cpu that we can preempt immediately: idle
> cpus or cpus with full hierarchy sched idle entities

OK, I will give it a try. Thanks!