Prioritize idle cpus over SCHED_IDLE ones

[RFC PATCH 1/2] Revert "sched/fair: Make sched-idle CPU selection consistent throughout"

Posted by Abel Wu 11 months ago

This reverts commit 17346452b25b98acfb395d2a82ec2e4ad0cb7a01.

The above commit tried to unify selection policy between idle cpus
and SCHED_IDLE ones in fast- and slow-path of select_task_rq_fair()
by treating them equally (although the SCHED_IDLE cpus are turned
to be given more preference in slowpath). The test results seemed
solid, but the setup didn't take cgroup hierarchy into account,
which actually made some of our important services get affected.

The cgroup hierarchy in our production environment looks like below,
which might be common in modern containerized setup:

			  root
			/	\
		kubepods	system.slice
		/	\\		\
	guaranteed	besteffort	containerd

	(where 'X=A' means A is SCHED_IDLE cgroup)

The cpu is treated as SCHED_IDLE if only besteffort is running, which
is given at least equal preference as the idle cpus when deciding where
to run a newly woken task. But the SCHED_IDLE cpus do not necessarily
mean they can be preempted soon enough to start serving the wakee, and
containerd and other services under system.slice are the case that have
to wait in runqueue since they can not preempt kubepods, while idle cpus
are possible out there untouched.

So prioritize idle cpus over SCHED_IDLE ones to avoid undesired delay
like orchestration operations as much as possible.

Signed-off-by: Abel Wu <wuyun.abel@bytedance.com>
---
 kernel/sched/fair.c | 49 +++++++++++++++++++++++++++------------------
 1 file changed, 30 insertions(+), 19 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ae0350088ac1..379764bd2795 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7446,7 +7446,7 @@ sched_balance_find_dst_group_cpu(struct sched_group *group, struct task_struct *
 	unsigned int min_exit_latency = UINT_MAX;
 	u64 latest_idle_timestamp = 0;
 	int least_loaded_cpu = this_cpu;
-	int shallowest_idle_cpu = -1;
+	int shallowest_idle_cpu = -1, si_cpu = -1;
 	int i;
 
 	/* Check if we have any choice: */
@@ -7460,9 +7460,6 @@ sched_balance_find_dst_group_cpu(struct sched_group *group, struct task_struct *
 		if (!sched_core_cookie_match(rq, p))
 			continue;
 
-		if (sched_idle_cpu(i))
-			return i;
-
 		if (available_idle_cpu(i)) {
 			struct cpuidle_state *idle = idle_get_state(rq);
 			if (idle && idle->exit_latency < min_exit_latency) {
@@ -7484,7 +7481,12 @@ sched_balance_find_dst_group_cpu(struct sched_group *group, struct task_struct *
 				latest_idle_timestamp = rq->idle_stamp;
 				shallowest_idle_cpu = i;
 			}
-		} else if (shallowest_idle_cpu == -1) {
+		} else if (shallowest_idle_cpu == -1 && si_cpu == -1) {
+			if (sched_idle_cpu(i)) {
+				si_cpu = i;
+				continue;
+			}
+
 			load = cpu_load(cpu_rq(i));
 			if (load < min_load) {
 				min_load = load;
@@ -7493,7 +7495,11 @@ sched_balance_find_dst_group_cpu(struct sched_group *group, struct task_struct *
 		}
 	}
 
-	return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
+	if (shallowest_idle_cpu != -1)
+		return shallowest_idle_cpu;
+	if (si_cpu != -1)
+		return si_cpu;
+	return least_loaded_cpu;
 }
 
 static inline int sched_balance_find_dst_cpu(struct sched_domain *sd, struct task_struct *p,
@@ -7549,11 +7555,14 @@ static inline int sched_balance_find_dst_cpu(struct sched_domain *sd, struct tas
 	return new_cpu;
 }
 
-static inline int __select_idle_cpu(int cpu, struct task_struct *p)
+static inline int __select_idle_cpu(int cpu, struct task_struct *p, int *si_cpu)
 {
-	if ((available_idle_cpu(cpu) || sched_idle_cpu(cpu)) &&
-	    sched_cpu_cookie_match(cpu_rq(cpu), p))
+	if (!sched_cpu_cookie_match(cpu_rq(cpu), p))
+		return -1;
+	if (available_idle_cpu(cpu))
 		return cpu;
+	if (*si_cpu == -1 && sched_idle_cpu(cpu))
+		*si_cpu = cpu;
 
 	return -1;
 }
@@ -7649,7 +7658,7 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu
  */
 static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
 {
-	int cpu;
+	int cpu, si_cpu = -1;
 
 	for_each_cpu_and(cpu, cpu_smt_mask(target), p->cpus_ptr) {
 		if (cpu == target)
@@ -7660,11 +7669,13 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t
 		 */
 		if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
 			continue;
-		if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
+		if (available_idle_cpu(cpu))
 			return cpu;
+		if (si_cpu == -1 && sched_idle_cpu(cpu))
+			si_cpu = cpu;
 	}
 
-	return -1;
+	return si_cpu;
 }
 
 #else /* CONFIG_SCHED_SMT */
@@ -7680,7 +7691,7 @@ static inline bool test_idle_cores(int cpu)
 
 static inline int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu)
 {
-	return __select_idle_cpu(core, p);
+	return __select_idle_cpu(core, p, idle_cpu);
 }
 
 static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
@@ -7728,10 +7739,10 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
 						return i;
 				} else {
 					if (--nr <= 0)
-						return -1;
-					idle_cpu = __select_idle_cpu(cpu, p);
-					if ((unsigned int)idle_cpu < nr_cpumask_bits)
 						return idle_cpu;
+					i = __select_idle_cpu(cpu, p, &idle_cpu);
+					if ((unsigned int)i < nr_cpumask_bits)
+						return i;
 				}
 			}
 			cpumask_andnot(cpus, cpus, sched_group_span(sg));
@@ -7746,9 +7757,9 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
 
 		} else {
 			if (--nr <= 0)
-				return -1;
-			idle_cpu = __select_idle_cpu(cpu, p);
-			if ((unsigned int)idle_cpu < nr_cpumask_bits)
+				return idle_cpu;
+			i = __select_idle_cpu(cpu, p, &idle_cpu);
+			if ((unsigned int)i < nr_cpumask_bits)
 				break;
 		}
 	}
-- 
2.37.3

Re: [RFC PATCH 1/2] Revert "sched/fair: Make sched-idle CPU selection consistent throughout"

Posted by Vincent Guittot 11 months ago

On Mon, 10 Mar 2025 at 08:41, Abel Wu <wuyun.abel@bytedance.com> wrote:
>
> This reverts commit 17346452b25b98acfb395d2a82ec2e4ad0cb7a01.
>
> The above commit tried to unify selection policy between idle cpus
> and SCHED_IDLE ones in fast- and slow-path of select_task_rq_fair()
> by treating them equally (although the SCHED_IDLE cpus are turned
> to be given more preference in slowpath). The test results seemed
> solid, but the setup didn't take cgroup hierarchy into account,
> which actually made some of our important services get affected.
>
> The cgroup hierarchy in our production environment looks like below,
> which might be common in modern containerized setup:
>
>                           root
>                         /       \
>                 kubepods        system.slice
>                 /       \\              \
>         guaranteed      besteffort      containerd
>
>         (where 'X=A' means A is SCHED_IDLE cgroup)
>
> The cpu is treated as SCHED_IDLE if only besteffort is running, which
> is given at least equal preference as the idle cpus when deciding where
> to run a newly woken task. But the SCHED_IDLE cpus do not necessarily
> mean they can be preempted soon enough to start serving the wakee, and

Could you give us more details why the SCHED_IDLE cpu which runs only
besteffort can't be preempted soon enough ?

because kubepods vs system.slice is not sched_idle when comparing the
entities ? some maybe the definition of sched_idle_cpu should be fixed
instead

a sched_idle_cpu should be preempted immediately otherwise it's not a
sched idle cpu and the definition is meaningless

> containerd and other services under system.slice are the case that have
> to wait in runqueue since they can not preempt kubepods, while idle cpus
> are possible out there untouched.
>
> So prioritize idle cpus over SCHED_IDLE ones to avoid undesired delay
> like orchestration operations as much as possible.
>
> Signed-off-by: Abel Wu <wuyun.abel@bytedance.com>
> ---
>  kernel/sched/fair.c | 49 +++++++++++++++++++++++++++------------------
>  1 file changed, 30 insertions(+), 19 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index ae0350088ac1..379764bd2795 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -7446,7 +7446,7 @@ sched_balance_find_dst_group_cpu(struct sched_group *group, struct task_struct *
>         unsigned int min_exit_latency = UINT_MAX;
>         u64 latest_idle_timestamp = 0;
>         int least_loaded_cpu = this_cpu;
> -       int shallowest_idle_cpu = -1;
> +       int shallowest_idle_cpu = -1, si_cpu = -1;
>         int i;
>
>         /* Check if we have any choice: */
> @@ -7460,9 +7460,6 @@ sched_balance_find_dst_group_cpu(struct sched_group *group, struct task_struct *
>                 if (!sched_core_cookie_match(rq, p))
>                         continue;
>
> -               if (sched_idle_cpu(i))
> -                       return i;
> -
>                 if (available_idle_cpu(i)) {
>                         struct cpuidle_state *idle = idle_get_state(rq);
>                         if (idle && idle->exit_latency < min_exit_latency) {
> @@ -7484,7 +7481,12 @@ sched_balance_find_dst_group_cpu(struct sched_group *group, struct task_struct *
>                                 latest_idle_timestamp = rq->idle_stamp;
>                                 shallowest_idle_cpu = i;
>                         }
> -               } else if (shallowest_idle_cpu == -1) {
> +               } else if (shallowest_idle_cpu == -1 && si_cpu == -1) {
> +                       if (sched_idle_cpu(i)) {
> +                               si_cpu = i;
> +                               continue;
> +                       }
> +
>                         load = cpu_load(cpu_rq(i));
>                         if (load < min_load) {
>                                 min_load = load;
> @@ -7493,7 +7495,11 @@ sched_balance_find_dst_group_cpu(struct sched_group *group, struct task_struct *
>                 }
>         }
>
> -       return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
> +       if (shallowest_idle_cpu != -1)
> +               return shallowest_idle_cpu;
> +       if (si_cpu != -1)
> +               return si_cpu;
> +       return least_loaded_cpu;
>  }
>
>  static inline int sched_balance_find_dst_cpu(struct sched_domain *sd, struct task_struct *p,
> @@ -7549,11 +7555,14 @@ static inline int sched_balance_find_dst_cpu(struct sched_domain *sd, struct tas
>         return new_cpu;
>  }
>
> -static inline int __select_idle_cpu(int cpu, struct task_struct *p)
> +static inline int __select_idle_cpu(int cpu, struct task_struct *p, int *si_cpu)
>  {
> -       if ((available_idle_cpu(cpu) || sched_idle_cpu(cpu)) &&
> -           sched_cpu_cookie_match(cpu_rq(cpu), p))
> +       if (!sched_cpu_cookie_match(cpu_rq(cpu), p))
> +               return -1;
> +       if (available_idle_cpu(cpu))
>                 return cpu;
> +       if (*si_cpu == -1 && sched_idle_cpu(cpu))
> +               *si_cpu = cpu;
>
>         return -1;
>  }
> @@ -7649,7 +7658,7 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu
>   */
>  static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
>  {
> -       int cpu;
> +       int cpu, si_cpu = -1;
>
>         for_each_cpu_and(cpu, cpu_smt_mask(target), p->cpus_ptr) {
>                 if (cpu == target)
> @@ -7660,11 +7669,13 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t
>                  */
>                 if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
>                         continue;
> -               if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
> +               if (available_idle_cpu(cpu))
>                         return cpu;
> +               if (si_cpu == -1 && sched_idle_cpu(cpu))
> +                       si_cpu = cpu;
>         }
>
> -       return -1;
> +       return si_cpu;
>  }
>
>  #else /* CONFIG_SCHED_SMT */
> @@ -7680,7 +7691,7 @@ static inline bool test_idle_cores(int cpu)
>
>  static inline int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu)
>  {
> -       return __select_idle_cpu(core, p);
> +       return __select_idle_cpu(core, p, idle_cpu);
>  }
>
>  static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
> @@ -7728,10 +7739,10 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
>                                                 return i;
>                                 } else {
>                                         if (--nr <= 0)
> -                                               return -1;
> -                                       idle_cpu = __select_idle_cpu(cpu, p);
> -                                       if ((unsigned int)idle_cpu < nr_cpumask_bits)
>                                                 return idle_cpu;
> +                                       i = __select_idle_cpu(cpu, p, &idle_cpu);
> +                                       if ((unsigned int)i < nr_cpumask_bits)
> +                                               return i;
>                                 }
>                         }
>                         cpumask_andnot(cpus, cpus, sched_group_span(sg));
> @@ -7746,9 +7757,9 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
>
>                 } else {
>                         if (--nr <= 0)
> -                               return -1;
> -                       idle_cpu = __select_idle_cpu(cpu, p);
> -                       if ((unsigned int)idle_cpu < nr_cpumask_bits)
> +                               return idle_cpu;
> +                       i = __select_idle_cpu(cpu, p, &idle_cpu);
> +                       if ((unsigned int)i < nr_cpumask_bits)
>                                 break;
>                 }
>         }
> --
> 2.37.3
>