When doing load balance and the target cfs_rq is in a throttled
hierarchy, it is debatable whether balancing there should be allowed.
The upside of allowing it: if the target CPU is idle or less loaded
and the task being balanced is holding kernel resources, then
balancing the task there lets it get CPU time earlier and release
those resources sooner. The downside: if the task is not holding any
kernel resources, the migration gains little.
While theoretically debatable, a performance test[0] involving 200
cgroups, each running hackbench (20 senders, 20 receivers) in pipe
mode, showed a performance regression on AMD Genoa when load balancing
to throttled cfs_rqs was allowed. Analysis[1] showed that hackbench
suffers from task migrations across the LLC boundary. For this reason,
add a check in can_migrate_task() to forbid balancing to a cfs_rq in a
throttled hierarchy. This greatly reduced task migrations and restored
performance.
[0]: https://lore.kernel.org/lkml/20250822110701.GB289@bytedance/
[1]: https://lore.kernel.org/lkml/20250903101102.GB42@bytedance/
Signed-off-by: Aaron Lu <ziqianlu@bytedance.com>
---
kernel/sched/fair.c | 13 +++++++++----
1 file changed, 9 insertions(+), 4 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3dbdfaa697477..00ee59993b6a3 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9369,14 +9369,19 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
/*
* We do not migrate tasks that are:
* 1) delayed dequeued unless we migrate load, or
- * 2) cannot be migrated to this CPU due to cpus_ptr, or
- * 3) running (obviously), or
- * 4) are cache-hot on their current CPU, or
- * 5) are blocked on mutexes (if SCHED_PROXY_EXEC is enabled)
+ * 2) target cfs_rq is in throttled hierarchy, or
+ * 3) cannot be migrated to this CPU due to cpus_ptr, or
+ * 4) running (obviously), or
+ * 5) are cache-hot on their current CPU, or
+ * 6) are blocked on mutexes (if SCHED_PROXY_EXEC is enabled)
*/
if ((p->se.sched_delayed) && (env->migration_type != migrate_load))
return 0;
+ if (task_group(p) &&
+ throttled_hierarchy(task_group(p)->cfs_rq[env->dst_cpu]))
+ return 0;
+
/*
* We want to prioritize the migration of eligible tasks.
* For ineligible tasks we soft-limit them and only allow
--
2.39.5
Hi Aaron,
kernel test robot noticed the following build errors:
[auto build test ERROR on 5b726e9bf9544a349090879a513a5e00da486c14]
url: https://github.com/intel-lab-lkp/linux/commits/Aaron-Lu/sched-fair-Propagate-load-for-throttled-cfs_rq/20250910-175310
base: 5b726e9bf9544a349090879a513a5e00da486c14
patch link: https://lore.kernel.org/r/20250910095044.278-5-ziqianlu%40bytedance.com
patch subject: [PATCH 4/4] sched/fair: Do not balance task to a throttled cfs_rq
config: i386-randconfig-012-20250911 (https://download.01.org/0day-ci/archive/20250911/202509110908.a2P8HZ8A-lkp@intel.com/config)
compiler: clang version 20.1.8 (https://github.com/llvm/llvm-project 87f0227cb60147a26a1eeb4fb06e3b505e9c7261)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250911/202509110908.a2P8HZ8A-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202509110908.a2P8HZ8A-lkp@intel.com/
All errors (new ones prefixed by >>):
>> kernel/sched/fair.c:9382:41: error: no member named 'cfs_rq' in 'struct task_group'
9382 | throttled_hierarchy(task_group(p)->cfs_rq[env->dst_cpu]))
| ~~~~~~~~~~~~~ ^
1 error generated.
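For context, the failure comes from the fact that struct task_group
only defines its per-CPU cfs_rq pointer array when
CONFIG_FAIR_GROUP_SCHED is enabled, so the unconditional dereference
cannot compile on a !FAIR_GROUP_SCHED randconfig like this one. A
simplified sketch of the pattern (illustrative, not the exact kernel
layout; the helper name tg_cfs_rq is made up here):

struct cfs_rq;

struct task_group {
	long placeholder;	/* many members elided */
#ifdef CONFIG_FAIR_GROUP_SCHED
	/* runqueue "owned" by this group on each CPU */
	struct cfs_rq **cfs_rq;
#endif
};

static inline struct cfs_rq *tg_cfs_rq(struct task_group *tg, int cpu)
{
	/* Compiles only when CONFIG_FAIR_GROUP_SCHED is defined;
	 * otherwise it fails exactly like the report above. */
	return tg->cfs_rq[cpu];
}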
vim +9382 kernel/sched/fair.c
9356
9357 /*
9358 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
9359 */
9360 static
9361 int can_migrate_task(struct task_struct *p, struct lb_env *env)
9362 {
9363 long degrades, hot;
9364
9365 lockdep_assert_rq_held(env->src_rq);
9366 if (p->sched_task_hot)
9367 p->sched_task_hot = 0;
9368
9369 /*
9370 * We do not migrate tasks that are:
9371 * 1) delayed dequeued unless we migrate load, or
9372 * 2) target cfs_rq is in throttled hierarchy, or
9373 * 3) cannot be migrated to this CPU due to cpus_ptr, or
9374 * 4) running (obviously), or
9375 * 5) are cache-hot on their current CPU, or
9376 * 6) are blocked on mutexes (if SCHED_PROXY_EXEC is enabled)
9377 */
9378 if ((p->se.sched_delayed) && (env->migration_type != migrate_load))
9379 return 0;
9380
9381 if (task_group(p) &&
> 9382 throttled_hierarchy(task_group(p)->cfs_rq[env->dst_cpu]))
9383 return 0;
9384
9385 /*
9386 * We want to prioritize the migration of eligible tasks.
9387 * For ineligible tasks we soft-limit them and only allow
9388 * them to migrate when nr_balance_failed is non-zero to
9389 * avoid load-balancing trying very hard to balance the load.
9390 */
9391 if (!env->sd->nr_balance_failed &&
9392 task_is_ineligible_on_dst_cpu(p, env->dst_cpu))
9393 return 0;
9394
9395 /* Disregard percpu kthreads; they are where they need to be. */
9396 if (kthread_is_per_cpu(p))
9397 return 0;
9398
9399 if (task_is_blocked(p))
9400 return 0;
9401
9402 if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) {
9403 int cpu;
9404
9405 schedstat_inc(p->stats.nr_failed_migrations_affine);
9406
9407 env->flags |= LBF_SOME_PINNED;
9408
9409 /*
9410 * Remember if this task can be migrated to any other CPU in
9411 * our sched_group. We may want to revisit it if we couldn't
9412 * meet load balance goals by pulling other tasks on src_cpu.
9413 *
9414 * Avoid computing new_dst_cpu
9415 * - for NEWLY_IDLE
9416 * - if we have already computed one in current iteration
9417 * - if it's an active balance
9418 */
9419 if (env->idle == CPU_NEWLY_IDLE ||
9420 env->flags & (LBF_DST_PINNED | LBF_ACTIVE_LB))
9421 return 0;
9422
9423 /* Prevent to re-select dst_cpu via env's CPUs: */
9424 cpu = cpumask_first_and_and(env->dst_grpmask, env->cpus, p->cpus_ptr);
9425
9426 if (cpu < nr_cpu_ids) {
9427 env->flags |= LBF_DST_PINNED;
9428 env->new_dst_cpu = cpu;
9429 }
9430
9431 return 0;
9432 }
9433
9434 /* Record that we found at least one task that could run on dst_cpu */
9435 env->flags &= ~LBF_ALL_PINNED;
9436
9437 if (task_on_cpu(env->src_rq, p) ||
9438 task_current_donor(env->src_rq, p)) {
9439 schedstat_inc(p->stats.nr_failed_migrations_running);
9440 return 0;
9441 }
9442
9443 /*
9444 * Aggressive migration if:
9445 * 1) active balance
9446 * 2) destination numa is preferred
9447 * 3) task is cache cold, or
9448 * 4) too many balance attempts have failed.
9449 */
9450 if (env->flags & LBF_ACTIVE_LB)
9451 return 1;
9452
9453 degrades = migrate_degrades_locality(p, env);
9454 if (!degrades)
9455 hot = task_hot(p, env);
9456 else
9457 hot = degrades > 0;
9458
9459 if (!hot || env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
9460 if (hot)
9461 p->sched_task_hot = 1;
9462 return 1;
9463 }
9464
9465 schedstat_inc(p->stats.nr_failed_migrations_hot);
9466 return 0;
9467 }
9468
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
When doing load balance and the target cfs_rq is in a throttled
hierarchy, it is debatable whether balancing there should be allowed.
The upside of allowing it: if the target CPU is idle or less loaded
and the task being balanced is holding kernel resources, then
balancing the task there lets it get CPU time earlier and release
those resources sooner. The downside: if the task is not holding any
kernel resources, the migration gains little.
While theoretically debatable, a performance test[0] involving 200
cgroups, each running hackbench (20 senders, 20 receivers) in pipe
mode, showed a performance regression on AMD Genoa when load balancing
to throttled cfs_rqs was allowed. Analysis[1] showed that hackbench
suffers from task migrations across the LLC boundary. For this reason,
add a check in can_migrate_task() to forbid balancing to a cfs_rq in a
throttled hierarchy. This greatly reduced task migrations and restored
performance.
[0]: https://lore.kernel.org/lkml/20250822110701.GB289@bytedance/
[1]: https://lore.kernel.org/lkml/20250903101102.GB42@bytedance/
Signed-off-by: Aaron Lu <ziqianlu@bytedance.com>
---
update: fix build error reported by kernel test robot when
CONFIG_FAIR_GROUP_SCHED is not set.
kernel/sched/fair.c | 22 ++++++++++++++++++----
1 file changed, 18 insertions(+), 4 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3dbdfaa697477..18a30ae35441a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5737,6 +5737,11 @@ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
return cfs_bandwidth_used() && cfs_rq->throttle_count;
}
+static inline int lb_throttled_hierarchy(struct task_struct *p, int dst_cpu)
+{
+ return throttled_hierarchy(task_group(p)->cfs_rq[dst_cpu]);
+}
+
static inline bool task_is_throttled(struct task_struct *p)
{
return cfs_bandwidth_used() && p->throttled;
@@ -6733,6 +6738,11 @@ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
return 0;
}
+static inline int lb_throttled_hierarchy(struct task_struct *p, int dst_cpu)
+{
+ return 0;
+}
+
#ifdef CONFIG_FAIR_GROUP_SCHED
void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *parent) {}
static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
@@ -9369,14 +9379,18 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
/*
* We do not migrate tasks that are:
* 1) delayed dequeued unless we migrate load, or
- * 2) cannot be migrated to this CPU due to cpus_ptr, or
- * 3) running (obviously), or
- * 4) are cache-hot on their current CPU, or
- * 5) are blocked on mutexes (if SCHED_PROXY_EXEC is enabled)
+ * 2) target cfs_rq is in throttled hierarchy, or
+ * 3) cannot be migrated to this CPU due to cpus_ptr, or
+ * 4) running (obviously), or
+ * 5) are cache-hot on their current CPU, or
+ * 6) are blocked on mutexes (if SCHED_PROXY_EXEC is enabled)
*/
if ((p->se.sched_delayed) && (env->migration_type != migrate_load))
return 0;
+ if (lb_throttled_hierarchy(p, env->dst_cpu))
+ return 0;
+
/*
* We want to prioritize the migration of eligible tasks.
* For ineligible tasks we soft-limit them and only allow
--
2.39.5
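The v2 fix above follows the usual kernel pattern for config-dependent
code: route the access through a small helper
(lb_throttled_hierarchy()) and pair it with a no-op stub in the
!CONFIG_CFS_BANDWIDTH section; since CFS_BANDWIDTH depends on
FAIR_GROUP_SCHED, the stub also covers the failing randconfig. A
minimal, self-contained sketch of that pattern (standalone C with
illustrative names, not kernel code):

#include <stdio.h>

/* Comment this out to mimic CONFIG_CFS_BANDWIDTH=n. */
#define CONFIG_CFS_BANDWIDTH

struct cfs_rq {
	int throttle_count;
};

#ifdef CONFIG_CFS_BANDWIDTH
static int throttled_hierarchy(const struct cfs_rq *cfs_rq)
{
	return cfs_rq->throttle_count != 0;
}
#else
static int throttled_hierarchy(const struct cfs_rq *cfs_rq)
{
	(void)cfs_rq;
	return 0;	/* feature compiled out: never throttled */
}
#endif

int main(void)
{
	struct cfs_rq rq = { .throttle_count = 1 };

	/* Callers stay free of #ifdefs either way. */
	printf("throttled: %d\n", throttled_hierarchy(&rq));
	return 0;
}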
Hello Aaron,
On 9/12/2025 9:14 AM, Aaron Lu wrote:
> When doing load balance and the target cfs_rq is in a throttled
> hierarchy, it is debatable whether balancing there should be allowed.
>
> The upside of allowing it: if the target CPU is idle or less loaded
> and the task being balanced is holding kernel resources, then
> balancing the task there lets it get CPU time earlier and release
> those resources sooner. The downside: if the task is not holding any
> kernel resources, the migration gains little.
>
> While theoretically debatable, a performance test[0] involving 200
> cgroups, each running hackbench (20 senders, 20 receivers) in pipe
> mode, showed a performance regression on AMD Genoa when load
> balancing to throttled cfs_rqs was allowed. Analysis[1] showed that
> hackbench suffers from task migrations across the LLC boundary. For
> this reason, add a check in can_migrate_task() to forbid balancing
> to a cfs_rq in a throttled hierarchy. This greatly reduced task
> migrations and restored performance.
>
> [0]: https://lore.kernel.org/lkml/20250822110701.GB289@bytedance/
> [1]: https://lore.kernel.org/lkml/20250903101102.GB42@bytedance/
> Signed-off-by: Aaron Lu <ziqianlu@bytedance.com>
Thank you for updating the patch. Feel free to include:
Reviewed-by: K Prateek Nayak <kprateek.nayak@amd.com>
--
Thanks and Regards,
Prateek
> ---
> update: fix build error reported by kernel test robot when
> CONFIG_FAIR_GROUP_SCHED is not set.
>
> kernel/sched/fair.c | 22 ++++++++++++++++++----
> 1 file changed, 18 insertions(+), 4 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 3dbdfaa697477..18a30ae35441a 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5737,6 +5737,11 @@ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
> return cfs_bandwidth_used() && cfs_rq->throttle_count;
> }
>
> +static inline int lb_throttled_hierarchy(struct task_struct *p, int dst_cpu)
> +{
> + return throttled_hierarchy(task_group(p)->cfs_rq[dst_cpu]);
> +}
> +
> static inline bool task_is_throttled(struct task_struct *p)
> {
> return cfs_bandwidth_used() && p->throttled;
> @@ -6733,6 +6738,11 @@ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
> return 0;
> }
>
> +static inline int lb_throttled_hierarchy(struct task_struct *p, int dst_cpu)
> +{
> + return 0;
> +}
> +
> #ifdef CONFIG_FAIR_GROUP_SCHED
> void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *parent) {}
> static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
> @@ -9369,14 +9379,18 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
> /*
> * We do not migrate tasks that are:
> * 1) delayed dequeued unless we migrate load, or
> - * 2) cannot be migrated to this CPU due to cpus_ptr, or
> - * 3) running (obviously), or
> - * 4) are cache-hot on their current CPU, or
> - * 5) are blocked on mutexes (if SCHED_PROXY_EXEC is enabled)
> + * 2) target cfs_rq is in throttled hierarchy, or
> + * 3) cannot be migrated to this CPU due to cpus_ptr, or
> + * 4) running (obviously), or
> + * 5) are cache-hot on their current CPU, or
> + * 6) are blocked on mutexes (if SCHED_PROXY_EXEC is enabled)
> */
> if ((p->se.sched_delayed) && (env->migration_type != migrate_load))
> return 0;
>
> + if (lb_throttled_hierarchy(p, env->dst_cpu))
> + return 0;
> +
> /*
> * We want to prioritize the migration of eligible tasks.
> * For ineligible tasks we soft-limit them and only allow
The following commit has been merged into the sched/core branch of tip:
Commit-ID: 0d4eaf8caf8cd633b23e949e2996b420052c2d45
Gitweb: https://git.kernel.org/tip/0d4eaf8caf8cd633b23e949e2996b420052c2d45
Author: Aaron Lu <ziqianlu@bytedance.com>
AuthorDate: Fri, 12 Sep 2025 11:44:28 +08:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Mon, 15 Sep 2025 09:38:38 +02:00
sched/fair: Do not balance task to a throttled cfs_rq
When doing load balance and the target cfs_rq is in a throttled
hierarchy, it is debatable whether balancing there should be allowed.
The upside of allowing it: if the target CPU is idle or less loaded
and the task being balanced is holding kernel resources, then
balancing the task there lets it get CPU time earlier and release
those resources sooner. The downside: if the task is not holding any
kernel resources, the migration gains little.
While theoretically debatable, a performance test[0] involving 200
cgroups, each running hackbench (20 senders, 20 receivers) in pipe
mode, showed a performance regression on AMD Genoa when load balancing
to throttled cfs_rqs was allowed. Analysis[1] showed that hackbench
suffers from task migrations across the LLC boundary. For this reason,
add a check in can_migrate_task() to forbid balancing to a cfs_rq in a
throttled hierarchy. This greatly reduced task migrations and restored
performance.
[0]: https://lore.kernel.org/lkml/20250822110701.GB289@bytedance/
[1]: https://lore.kernel.org/lkml/20250903101102.GB42@bytedance/
Signed-off-by: Aaron Lu <ziqianlu@bytedance.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: K Prateek Nayak <kprateek.nayak@amd.com>
---
kernel/sched/fair.c | 22 ++++++++++++++++++----
1 file changed, 18 insertions(+), 4 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3dbdfaa..18a30ae 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5737,6 +5737,11 @@ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
return cfs_bandwidth_used() && cfs_rq->throttle_count;
}
+static inline int lb_throttled_hierarchy(struct task_struct *p, int dst_cpu)
+{
+ return throttled_hierarchy(task_group(p)->cfs_rq[dst_cpu]);
+}
+
static inline bool task_is_throttled(struct task_struct *p)
{
return cfs_bandwidth_used() && p->throttled;
@@ -6733,6 +6738,11 @@ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
return 0;
}
+static inline int lb_throttled_hierarchy(struct task_struct *p, int dst_cpu)
+{
+ return 0;
+}
+
#ifdef CONFIG_FAIR_GROUP_SCHED
void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *parent) {}
static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
@@ -9369,14 +9379,18 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
/*
* We do not migrate tasks that are:
* 1) delayed dequeued unless we migrate load, or
- * 2) cannot be migrated to this CPU due to cpus_ptr, or
- * 3) running (obviously), or
- * 4) are cache-hot on their current CPU, or
- * 5) are blocked on mutexes (if SCHED_PROXY_EXEC is enabled)
+ * 2) target cfs_rq is in throttled hierarchy, or
+ * 3) cannot be migrated to this CPU due to cpus_ptr, or
+ * 4) running (obviously), or
+ * 5) are cache-hot on their current CPU, or
+ * 6) are blocked on mutexes (if SCHED_PROXY_EXEC is enabled)
*/
if ((p->se.sched_delayed) && (env->migration_type != migrate_load))
return 0;
+ if (lb_throttled_hierarchy(p, env->dst_cpu))
+ return 0;
+
/*
* We want to prioritize the migration of eligible tasks.
* For ineligible tasks we soft-limit them and only allow