When doing load balance and the target cfs_rq is in a throttled
hierarchy, it is an open question whether balancing there should be
allowed.

The case for allowing it: if the target CPU is idle or less loaded and
the task being balanced is holding kernel resources, balancing the task
there lets it get on a CPU earlier and release those resources sooner.
The case against: if the task is not holding any kernel resources, the
migration accomplishes little, since the task will soon be throttled on
the target anyway.

While theoretically debatable, a performance test[0] involving 200
cgroups, each running hackbench (20 senders, 20 receivers) in pipe
mode, showed a performance regression on AMD Genoa when balancing to
throttled cfs_rqs was allowed. Analysis[1] showed that hackbench
suffers from task migrations across LLC boundaries. For this reason,
add a check in can_migrate_task() to forbid balancing to a cfs_rq in a
throttled hierarchy. This greatly reduced task migrations and restored
performance.

[0]: https://lore.kernel.org/lkml/20250822110701.GB289@bytedance/
[1]: https://lore.kernel.org/lkml/20250903101102.GB42@bytedance/
Signed-off-by: Aaron Lu <ziqianlu@bytedance.com>
---
kernel/sched/fair.c | 13 +++++++++----
1 file changed, 9 insertions(+), 4 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3dbdfaa697477..00ee59993b6a3 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9369,14 +9369,19 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
/*
* We do not migrate tasks that are:
* 1) delayed dequeued unless we migrate load, or
- * 2) cannot be migrated to this CPU due to cpus_ptr, or
- * 3) running (obviously), or
- * 4) are cache-hot on their current CPU, or
- * 5) are blocked on mutexes (if SCHED_PROXY_EXEC is enabled)
+ * 2) target cfs_rq is in throttled hierarchy, or
+ * 3) cannot be migrated to this CPU due to cpus_ptr, or
+ * 4) running (obviously), or
+ * 5) are cache-hot on their current CPU, or
+ * 6) are blocked on mutexes (if SCHED_PROXY_EXEC is enabled)
*/
if ((p->se.sched_delayed) && (env->migration_type != migrate_load))
return 0;
+ if (task_group(p) &&
+ throttled_hierarchy(task_group(p)->cfs_rq[env->dst_cpu]))
+ return 0;
+
/*
* We want to prioritize the migration of eligible tasks.
* For ineligible tasks we soft-limit them and only allow
--
2.39.5
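Conceptually, the new check walks from the task's group to that group's
cfs_rq on the destination CPU and rejects the migration if that cfs_rq
sits in a throttled hierarchy. Below is a minimal standalone sketch of
that walk; field names mirror the kernel's, but everything else is
simplified for illustration and is not the kernel code itself:

/*
 * Simplified model of the structures the check walks. Field names
 * mirror the kernel's; sizes and helpers are illustrative only.
 */
#include <stdio.h>

#define NR_CPUS 4

struct cfs_rq {
	/*
	 * Non-zero when this cfs_rq or any ancestor in the cgroup
	 * hierarchy is throttled; the kernel propagates this count
	 * down the tree when a group runs out of CPU quota.
	 */
	int throttle_count;
};

struct task_group {
	/* one cfs_rq per CPU for this group */
	struct cfs_rq *cfs_rq[NR_CPUS];
};

static int throttled_hierarchy(struct cfs_rq *cfs_rq)
{
	return cfs_rq->throttle_count;
}

int main(void)
{
	struct cfs_rq rq_cpu0 = { .throttle_count = 0 };
	struct cfs_rq rq_cpu1 = { .throttle_count = 1 };	/* throttled */
	struct task_group tg = { .cfs_rq = { &rq_cpu0, &rq_cpu1 } };

	/* The patch rejects migration when the dst CPU's cfs_rq is throttled: */
	printf("balance to cpu0 allowed: %d\n", !throttled_hierarchy(tg.cfs_rq[0]));
	printf("balance to cpu1 allowed: %d\n", !throttled_hierarchy(tg.cfs_rq[1]));
	return 0;
}

In the kernel, throttled_hierarchy() additionally gates on
cfs_bandwidth_used(); because throttle_count accumulates throttled
ancestors, a single test on the target cfs_rq covers the whole
hierarchy above it.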
Hi Aaron,

kernel test robot noticed the following build errors:

[auto build test ERROR on 5b726e9bf9544a349090879a513a5e00da486c14]

url:    https://github.com/intel-lab-lkp/linux/commits/Aaron-Lu/sched-fair-Propagate-load-for-throttled-cfs_rq/20250910-175310
base:   5b726e9bf9544a349090879a513a5e00da486c14
patch link:    https://lore.kernel.org/r/20250910095044.278-5-ziqianlu%40bytedance.com
patch subject: [PATCH 4/4] sched/fair: Do not balance task to a throttled cfs_rq
config: i386-randconfig-012-20250911 (https://download.01.org/0day-ci/archive/20250911/202509110908.a2P8HZ8A-lkp@intel.com/config)
compiler: clang version 20.1.8 (https://github.com/llvm/llvm-project 87f0227cb60147a26a1eeb4fb06e3b505e9c7261)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250911/202509110908.a2P8HZ8A-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202509110908.a2P8HZ8A-lkp@intel.com/

All errors (new ones prefixed by >>):

>> kernel/sched/fair.c:9382:41: error: no member named 'cfs_rq' in 'struct task_group'
    9382 |             throttled_hierarchy(task_group(p)->cfs_rq[env->dst_cpu]))
         |                                 ~~~~~~~~~~~~~ ^
   1 error generated.


vim +9382 kernel/sched/fair.c

  9356
  9357	/*
  9358	 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
  9359	 */
  9360	static
  9361	int can_migrate_task(struct task_struct *p, struct lb_env *env)
  9362	{
  9363		long degrades, hot;
  9364
  9365		lockdep_assert_rq_held(env->src_rq);
  9366		if (p->sched_task_hot)
  9367			p->sched_task_hot = 0;
  9368
  9369		/*
  9370		 * We do not migrate tasks that are:
  9371		 * 1) delayed dequeued unless we migrate load, or
  9372		 * 2) target cfs_rq is in throttled hierarchy, or
  9373		 * 3) cannot be migrated to this CPU due to cpus_ptr, or
  9374		 * 4) running (obviously), or
  9375		 * 5) are cache-hot on their current CPU, or
  9376		 * 6) are blocked on mutexes (if SCHED_PROXY_EXEC is enabled)
  9377		 */
  9378		if ((p->se.sched_delayed) && (env->migration_type != migrate_load))
  9379			return 0;
  9380
  9381		if (task_group(p) &&
> 9382		    throttled_hierarchy(task_group(p)->cfs_rq[env->dst_cpu]))
  9383			return 0;
  9384
  9385		/*
  9386		 * We want to prioritize the migration of eligible tasks.
  9387		 * For ineligible tasks we soft-limit them and only allow
  9388		 * them to migrate when nr_balance_failed is non-zero to
  9389		 * avoid load-balancing trying very hard to balance the load.
  9390		 */
  9391		if (!env->sd->nr_balance_failed &&
  9392		    task_is_ineligible_on_dst_cpu(p, env->dst_cpu))
  9393			return 0;
  9394
  9395		/* Disregard percpu kthreads; they are where they need to be. */
  9396		if (kthread_is_per_cpu(p))
  9397			return 0;
  9398
  9399		if (task_is_blocked(p))
  9400			return 0;
  9401
  9402		if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) {
  9403			int cpu;
  9404
  9405			schedstat_inc(p->stats.nr_failed_migrations_affine);
  9406
  9407			env->flags |= LBF_SOME_PINNED;
  9408
  9409			/*
  9410			 * Remember if this task can be migrated to any other CPU in
  9411			 * our sched_group. We may want to revisit it if we couldn't
  9412			 * meet load balance goals by pulling other tasks on src_cpu.
  9413			 *
  9414			 * Avoid computing new_dst_cpu
  9415			 * - for NEWLY_IDLE
  9416			 * - if we have already computed one in current iteration
  9417			 * - if it's an active balance
  9418			 */
  9419			if (env->idle == CPU_NEWLY_IDLE ||
  9420			    env->flags & (LBF_DST_PINNED | LBF_ACTIVE_LB))
  9421				return 0;
  9422
  9423			/* Prevent to re-select dst_cpu via env's CPUs: */
  9424			cpu = cpumask_first_and_and(env->dst_grpmask, env->cpus, p->cpus_ptr);
  9425
  9426			if (cpu < nr_cpu_ids) {
  9427				env->flags |= LBF_DST_PINNED;
  9428				env->new_dst_cpu = cpu;
  9429			}
  9430
  9431			return 0;
  9432		}
  9433
  9434		/* Record that we found at least one task that could run on dst_cpu */
  9435		env->flags &= ~LBF_ALL_PINNED;
  9436
  9437		if (task_on_cpu(env->src_rq, p) ||
  9438		    task_current_donor(env->src_rq, p)) {
  9439			schedstat_inc(p->stats.nr_failed_migrations_running);
  9440			return 0;
  9441		}
  9442
  9443		/*
  9444		 * Aggressive migration if:
  9445		 * 1) active balance
  9446		 * 2) destination numa is preferred
  9447		 * 3) task is cache cold, or
  9448		 * 4) too many balance attempts have failed.
  9449		 */
  9450		if (env->flags & LBF_ACTIVE_LB)
  9451			return 1;
  9452
  9453		degrades = migrate_degrades_locality(p, env);
  9454		if (!degrades)
  9455			hot = task_hot(p, env);
  9456		else
  9457			hot = degrades > 0;
  9458
  9459		if (!hot || env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
  9460			if (hot)
  9461				p->sched_task_hot = 1;
  9462			return 1;
  9463		}
  9464
  9465		schedstat_inc(p->stats.nr_failed_migrations_hot);
  9466		return 0;
  9467	}

--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
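The error is a config interaction: the robot's i386 randconfig has
CONFIG_FAIR_GROUP_SCHED disabled, and struct task_group only carries
its per-CPU scheduling entities and cfs_rq pointers under that option,
so the unguarded task_group(p)->cfs_rq[env->dst_cpu] dereference cannot
compile there. An abridged sketch (unrelated fields trimmed):

/* Abridged from kernel/sched/sched.h; unrelated fields trimmed. */
struct task_group {
	struct cgroup_subsys_state css;

#ifdef CONFIG_FAIR_GROUP_SCHED
	/* schedulable entities of this group on each CPU */
	struct sched_entity	**se;
	/* runqueue "owned" by this group on each CPU */
	struct cfs_rq		**cfs_rq;
#endif
	/* ... */
};

The v2 below resolves this by moving the check behind a helper whose
!CONFIG_CFS_BANDWIDTH stub returns 0; since CONFIG_CFS_BANDWIDTH
depends on CONFIG_FAIR_GROUP_SCHED, the dereference is only compiled
when the cfs_rq array exists.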
When doing load balance and the target cfs_rq is in a throttled
hierarchy, it is an open question whether balancing there should be
allowed.

The case for allowing it: if the target CPU is idle or less loaded and
the task being balanced is holding kernel resources, balancing the task
there lets it get on a CPU earlier and release those resources sooner.
The case against: if the task is not holding any kernel resources, the
migration accomplishes little, since the task will soon be throttled on
the target anyway.

While theoretically debatable, a performance test[0] involving 200
cgroups, each running hackbench (20 senders, 20 receivers) in pipe
mode, showed a performance regression on AMD Genoa when balancing to
throttled cfs_rqs was allowed. Analysis[1] showed that hackbench
suffers from task migrations across LLC boundaries. For this reason,
add a check in can_migrate_task() to forbid balancing to a cfs_rq in a
throttled hierarchy. This greatly reduced task migrations and restored
performance.

[0]: https://lore.kernel.org/lkml/20250822110701.GB289@bytedance/
[1]: https://lore.kernel.org/lkml/20250903101102.GB42@bytedance/
Signed-off-by: Aaron Lu <ziqianlu@bytedance.com>
---
update: fix build error reported by kernel test robot when
CONFIG_FAIR_GROUP_SCHED is not set.
kernel/sched/fair.c | 22 ++++++++++++++++++----
1 file changed, 18 insertions(+), 4 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3dbdfaa697477..18a30ae35441a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5737,6 +5737,11 @@ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
return cfs_bandwidth_used() && cfs_rq->throttle_count;
}
+static inline int lb_throttled_hierarchy(struct task_struct *p, int dst_cpu)
+{
+ return throttled_hierarchy(task_group(p)->cfs_rq[dst_cpu]);
+}
+
static inline bool task_is_throttled(struct task_struct *p)
{
return cfs_bandwidth_used() && p->throttled;
@@ -6733,6 +6738,11 @@ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
return 0;
}
+static inline int lb_throttled_hierarchy(struct task_struct *p, int dst_cpu)
+{
+ return 0;
+}
+
#ifdef CONFIG_FAIR_GROUP_SCHED
void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *parent) {}
static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
@@ -9369,14 +9379,18 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
/*
* We do not migrate tasks that are:
* 1) delayed dequeued unless we migrate load, or
- * 2) cannot be migrated to this CPU due to cpus_ptr, or
- * 3) running (obviously), or
- * 4) are cache-hot on their current CPU, or
- * 5) are blocked on mutexes (if SCHED_PROXY_EXEC is enabled)
+ * 2) target cfs_rq is in throttled hierarchy, or
+ * 3) cannot be migrated to this CPU due to cpus_ptr, or
+ * 4) running (obviously), or
+ * 5) are cache-hot on their current CPU, or
+ * 6) are blocked on mutexes (if SCHED_PROXY_EXEC is enabled)
*/
if ((p->se.sched_delayed) && (env->migration_type != migrate_load))
return 0;
+ if (lb_throttled_hierarchy(p, env->dst_cpu))
+ return 0;
+
/*
* We want to prioritize the migration of eligible tasks.
* For ineligible tasks we soft-limit them and only allow
--
2.39.5
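The stub pattern also means the check costs nothing when bandwidth
control is compiled out. Below is a minimal, self-contained model of
that pattern; the macro name CFS_BANDWIDTH_MODEL and the throttle_state
array are illustrative stand-ins, not kernel identifiers:

/*
 * Model of the two lb_throttled_hierarchy() definitions. Compile with
 * -DCFS_BANDWIDTH_MODEL to get the "real" variant; without it, the
 * stub makes the condition a compile-time constant 0 and the compiler
 * drops the branch entirely.
 */
#include <stdio.h>

#ifdef CFS_BANDWIDTH_MODEL
static int throttle_state[4];	/* stand-in for per-CPU cfs_rq throttle state */

static inline int lb_throttled_hierarchy(int dst_cpu)
{
	return throttle_state[dst_cpu];
}
#else
static inline int lb_throttled_hierarchy(int dst_cpu)
{
	(void)dst_cpu;
	return 0;	/* no bandwidth control: nothing is ever throttled */
}
#endif

static int can_migrate(int dst_cpu)
{
	if (lb_throttled_hierarchy(dst_cpu))
		return 0;	/* reject: dst cfs_rq is in a throttled hierarchy */
	return 1;
}

int main(void)
{
	printf("migrate to cpu1 allowed: %d\n", can_migrate(1));
	return 0;
}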
Hello Aaron,

On 9/12/2025 9:14 AM, Aaron Lu wrote:
> When doing load balance and the target cfs_rq is in a throttled
> hierarchy, it is an open question whether balancing there should be
> allowed.
[...]
> Signed-off-by: Aaron Lu <ziqianlu@bytedance.com>

Thank you for updating the patch. Feel free to include:

Reviewed-by: K Prateek Nayak <kprateek.nayak@amd.com>

--
Thanks and Regards,
Prateek
The following commit has been merged into the sched/core branch of tip:
Commit-ID: 0d4eaf8caf8cd633b23e949e2996b420052c2d45
Gitweb: https://git.kernel.org/tip/0d4eaf8caf8cd633b23e949e2996b420052c2d45
Author: Aaron Lu <ziqianlu@bytedance.com>
AuthorDate: Fri, 12 Sep 2025 11:44:28 +08:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Mon, 15 Sep 2025 09:38:38 +02:00
sched/fair: Do not balance task to a throttled cfs_rq

When doing load balance and the target cfs_rq is in a throttled
hierarchy, it is an open question whether balancing there should be
allowed.

The case for allowing it: if the target CPU is idle or less loaded and
the task being balanced is holding kernel resources, balancing the task
there lets it get on a CPU earlier and release those resources sooner.
The case against: if the task is not holding any kernel resources, the
migration accomplishes little, since the task will soon be throttled on
the target anyway.

While theoretically debatable, a performance test[0] involving 200
cgroups, each running hackbench (20 senders, 20 receivers) in pipe
mode, showed a performance regression on AMD Genoa when balancing to
throttled cfs_rqs was allowed. Analysis[1] showed that hackbench
suffers from task migrations across LLC boundaries. For this reason,
add a check in can_migrate_task() to forbid balancing to a cfs_rq in a
throttled hierarchy. This greatly reduced task migrations and restored
performance.

[0]: https://lore.kernel.org/lkml/20250822110701.GB289@bytedance/
[1]: https://lore.kernel.org/lkml/20250903101102.GB42@bytedance/
Signed-off-by: Aaron Lu <ziqianlu@bytedance.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: K Prateek Nayak <kprateek.nayak@amd.com>
---
kernel/sched/fair.c | 22 ++++++++++++++++++----
1 file changed, 18 insertions(+), 4 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3dbdfaa..18a30ae 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5737,6 +5737,11 @@ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
return cfs_bandwidth_used() && cfs_rq->throttle_count;
}
+static inline int lb_throttled_hierarchy(struct task_struct *p, int dst_cpu)
+{
+ return throttled_hierarchy(task_group(p)->cfs_rq[dst_cpu]);
+}
+
static inline bool task_is_throttled(struct task_struct *p)
{
return cfs_bandwidth_used() && p->throttled;
@@ -6733,6 +6738,11 @@ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
return 0;
}
+static inline int lb_throttled_hierarchy(struct task_struct *p, int dst_cpu)
+{
+ return 0;
+}
+
#ifdef CONFIG_FAIR_GROUP_SCHED
void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *parent) {}
static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
@@ -9369,14 +9379,18 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
/*
* We do not migrate tasks that are:
* 1) delayed dequeued unless we migrate load, or
- * 2) cannot be migrated to this CPU due to cpus_ptr, or
- * 3) running (obviously), or
- * 4) are cache-hot on their current CPU, or
- * 5) are blocked on mutexes (if SCHED_PROXY_EXEC is enabled)
+ * 2) target cfs_rq is in throttled hierarchy, or
+ * 3) cannot be migrated to this CPU due to cpus_ptr, or
+ * 4) running (obviously), or
+ * 5) are cache-hot on their current CPU, or
+ * 6) are blocked on mutexes (if SCHED_PROXY_EXEC is enabled)
*/
if ((p->se.sched_delayed) && (env->migration_type != migrate_load))
return 0;
+ if (lb_throttled_hierarchy(p, env->dst_cpu))
+ return 0;
+
/*
* We want to prioritize the migration of eligible tasks.
* For ineligible tasks we soft-limit them and only allow