[tip: sched/core] sched/cache: Prioritize tasks preferring destination LLC during balancing

tip-bot2 for Tim Chen posted 1 patch 4 days, 14 hours ago
kernel/sched/fair.c | 79 +++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 78 insertions(+), 1 deletion(-)
[tip: sched/core] sched/cache: Prioritize tasks preferring destination LLC during balancing
Posted by tip-bot2 for Tim Chen 4 days, 14 hours ago
The following commit has been merged into the sched/core branch of tip:

Commit-ID:     f38cc2f0d8a354551d219e7fd95fce3e96868105
Gitweb:        https://git.kernel.org/tip/f38cc2f0d8a354551d219e7fd95fce3e96868105
Author:        Tim Chen <tim.c.chen@linux.intel.com>
AuthorDate:    Wed, 01 Apr 2026 14:52:24 -07:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Thu, 09 Apr 2026 15:49:50 +02:00

sched/cache: Prioritize tasks preferring destination LLC during balancing

During LLC load balancing, first check for tasks that prefer the
destination LLC and balance them to it before others.

Mark source sched groups containing tasks preferring non local LLCs
with the group_llc_balance flag. This ensures the load balancer later
pulls or pushes these tasks toward their preferred LLCs.
The priority of group_llc_balance is lower than that of group_overloaded
and higher than that of all other group types. This is because
group_llc_balance may exacerbate load imbalance, and if the LLC balancing
attempt fails, the nr_balance_failed mechanism will trigger other group
types to rebalance the load.

The load balancer selects the busiest sched_group and migrates tasks
to less busy groups to distribute load across CPUs.

With cache-aware scheduling enabled, the busiest sched_group is
the one with most tasks preferring the destination LLC. If
the group has the llc_balance flag set, cache aware load balancing is
triggered.

Introduce the helper function update_llc_busiest() to identify the
sched_group with the most tasks preferring the destination LLC.

Suggested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Suggested-by: Madadi Vineeth Reddy <vineethr@linux.ibm.com>
Co-developed-by: Chen Yu <yu.c.chen@intel.com>
Signed-off-by: Chen Yu <yu.c.chen@intel.com>
Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/baa458f45eab3f602af090c6d6af63dc864f5ec6.1775065312.git.tim.c.chen@linux.intel.com
---
 kernel/sched/fair.c | 79 +++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 78 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d3812d1..ba4ee9a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9926,6 +9926,16 @@ enum group_type {
 	 */
 	group_imbalanced,
 	/*
+	 * There are tasks running on non-preferred LLC, possible to move
+	 * them to their preferred LLC without creating too much imbalance.
+	 * The priority of group_llc_balance is lower than that of
+	 * group_overloaded and higher than that of all other group types.
+	 * This is because group_llc_balance may exacerbate load imbalance.
+	 * If the LLC balancing attempt fails, the nr_balance_failed
+	 * mechanism will trigger other group types to rebalance the load.
+	 */
+	group_llc_balance,
+	/*
 	 * The CPU is overloaded and can't provide expected CPU cycles to all
 	 * tasks.
 	 */
@@ -10828,6 +10838,7 @@ struct sg_lb_stats {
 	enum group_type group_type;
 	unsigned int group_asym_packing;	/* Tasks should be moved to preferred CPU */
 	unsigned int group_smt_balance;		/* Task on busy SMT be moved */
+	unsigned int group_llc_balance;		/* Tasks should be moved to preferred LLC */
 	unsigned long group_misfit_task_load;	/* A CPU has a task too big for its capacity */
 	unsigned int group_overutilized;	/* At least one CPU is overutilized in the group */
 #ifdef CONFIG_NUMA_BALANCING
@@ -11094,6 +11105,9 @@ group_type group_classify(unsigned int imbalance_pct,
 	if (group_is_overloaded(imbalance_pct, sgs))
 		return group_overloaded;
 
+	if (sgs->group_llc_balance)
+		return group_llc_balance;
+
 	if (sg_imbalanced(group))
 		return group_imbalanced;
 
@@ -11288,11 +11302,63 @@ static void record_sg_llc_stats(struct lb_env *env,
 	if (unlikely(READ_ONCE(sd_share->capacity) != sgs->group_capacity))
 		WRITE_ONCE(sd_share->capacity, sgs->group_capacity);
 }
+
+/*
+ * Do LLC balance on sched group that contains LLC, and have tasks preferring
+ * to run on LLC in idle dst_cpu.
+ */
+static inline bool llc_balance(struct lb_env *env, struct sg_lb_stats *sgs,
+			       struct sched_group *group)
+{
+	if (!sched_cache_enabled())
+		return false;
+
+	if (env->sd->flags & SD_SHARE_LLC)
+		return false;
+
+	/*
+	 * Skip cache aware tagging if nr_balanced_failed is sufficiently high.
+	 * Threshold of cache_nice_tries is set to 1 higher than nr_balance_failed
+	 * to avoid excessive task migration at the same time.
+	 */
+	if (env->sd->nr_balance_failed >= env->sd->cache_nice_tries + 1)
+		return false;
+
+	if (sgs->nr_pref_dst_llc &&
+	    can_migrate_llc(cpumask_first(sched_group_span(group)),
+			    env->dst_cpu, 0, true) == mig_llc)
+		return true;
+
+	return false;
+}
+
+static bool update_llc_busiest(struct lb_env *env,
+			       struct sg_lb_stats *busiest,
+			       struct sg_lb_stats *sgs)
+{
+	/*
+	 * There are more tasks that want to run on dst_cpu's LLC.
+	 */
+	return sgs->nr_pref_dst_llc > busiest->nr_pref_dst_llc;
+}
 #else
 static inline void record_sg_llc_stats(struct lb_env *env, struct sg_lb_stats *sgs,
 				       struct sched_group *group)
 {
 }
+
+static inline bool llc_balance(struct lb_env *env, struct sg_lb_stats *sgs,
+			       struct sched_group *group)
+{
+	return false;
+}
+
+static bool update_llc_busiest(struct lb_env *env,
+			       struct sg_lb_stats *busiest,
+			       struct sg_lb_stats *sgs)
+{
+	return false;
+}
 #endif
 
 /**
@@ -11394,6 +11460,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 		/* Check for loaded SMT group to be balanced to dst CPU */
 		if (smt_balance(env, sgs, group))
 			sgs->group_smt_balance = 1;
+
+		/* Check for tasks in this group can be moved to their preferred LLC */
+		if (llc_balance(env, sgs, group))
+			sgs->group_llc_balance = 1;
 	}
 
 	sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs);
@@ -11457,6 +11527,10 @@ static bool update_sd_pick_busiest(struct lb_env *env,
 		/* Select the overloaded group with highest avg_load. */
 		return sgs->avg_load > busiest->avg_load;
 
+	case group_llc_balance:
+		/* Select the group with most tasks preferring dst LLC */
+		return update_llc_busiest(env, busiest, sgs);
+
 	case group_imbalanced:
 		/*
 		 * Select the 1st imbalanced group as we don't have any way to
@@ -11719,6 +11793,7 @@ static bool update_pick_idlest(struct sched_group *idlest,
 			return false;
 		break;
 
+	case group_llc_balance:
 	case group_imbalanced:
 	case group_asym_packing:
 	case group_smt_balance:
@@ -11851,6 +11926,7 @@ sched_balance_find_dst_group(struct sched_domain *sd, struct task_struct *p, int
 			return NULL;
 		break;
 
+	case group_llc_balance:
 	case group_imbalanced:
 	case group_asym_packing:
 	case group_smt_balance:
@@ -12349,7 +12425,8 @@ static struct sched_group *sched_balance_find_src_group(struct lb_env *env)
 	 * group's child domain.
 	 */
 	if (sds.prefer_sibling && local->group_type == group_has_spare &&
-	    sibling_imbalance(env, &sds, busiest, local) > 1)
+	    (busiest->group_type == group_llc_balance ||
+	    sibling_imbalance(env, &sds, busiest, local) > 1))
 		goto force_balance;
 
 	if (busiest->group_type != group_overloaded) {