[PATCH v3 11/21] sched/cache: Prioritize tasks preferring destination LLC during balancing

Tim Chen posted 21 patches 1 month, 2 weeks ago
[PATCH v3 11/21] sched/cache: Prioritize tasks preferring destination LLC during balancing
Posted by Tim Chen 1 month, 2 weeks ago
During LLC load balancing, first check for tasks that prefer the
destination LLC and balance them to it before others.

Mark source sched groups containing tasks preferring non-local LLCs
with the group_llc_balance flag. This ensures the load balancer later
pulls or pushes these tasks toward their preferred LLCs.

The load balancer selects the busiest sched_group and migrates tasks
to less busy groups to distribute load across CPUs.

With cache-aware scheduling enabled, the busiest sched_group is
the one with the most tasks preferring the destination LLC. If
the group has the group_llc_balance flag set, cache-aware load
balancing is triggered.

Introduce the helper function update_llc_busiest() to identify the
sched_group with the most tasks preferring the destination LLC.

Suggested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Co-developed-by: Chen Yu <yu.c.chen@intel.com>
Signed-off-by: Chen Yu <yu.c.chen@intel.com>
Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
---

Notes:
    v2->v3:
    Consider sd->nr_balance_failed when deciding whether
    LLC load balance should be used.
    (Peter Zijlstra)

 kernel/sched/fair.c | 77 ++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 76 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b0cf4424d198..43dcf2827298 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9649,6 +9649,11 @@ enum group_type {
 	 * from balancing the load across the system.
 	 */
 	group_imbalanced,
+	/*
+	 * There are tasks running on non-preferred LLC, possible to move
+	 * them to their preferred LLC without creating too much imbalance.
+	 */
+	group_llc_balance,
 	/*
 	 * The CPU is overloaded and can't provide expected CPU cycles to all
 	 * tasks.
@@ -10561,6 +10566,7 @@ struct sg_lb_stats {
 	enum group_type group_type;
 	unsigned int group_asym_packing;	/* Tasks should be moved to preferred CPU */
 	unsigned int group_smt_balance;		/* Task on busy SMT be moved */
+	unsigned int group_llc_balance;		/* Tasks should be moved to preferred LLC */
 	unsigned long group_misfit_task_load;	/* A CPU has a task too big for its capacity */
 #ifdef CONFIG_NUMA_BALANCING
 	unsigned int nr_numa_running;
@@ -10819,6 +10825,9 @@ group_type group_classify(unsigned int imbalance_pct,
 	if (group_is_overloaded(imbalance_pct, sgs))
 		return group_overloaded;
 
+	if (sgs->group_llc_balance)
+		return group_llc_balance;
+
 	if (sg_imbalanced(group))
 		return group_imbalanced;
 
@@ -11012,11 +11021,66 @@ static void record_sg_llc_stats(struct lb_env *env,
 	if (unlikely(READ_ONCE(sd_share->capacity) != sgs->group_capacity))
 		WRITE_ONCE(sd_share->capacity, sgs->group_capacity);
 }
+
+/*
+ * Do LLC balance on sched group that contains LLC, and have tasks preferring
+ * to run on LLC in idle dst_cpu.
+ */
+static inline bool llc_balance(struct lb_env *env, struct sg_lb_stats *sgs,
+			       struct sched_group *group)
+{
+	if (!sched_cache_enabled())
+		return false;
+
+	if (env->sd->flags & SD_SHARE_LLC)
+		return false;
+
+	/*
+	 * Don't do cache aware balancing if there
+	 * are too many balance failures.
+	 *
+	 * Should fall back to regular load balancing
+	 * after repeated cache aware balance failures.
+	 */
+	if (env->sd->nr_balance_failed >=
+	    env->sd->cache_nice_tries + 1)
+		return false;
+
+	if (sgs->nr_pref_dst_llc &&
+	    can_migrate_llc(cpumask_first(sched_group_span(group)),
+			    env->dst_cpu, 0, true) == mig_llc)
+		return true;
+
+	return false;
+}
+
+static bool update_llc_busiest(struct lb_env *env,
+			       struct sg_lb_stats *busiest,
+			       struct sg_lb_stats *sgs)
+{
+	/*
+	 * There are more tasks that want to run on dst_cpu's LLC.
+	 */
+	return sgs->nr_pref_dst_llc > busiest->nr_pref_dst_llc;
+}
 #else
 static inline void record_sg_llc_stats(struct lb_env *env, struct sg_lb_stats *sgs,
 				       struct sched_group *group)
 {
 }
+
+static inline bool llc_balance(struct lb_env *env, struct sg_lb_stats *sgs,
+			       struct sched_group *group)
+{
+	return false;
+}
+
+static bool update_llc_busiest(struct lb_env *env,
+			       struct sg_lb_stats *busiest,
+			       struct sg_lb_stats *sgs)
+{
+	return false;
+}
 #endif
 
 /**
@@ -11118,6 +11182,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 		/* Check for loaded SMT group to be balanced to dst CPU */
 		if (smt_balance(env, sgs, group))
 			sgs->group_smt_balance = 1;
+
+		/* Check for tasks in this group can be moved to their preferred LLC */
+		if (llc_balance(env, sgs, group))
+			sgs->group_llc_balance = 1;
 	}
 
 	sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs);
@@ -11181,6 +11249,10 @@ static bool update_sd_pick_busiest(struct lb_env *env,
 		/* Select the overloaded group with highest avg_load. */
 		return sgs->avg_load > busiest->avg_load;
 
+	case group_llc_balance:
+		/* Select the group with most tasks preferring dst LLC */
+		return update_llc_busiest(env, busiest, sgs);
+
 	case group_imbalanced:
 		/*
 		 * Select the 1st imbalanced group as we don't have any way to
@@ -11443,6 +11515,7 @@ static bool update_pick_idlest(struct sched_group *idlest,
 			return false;
 		break;
 
+	case group_llc_balance:
 	case group_imbalanced:
 	case group_asym_packing:
 	case group_smt_balance:
@@ -11575,6 +11648,7 @@ sched_balance_find_dst_group(struct sched_domain *sd, struct task_struct *p, int
 			return NULL;
 		break;
 
+	case group_llc_balance:
 	case group_imbalanced:
 	case group_asym_packing:
 	case group_smt_balance:
@@ -12074,7 +12148,8 @@ static struct sched_group *sched_balance_find_src_group(struct lb_env *env)
 	 * group's child domain.
 	 */
 	if (sds.prefer_sibling && local->group_type == group_has_spare &&
-	    sibling_imbalance(env, &sds, busiest, local) > 1)
+	    (busiest->group_type == group_llc_balance ||
+	    sibling_imbalance(env, &sds, busiest, local) > 1))
 		goto force_balance;
 
 	if (busiest->group_type != group_overloaded) {
-- 
2.32.0
Re: [PATCH v3 11/21] sched/cache: Prioritize tasks preferring destination LLC during balancing
Posted by Madadi Vineeth Reddy 1 month, 1 week ago
On 11/02/26 03:48, Tim Chen wrote:
> During LLC load balancing, first check for tasks that prefer the
> destination LLC and balance them to it before others.
> 
> Mark source sched groups containing tasks preferring non local LLCs
> with the group_llc_balance flag. This ensures the load balancer later
> pulls or pushes these tasks toward their preferred LLCs.
> 
> The load balancer selects the busiest sched_group and migrates tasks
> to less busy groups to distribute load across CPUs.
> 
> With cache-aware scheduling enabled, the busiest sched_group is
> the one with most tasks preferring the destination LLC. If
> the group has the llc_balance flag set, cache aware load balancing is
> triggered.
> 
> Introduce the helper function update_llc_busiest() to identify the
> sched_group with the most tasks preferring the destination LLC.
> 
> Suggested-by: K Prateek Nayak <kprateek.nayak@amd.com>
> Co-developed-by: Chen Yu <yu.c.chen@intel.com>
> Signed-off-by: Chen Yu <yu.c.chen@intel.com>
> Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
> ---
> 
> Notes:
>     v2->v3:
>     Consider sd->nr_balance_failed when deciding whether
>     LLC load balance should be used.
>     (Peter Zijlstra)
> 
>  kernel/sched/fair.c | 77 ++++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 76 insertions(+), 1 deletion(-)
> 
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index b0cf4424d198..43dcf2827298 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -9649,6 +9649,11 @@ enum group_type {
>  	 * from balancing the load across the system.
>  	 */
>  	group_imbalanced,
> +	/*
> +	 * There are tasks running on non-preferred LLC, possible to move
> +	 * them to their preferred LLC without creating too much imbalance.
> +	 */
> +	group_llc_balance,
>  	/*
>  	 * The CPU is overloaded and can't provide expected CPU cycles to all
>  	 * tasks.
> @@ -10561,6 +10566,7 @@ struct sg_lb_stats {
>  	enum group_type group_type;
>  	unsigned int group_asym_packing;	/* Tasks should be moved to preferred CPU */
>  	unsigned int group_smt_balance;		/* Task on busy SMT be moved */
> +	unsigned int group_llc_balance;		/* Tasks should be moved to preferred LLC */
>  	unsigned long group_misfit_task_load;	/* A CPU has a task too big for its capacity */
>  #ifdef CONFIG_NUMA_BALANCING
>  	unsigned int nr_numa_running;
> @@ -10819,6 +10825,9 @@ group_type group_classify(unsigned int imbalance_pct,
>  	if (group_is_overloaded(imbalance_pct, sgs))
>  		return group_overloaded;
>  
> +	if (sgs->group_llc_balance)
> +		return group_llc_balance;
> +

group_llc_balance is placed before group_imbalanced. In cases where a group is both imbalanced and
contains tasks preferring the destination LLC, LLC balancing will be selected first.

I assume the reasoning is that migrating tasks toward their preferred LLC may also help reduce
imbalance, and in cases where the goals conflict, the nr_balance_failed / cache_nice_tries
logic will eventually fall back to regular load balancing. Is that the intended policy?

It might be helpful to briefly mention this reasoning in the changelog, since this ordering
changes balancing priority.

Thanks,
Vineeth

>  	if (sg_imbalanced(group))
>  		return group_imbalanced;
>  
> @@ -11012,11 +11021,66 @@ static void record_sg_llc_stats(struct lb_env *env,
>  	if (unlikely(READ_ONCE(sd_share->capacity) != sgs->group_capacity))
>  		WRITE_ONCE(sd_share->capacity, sgs->group_capacity);
>  }
> +
> +/*
> + * Do LLC balance on sched group that contains LLC, and have tasks preferring
> + * to run on LLC in idle dst_cpu.
> + */
> +static inline bool llc_balance(struct lb_env *env, struct sg_lb_stats *sgs,
> +			       struct sched_group *group)
> +{
> +	if (!sched_cache_enabled())
> +		return false;
> +
> +	if (env->sd->flags & SD_SHARE_LLC)
> +		return false;
> +
> +	/*
> +	 * Don't do cache aware balancing if there
> +	 * are too many balance failures.
> +	 *
> +	 * Should fall back to regular load balancing
> +	 * after repeated cache aware balance failures.
> +	 */
> +	if (env->sd->nr_balance_failed >=
> +	    env->sd->cache_nice_tries + 1)
> +		return false;
> +
> +	if (sgs->nr_pref_dst_llc &&
> +	    can_migrate_llc(cpumask_first(sched_group_span(group)),
> +			    env->dst_cpu, 0, true) == mig_llc)
> +		return true;
> +
> +	return false;
> +}
> +
> +static bool update_llc_busiest(struct lb_env *env,
> +			       struct sg_lb_stats *busiest,
> +			       struct sg_lb_stats *sgs)
> +{
> +	/*
> +	 * There are more tasks that want to run on dst_cpu's LLC.
> +	 */
> +	return sgs->nr_pref_dst_llc > busiest->nr_pref_dst_llc;
> +}
>  #else
>  static inline void record_sg_llc_stats(struct lb_env *env, struct sg_lb_stats *sgs,
>  				       struct sched_group *group)
>  {
>  }
> +
> +static inline bool llc_balance(struct lb_env *env, struct sg_lb_stats *sgs,
> +			       struct sched_group *group)
> +{
> +	return false;
> +}
> +
> +static bool update_llc_busiest(struct lb_env *env,
> +			       struct sg_lb_stats *busiest,
> +			       struct sg_lb_stats *sgs)
> +{
> +	return false;
> +}
>  #endif
>  
>  /**
> @@ -11118,6 +11182,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
>  		/* Check for loaded SMT group to be balanced to dst CPU */
>  		if (smt_balance(env, sgs, group))
>  			sgs->group_smt_balance = 1;
> +
> +		/* Check for tasks in this group can be moved to their preferred LLC */
> +		if (llc_balance(env, sgs, group))
> +			sgs->group_llc_balance = 1;
>  	}
>  
>  	sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs);
> @@ -11181,6 +11249,10 @@ static bool update_sd_pick_busiest(struct lb_env *env,
>  		/* Select the overloaded group with highest avg_load. */
>  		return sgs->avg_load > busiest->avg_load;
>  
> +	case group_llc_balance:
> +		/* Select the group with most tasks preferring dst LLC */
> +		return update_llc_busiest(env, busiest, sgs);
> +
>  	case group_imbalanced:
>  		/*
>  		 * Select the 1st imbalanced group as we don't have any way to
> @@ -11443,6 +11515,7 @@ static bool update_pick_idlest(struct sched_group *idlest,
>  			return false;
>  		break;
>  
> +	case group_llc_balance:
>  	case group_imbalanced:
>  	case group_asym_packing:
>  	case group_smt_balance:
> @@ -11575,6 +11648,7 @@ sched_balance_find_dst_group(struct sched_domain *sd, struct task_struct *p, int
>  			return NULL;
>  		break;
>  
> +	case group_llc_balance:
>  	case group_imbalanced:
>  	case group_asym_packing:
>  	case group_smt_balance:
> @@ -12074,7 +12148,8 @@ static struct sched_group *sched_balance_find_src_group(struct lb_env *env)
>  	 * group's child domain.
>  	 */
>  	if (sds.prefer_sibling && local->group_type == group_has_spare &&
> -	    sibling_imbalance(env, &sds, busiest, local) > 1)
> +	    (busiest->group_type == group_llc_balance ||
> +	    sibling_imbalance(env, &sds, busiest, local) > 1))
>  		goto force_balance;
>  
>  	if (busiest->group_type != group_overloaded) {
Re: [PATCH v3 11/21] sched/cache: Prioritize tasks preferring destination LLC during balancing
Posted by Tim Chen 1 month, 1 week ago
On Wed, 2026-02-18 at 00:03 +0530, Madadi Vineeth Reddy wrote:
> On 11/02/26 03:48, Tim Chen wrote:
> > During LLC load balancing, first check for tasks that prefer the
> > destination LLC and balance them to it before others.
> > 
> > Mark source sched groups containing tasks preferring non local LLCs
> > with the group_llc_balance flag. This ensures the load balancer later
> > pulls or pushes these tasks toward their preferred LLCs.
> > 
> > The load balancer selects the busiest sched_group and migrates tasks
> > to less busy groups to distribute load across CPUs.
> > 
> > With cache-aware scheduling enabled, the busiest sched_group is
> > the one with most tasks preferring the destination LLC. If
> > the group has the llc_balance flag set, cache aware load balancing is
> > triggered.
> > 
> > Introduce the helper function update_llc_busiest() to identify the
> > sched_group with the most tasks preferring the destination LLC.
> > 
> > Suggested-by: K Prateek Nayak <kprateek.nayak@amd.com>
> > Co-developed-by: Chen Yu <yu.c.chen@intel.com>
> > Signed-off-by: Chen Yu <yu.c.chen@intel.com>
> > Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
> > ---
> > 
> > Notes:
> >     v2->v3:
> >     Consider sd->nr_balance_failed when deciding whether
> >     LLC load balance should be used.
> >     (Peter Zijlstra)
> > 
> >  kernel/sched/fair.c | 77 ++++++++++++++++++++++++++++++++++++++++++++-
> >  1 file changed, 76 insertions(+), 1 deletion(-)
> > 
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index b0cf4424d198..43dcf2827298 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -9649,6 +9649,11 @@ enum group_type {
> >  	 * from balancing the load across the system.
> >  	 */
> >  	group_imbalanced,
> > +	/*
> > +	 * There are tasks running on non-preferred LLC, possible to move
> > +	 * them to their preferred LLC without creating too much imbalance.
> > +	 */
> > +	group_llc_balance,
> >  	/*
> >  	 * The CPU is overloaded and can't provide expected CPU cycles to all
> >  	 * tasks.
> > @@ -10561,6 +10566,7 @@ struct sg_lb_stats {
> >  	enum group_type group_type;
> >  	unsigned int group_asym_packing;	/* Tasks should be moved to preferred CPU */
> >  	unsigned int group_smt_balance;		/* Task on busy SMT be moved */
> > +	unsigned int group_llc_balance;		/* Tasks should be moved to preferred LLC */
> >  	unsigned long group_misfit_task_load;	/* A CPU has a task too big for its capacity */
> >  #ifdef CONFIG_NUMA_BALANCING
> >  	unsigned int nr_numa_running;
> > @@ -10819,6 +10825,9 @@ group_type group_classify(unsigned int imbalance_pct,
> >  	if (group_is_overloaded(imbalance_pct, sgs))
> >  		return group_overloaded;
> >  
> > +	if (sgs->group_llc_balance)
> > +		return group_llc_balance;
> > +
> 
> group_llc_balance is placed before group_imbalanced. In cases where a group is both imbalanced and
> contains tasks preferring the destination LLC, LLC balancing will be selected first.
> 
> I assume the reasoning is that migrating tasks toward their preferred LLC may also help reduce
> imbalance, and in cases where the goals conflict, the nr_balance_failed / cache_nice_tries
> logic will eventually fall back to regular load balancing. Is that the intended policy?
> 
> It might be helpful to briefly mention this reasoning in the changelog, since this ordering
> changes balancing priority.
> 

group_llc_balance naturally aggregates tasks into their preferred LLC
and could create imbalance between LLC domains.

If we handled group_imbalanced first, then after we balance the load
and move on to consider group_llc_balance,
group_llc_balance would cause load imbalance between the LLCs again
and undo all the previous load-balance work.

It is better to do group_llc_balance first, moving tasks to their
preferred LLC, and then let group_imbalanced adjust any remaining load
imbalance. The can_migrate_llc_task() check will prevent
group_imbalanced from undoing the work done previously by
group_llc_balance.

Yes, we'll add some comments to explain the reasoning of load balance priority.

Tim