[RFC PATCH v4 06/28] sched: Save the per LLC utilization for better cache aware scheduling

Chen Yu posted 28 patches 1 month, 3 weeks ago
[RFC PATCH v4 06/28] sched: Save the per LLC utilization for better cache aware scheduling
Posted by Chen Yu 1 month, 3 weeks ago
When a system gets busy and a process's preferred LLC
is saturated by too many threads within this process, there are significant
in-LLC task migrations within its preferred LLC. This leads to migration
latency and degrades performance. Ideally, task aggregation should be
inhibited if the task's preferred LLC is overloaded. This implies that a
metric is needed to indicate whether the LLC is busy.

Store the per-LLC utilization calculated via periodic load
balancing. These statistics will be used in subsequent patches to
determine whether tasks should be aggregated to their preferred LLC.

Signed-off-by: Chen Yu <yu.c.chen@intel.com>
---
 include/linux/sched/topology.h |  3 ++
 kernel/sched/fair.c            | 53 ++++++++++++++++++++++++++++++++++
 2 files changed, 56 insertions(+)

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 198bb5cc1774..692f8a703b93 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -78,6 +78,9 @@ struct sched_domain_shared {
 	atomic_t	nr_busy_cpus;
 	int		has_idle_cores;
 	int		nr_idle_scan;
+#ifdef CONFIG_SCHED_CACHE
+	unsigned long	util_avg;
+#endif
 };
 
 struct sched_domain {
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 9e3c6f0eb934..4f79b7652642 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8828,6 +8828,22 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 #ifdef CONFIG_SCHED_CACHE
 static long __migrate_degrades_locality(struct task_struct *p, int src_cpu, int dst_cpu, bool idle);
 
+/* expected to be protected by rcu_read_lock() */
+static bool get_llc_stats(int cpu, unsigned long *util,
+			  unsigned long *cap)
+{
+	struct sched_domain_shared *sd_share;
+
+	sd_share = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+	if (!sd_share)
+		return false;
+
+	*util = READ_ONCE(sd_share->util_avg);
+	*cap = per_cpu(sd_llc_size, cpu) * SCHED_CAPACITY_SCALE;
+
+	return true;
+}
+
 static int select_cache_cpu(struct task_struct *p, int prev_cpu)
 {
 	struct mm_struct *mm = p->mm;
@@ -10670,6 +10686,42 @@ sched_reduced_capacity(struct rq *rq, struct sched_domain *sd)
 	return check_cpu_capacity(rq, sd);
 }
 
+#ifdef CONFIG_SCHED_CACHE
+/*
+ * Save this sched group's statistic for later use:
+ * The task wakeup and load balance can make better
+ * decision based on these statistics.
+ */
+static void update_sg_if_llc(struct lb_env *env, struct sg_lb_stats *sgs,
+			     struct sched_group *group)
+{
+	/* Find the sched domain that spans this group. */
+	struct sched_domain *sd = env->sd->child;
+	struct sched_domain_shared *sd_share;
+
+	if (!sched_feat(SCHED_CACHE) || env->idle == CPU_NEWLY_IDLE)
+		return;
+
+	/* only care the sched domain that spans 1 LLC */
+	if (!sd || !(sd->flags & SD_SHARE_LLC) ||
+	    !sd->parent || (sd->parent->flags & SD_SHARE_LLC))
+		return;
+
+	sd_share = rcu_dereference(per_cpu(sd_llc_shared,
+				  cpumask_first(sched_group_span(group))));
+	if (!sd_share)
+		return;
+
+	if (likely(READ_ONCE(sd_share->util_avg) != sgs->group_util))
+		WRITE_ONCE(sd_share->util_avg, sgs->group_util);
+}
+#else
+static inline void update_sg_if_llc(struct lb_env *env, struct sg_lb_stats *sgs,
+				    struct sched_group *group)
+{
+}
+#endif
+
 /**
  * update_sg_lb_stats - Update sched_group's statistics for load balancing.
  * @env: The load balancing environment.
@@ -10759,6 +10811,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 
 	sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs);
 
+	update_sg_if_llc(env, sgs, group);
 	/* Computing avg_load makes sense only when group is overloaded */
 	if (sgs->group_type == group_overloaded)
 		sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
-- 
2.25.1
Re: [RFC PATCH v4 06/28] sched: Save the per LLC utilization for better cache aware scheduling
Posted by Peter Zijlstra 5 days, 13 hours ago
On Sat, Aug 09, 2025 at 01:02:54PM +0800, Chen Yu wrote:

> +#ifdef CONFIG_SCHED_CACHE
> +/*
> + * Save this sched group's statistic for later use:
> + * The task wakeup and load balance can make better
> + * decision based on these statistics.
> + */
> +static void update_sg_if_llc(struct lb_env *env, struct sg_lb_stats *sgs,
> +			     struct sched_group *group)
> +{
> +	/* Find the sched domain that spans this group. */
> +	struct sched_domain *sd = env->sd->child;
> +	struct sched_domain_shared *sd_share;
> +
> +	if (!sched_feat(SCHED_CACHE) || env->idle == CPU_NEWLY_IDLE)
> +		return;
> +
> +	/* only care the sched domain that spans 1 LLC */
> +	if (!sd || !(sd->flags & SD_SHARE_LLC) ||
> +	    !sd->parent || (sd->parent->flags & SD_SHARE_LLC))
> +		return;

Did you want to write:

	if (sd != per_cpu(sd_llc))
		return;

Or something?

> +	sd_share = rcu_dereference(per_cpu(sd_llc_shared,
> +				  cpumask_first(sched_group_span(group))));
> +	if (!sd_share)
> +		return;
> +
> +	if (likely(READ_ONCE(sd_share->util_avg) != sgs->group_util))
> +		WRITE_ONCE(sd_share->util_avg, sgs->group_util);

If you expect it to be different, does that whole load and compare still
matter?

> +}
Re: [RFC PATCH v4 06/28] sched: Save the per LLC utilization for better cache aware scheduling
Posted by Chen, Yu C 4 days, 23 hours ago
On 9/29/2025 10:09 PM, Peter Zijlstra wrote:
> On Sat, Aug 09, 2025 at 01:02:54PM +0800, Chen Yu wrote:
> 
>> +#ifdef CONFIG_SCHED_CACHE
>> +/*
>> + * Save this sched group's statistic for later use:
>> + * The task wakeup and load balance can make better
>> + * decision based on these statistics.
>> + */
>> +static void update_sg_if_llc(struct lb_env *env, struct sg_lb_stats *sgs,
>> +			     struct sched_group *group)
>> +{
>> +	/* Find the sched domain that spans this group. */
>> +	struct sched_domain *sd = env->sd->child;
>> +	struct sched_domain_shared *sd_share;
>> +
>> +	if (!sched_feat(SCHED_CACHE) || env->idle == CPU_NEWLY_IDLE)
>> +		return;
>> +
>> +	/* only care the sched domain that spans 1 LLC */
>> +	if (!sd || !(sd->flags & SD_SHARE_LLC) ||
>> +	    !sd->parent || (sd->parent->flags & SD_SHARE_LLC))
>> +		return;
> 
> Did you want to write:
> 
> 	if (sd != per_cpu(sd_llc))
> 		return;
> 
> Or something?
> 

Ah right, will do it.

>> +	sd_share = rcu_dereference(per_cpu(sd_llc_shared,
>> +				  cpumask_first(sched_group_span(group))));
>> +	if (!sd_share)
>> +		return;
>> +
>> +	if (likely(READ_ONCE(sd_share->util_avg) != sgs->group_util))
>> +		WRITE_ONCE(sd_share->util_avg, sgs->group_util);
> 
> If you expect it to be different, does that whole load and compare still
> matter?
> 

OK, will write to it directly.

thanks,
Chenyu

>> +}