[PATCH v2 02/23] sched/cache: Record per-LLC utilization to guide cache-aware scheduling decisions

Tim Chen posted 23 patches 2 weeks, 1 day ago
There is a newer version of this series
Posted by Tim Chen 2 weeks, 1 day ago
From: Chen Yu <yu.c.chen@intel.com>

When a system becomes busy and a process’s preferred LLC is
saturated with too many threads, tasks within that LLC migrate
frequently. These intra-LLC migrations introduce latency and degrade
performance. To avoid this, task aggregation should be suppressed when
the preferred LLC is overloaded, which requires a metric to indicate
LLC utilization.

Record per-LLC utilization and CPU capacity during periodic load
balancing. These statistics will be used in later patches to decide
whether tasks should be aggregated into their preferred LLC.
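
As a rough illustration only, a later consumer of these statistics
could check them via the get_llc_stats() helper added here; the
llc_is_overloaded() wrapper and the 85% cutoff below are made-up
placeholders rather than anything from this series:

	static bool llc_is_overloaded(int cpu)
	{
		unsigned long util, cap;

		/* values recorded by record_sg_llc_stats() during load balancing */
		if (!get_llc_stats(cpu, &util, &cap))
			return false;

		/* hypothetical cutoff: treat >85% of capacity as saturated */
		return util * 100 > cap * 85;
	}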

Signed-off-by: Chen Yu <yu.c.chen@intel.com>
Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
---

Notes:
    v1->v2:
       Refine the comments in record_sg_llc_stats(). (Peter Zijlstra)

 include/linux/sched/topology.h |  4 ++
 kernel/sched/fair.c            | 69 ++++++++++++++++++++++++++++++++++
 2 files changed, 73 insertions(+)

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index bbcfdf12aa6e..0ba4697d74ba 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -68,6 +68,10 @@ struct sched_domain_shared {
 	atomic_t	nr_busy_cpus;
 	int		has_idle_cores;
 	int		nr_idle_scan;
+#ifdef CONFIG_SCHED_CACHE
+	unsigned long	util_avg;
+	unsigned long	capacity ____cacheline_aligned_in_smp;
+#endif
 };
 
 struct sched_domain {
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index cb82f558dc5b..b9f336300f14 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9622,6 +9622,29 @@ static inline int task_is_ineligible_on_dst_cpu(struct task_struct *p, int dest_
 	return 0;
 }
 
+#ifdef CONFIG_SCHED_CACHE
+/* Called from load balancing paths with rcu_read_lock held */
+static __maybe_unused bool get_llc_stats(int cpu, unsigned long *util,
+					 unsigned long *cap)
+{
+	struct sched_domain_shared *sd_share;
+
+	sd_share = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+	if (!sd_share)
+		return false;
+
+	*util = READ_ONCE(sd_share->util_avg);
+	*cap = READ_ONCE(sd_share->capacity);
+
+	return true;
+}
+#else
+static inline bool get_llc_stats(int cpu, unsigned long *util,
+				 unsigned long *cap)
+{
+	return false;
+}
+#endif
 /*
  * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
  */
@@ -10592,6 +10615,51 @@ sched_reduced_capacity(struct rq *rq, struct sched_domain *sd)
 	return check_cpu_capacity(rq, sd);
 }
 
+#ifdef CONFIG_SCHED_CACHE
+/*
+ * Record the statistics for this scheduler group for later
+ * use. These values guide load balancing when aggregating tasks
+ * into an LLC.
+ */
+static void record_sg_llc_stats(struct lb_env *env,
+				struct sg_lb_stats *sgs,
+				struct sched_group *group)
+{
+	struct sched_domain_shared *sd_share;
+
+	if (!sched_cache_enabled() || env->idle == CPU_NEWLY_IDLE)
+		return;
+
+	/* Only care about sched domain spanning multiple LLCs */
+	if (env->sd->child != rcu_dereference(per_cpu(sd_llc, env->dst_cpu)))
+		return;
+
+	/*
+	 * At this point we know this group spans a LLC domain.
+	 * Record the statistics of this group in its corresponding
+	 * shared LLC domain.
+	 * Note: sd_share cannot be obtained via sd->child->shared, because
+	 * it refers to the domain that covers the local group, while
+	 * sd_share could represent any of the LLC groups.
+	 */
+	sd_share = rcu_dereference(per_cpu(sd_llc_shared,
+					   cpumask_first(sched_group_span(group))));
+	if (!sd_share)
+		return;
+
+	if (READ_ONCE(sd_share->util_avg) != sgs->group_util)
+		WRITE_ONCE(sd_share->util_avg, sgs->group_util);
+
+	if (unlikely(READ_ONCE(sd_share->capacity) != sgs->group_capacity))
+		WRITE_ONCE(sd_share->capacity, sgs->group_capacity);
+}
+#else
+static inline void record_sg_llc_stats(struct lb_env *env, struct sg_lb_stats *sgs,
+				       struct sched_group *group)
+{
+}
+#endif
+
 /**
  * update_sg_lb_stats - Update sched_group's statistics for load balancing.
  * @env: The load balancing environment.
@@ -10681,6 +10749,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 
 	sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs);
 
+	record_sg_llc_stats(env, sgs, group);
 	/* Computing avg_load makes sense only when group is overloaded */
 	if (sgs->group_type == group_overloaded)
 		sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
-- 
2.32.0

Re: [PATCH v2 02/23] sched/cache: Record per-LLC utilization to guide cache-aware scheduling decisions
Posted by Peter Zijlstra 1 week, 3 days ago
On Wed, Dec 03, 2025 at 03:07:21PM -0800, Tim Chen wrote:

> diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
> index bbcfdf12aa6e..0ba4697d74ba 100644
> --- a/include/linux/sched/topology.h
> +++ b/include/linux/sched/topology.h
> @@ -68,6 +68,10 @@ struct sched_domain_shared {
>  	atomic_t	nr_busy_cpus;
>  	int		has_idle_cores;
>  	int		nr_idle_scan;
> +#ifdef CONFIG_SCHED_CACHE
> +	unsigned long	util_avg;
> +	unsigned long	capacity ____cacheline_aligned_in_smp;

This cacheline annotation confuses me, see below.

> +#endif
>  };
>  
>  struct sched_domain {
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index cb82f558dc5b..b9f336300f14 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -9622,6 +9622,29 @@ static inline int task_is_ineligible_on_dst_cpu(struct task_struct *p, int dest_
>  	return 0;
>  }
>  
> +#ifdef CONFIG_SCHED_CACHE
> +/* Called from load balancing paths with rcu_read_lock held */
> +static __maybe_unused bool get_llc_stats(int cpu, unsigned long *util,
> +					 unsigned long *cap)
> +{
> +	struct sched_domain_shared *sd_share;
> +
> +	sd_share = rcu_dereference(per_cpu(sd_llc_shared, cpu));
> +	if (!sd_share)
> +		return false;
> +
> +	*util = READ_ONCE(sd_share->util_avg);
> +	*cap = READ_ONCE(sd_share->capacity);

You placed capacity on a separate cache line, forcing the two reads
above to touch 2 distinct cache lines. That seems... sub-optimal?

> +
> +	return true;
> +}
> +#else
> +static inline bool get_llc_stats(int cpu, unsigned long *util,
> +				 unsigned long *cap)
> +{
> +	return false;
> +}
> +#endif
>  /*
>   * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
>   */
> @@ -10592,6 +10615,51 @@ sched_reduced_capacity(struct rq *rq, struct sched_domain *sd)
>  	return check_cpu_capacity(rq, sd);
>  }
>  
> +#ifdef CONFIG_SCHED_CACHE
> +/*
> + * Record the statistics for this scheduler group for later
> + * use. These values guide load balancing when aggregating tasks
> + * into an LLC.
> + */
> +static void record_sg_llc_stats(struct lb_env *env,
> +				struct sg_lb_stats *sgs,
> +				struct sched_group *group)
> +{
> +	struct sched_domain_shared *sd_share;
> +
> +	if (!sched_cache_enabled() || env->idle == CPU_NEWLY_IDLE)
> +		return;
> +
> +	/* Only care about sched domain spanning multiple LLCs */
> +	if (env->sd->child != rcu_dereference(per_cpu(sd_llc, env->dst_cpu)))
> +		return;
> +
> +	/*
> +	 * At this point we know this group spans a LLC domain.
> +	 * Record the statistics of this group in its corresponding
> +	 * shared LLC domain.
> +	 * Note: sd_share cannot be obtained via sd->child->shared, because
> +	 * it refers to the domain that covers the local group, while
> +	 * sd_share could represent any of the LLC groups.
> +	 */
> +	sd_share = rcu_dereference(per_cpu(sd_llc_shared,
> +					   cpumask_first(sched_group_span(group))));
> +	if (!sd_share)
> +		return;
> +
> +	if (READ_ONCE(sd_share->util_avg) != sgs->group_util)
> +		WRITE_ONCE(sd_share->util_avg, sgs->group_util);
> +
> +	if (unlikely(READ_ONCE(sd_share->capacity) != sgs->group_capacity))
> +		WRITE_ONCE(sd_share->capacity, sgs->group_capacity);

And same here.

> +}
> +#else
> +static inline void record_sg_llc_stats(struct lb_env *env, struct sg_lb_stats *sgs,
> +				       struct sched_group *group)
> +{
> +}
> +#endif
Re: [PATCH v2 02/23] sched/cache: Record per-LLC utilization to guide cache-aware scheduling decisions
Posted by Chen, Yu C 1 week, 2 days ago
On 12/9/2025 8:21 PM, Peter Zijlstra wrote:
> On Wed, Dec 03, 2025 at 03:07:21PM -0800, Tim Chen wrote:
> 
>> diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
>> index bbcfdf12aa6e..0ba4697d74ba 100644
>> --- a/include/linux/sched/topology.h
>> +++ b/include/linux/sched/topology.h
>> @@ -68,6 +68,10 @@ struct sched_domain_shared {
>>   	atomic_t	nr_busy_cpus;
>>   	int		has_idle_cores;
>>   	int		nr_idle_scan;
>> +#ifdef CONFIG_SCHED_CACHE
>> +	unsigned long	util_avg;
>> +	unsigned long	capacity ____cacheline_aligned_in_smp;
> 
> This cacheline annotation confuses me, see below.
> 
>> +#endif
>>   };
>>   
>>   struct sched_domain {
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index cb82f558dc5b..b9f336300f14 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -9622,6 +9622,29 @@ static inline int task_is_ineligible_on_dst_cpu(struct task_struct *p, int dest_
>>   	return 0;
>>   }
>>   
>> +#ifdef CONFIG_SCHED_CACHE
>> +/* Called from load balancing paths with rcu_read_lock held */
>> +static __maybe_unused bool get_llc_stats(int cpu, unsigned long *util,
>> +					 unsigned long *cap)
>> +{
>> +	struct sched_domain_shared *sd_share;
>> +
>> +	sd_share = rcu_dereference(per_cpu(sd_llc_shared, cpu));
>> +	if (!sd_share)
>> +		return false;
>> +
>> +	*util = READ_ONCE(sd_share->util_avg);
>> +	*cap = READ_ONCE(sd_share->capacity);
> 
> You placed capacity on a separate cache line, forcing the two reads
> above to touch 2 distinct cache lines. That seems... sub-optimal?
> 

The reason capacity was placed in a separate cache line
is that writes to capacity are not very frequent (CPU hotplug
should not happen too frequently), while writes to util_avg
are relatively frequent.
If capacity and util_avg were placed in the same cache line,
I'm thinking writes to util_avg might invalidate the entire
cache line. This could cause cache misses when capacity is
read elsewhere, i.e. false sharing?
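
To make the intended layout concrete, the annotation is meant to give
roughly the picture below (existing members elided, the placement is
only a sketch of the reasoning above):

	struct sched_domain_shared {
		/* ... existing members ... */
	#ifdef CONFIG_SCHED_CACHE
		/* checked/updated during every periodic load balance */
		unsigned long	util_avg;
		/*
		 * Rarely updated (capacity changes, e.g. CPU hotplug); the
		 * annotation starts a new cache line here so that util_avg
		 * stores do not invalidate the line that capacity readers
		 * have pulled in.
		 */
		unsigned long	capacity ____cacheline_aligned_in_smp;
	#endif
	};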

thanks,
Chenyu

>> +
>> +	return true;
>> +}
>> +#else
>> +static inline bool get_llc_stats(int cpu, unsigned long *util,
>> +				 unsigned long *cap)
>> +{
>> +	return false;
>> +}
>> +#endif
>>   /*
>>    * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
>>    */
>> @@ -10592,6 +10615,51 @@ sched_reduced_capacity(struct rq *rq, struct sched_domain *sd)
>>   	return check_cpu_capacity(rq, sd);
>>   }
>>   
>> +#ifdef CONFIG_SCHED_CACHE
>> +/*
>> + * Record the statistics for this scheduler group for later
>> + * use. These values guide load balancing when aggregating tasks
>> + * into an LLC.
>> + */
>> +static void record_sg_llc_stats(struct lb_env *env,
>> +				struct sg_lb_stats *sgs,
>> +				struct sched_group *group)
>> +{
>> +	struct sched_domain_shared *sd_share;
>> +
>> +	if (!sched_cache_enabled() || env->idle == CPU_NEWLY_IDLE)
>> +		return;
>> +
>> +	/* Only care about sched domain spanning multiple LLCs */
>> +	if (env->sd->child != rcu_dereference(per_cpu(sd_llc, env->dst_cpu)))
>> +		return;
>> +
>> +	/*
>> +	 * At this point we know this group spans a LLC domain.
>> +	 * Record the statistics of this group in its corresponding
>> +	 * shared LLC domain.
>> +	 * Note: sd_share cannot be obtained via sd->child->shared, because
>> +	 * it refers to the domain that covers the local group, while
>> +	 * sd_share could represent any of the LLC groups.
>> +	 */
>> +	sd_share = rcu_dereference(per_cpu(sd_llc_shared,
>> +					   cpumask_first(sched_group_span(group))));
>> +	if (!sd_share)
>> +		return;
>> +
>> +	if (READ_ONCE(sd_share->util_avg) != sgs->group_util)
>> +		WRITE_ONCE(sd_share->util_avg, sgs->group_util);
>> +
>> +	if (unlikely(READ_ONCE(sd_share->capacity) != sgs->group_capacity))
>> +		WRITE_ONCE(sd_share->capacity, sgs->group_capacity);
> 
> And same here.
> 
>> +}
>> +#else
>> +static inline void record_sg_llc_stats(struct lb_env *env, struct sg_lb_stats *sgs,
>> +				       struct sched_group *group)
>> +{
>> +}
>> +#endif
Re: [PATCH v2 02/23] sched/cache: Record per-LLC utilization to guide cache-aware scheduling decisions
Posted by Peter Zijlstra 1 week, 2 days ago
On Wed, Dec 10, 2025 at 11:02:39PM +0900, Chen, Yu C wrote:
> On 12/9/2025 8:21 PM, Peter Zijlstra wrote:
> > On Wed, Dec 03, 2025 at 03:07:21PM -0800, Tim Chen wrote:
> > 
> > > diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
> > > index bbcfdf12aa6e..0ba4697d74ba 100644
> > > --- a/include/linux/sched/topology.h
> > > +++ b/include/linux/sched/topology.h
> > > @@ -68,6 +68,10 @@ struct sched_domain_shared {
> > >   	atomic_t	nr_busy_cpus;
> > >   	int		has_idle_cores;
> > >   	int		nr_idle_scan;
> > > +#ifdef CONFIG_SCHED_CACHE
> > > +	unsigned long	util_avg;
> > > +	unsigned long	capacity ____cacheline_aligned_in_smp;
> > 
> > This cacheline annotation confuses me, see below.
> > 
> > > +#endif
> > >   };
> > >   struct sched_domain {
> > > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > > index cb82f558dc5b..b9f336300f14 100644
> > > --- a/kernel/sched/fair.c
> > > +++ b/kernel/sched/fair.c
> > > @@ -9622,6 +9622,29 @@ static inline int task_is_ineligible_on_dst_cpu(struct task_struct *p, int dest_
> > >   	return 0;
> > >   }
> > > +#ifdef CONFIG_SCHED_CACHE
> > > +/* Called from load balancing paths with rcu_read_lock held */
> > > +static __maybe_unused bool get_llc_stats(int cpu, unsigned long *util,
> > > +					 unsigned long *cap)
> > > +{
> > > +	struct sched_domain_shared *sd_share;
> > > +
> > > +	sd_share = rcu_dereference(per_cpu(sd_llc_shared, cpu));
> > > +	if (!sd_share)
> > > +		return false;
> > > +
> > > +	*util = READ_ONCE(sd_share->util_avg);
> > > +	*cap = READ_ONCE(sd_share->capacity);
> > 
> > You placed capacity on a separate cache line, forcing the two reads
> > above to touch 2 distinct cache lines. That seems... sub-optimal?
> > 
> 
> The reason capacity was placed in a separate cache line
> is that writes to capacity are not very frequent (CPU hotplug
> should not happen too frequently), while writes to util_avg
> are relatively frequent.
> If capacity and util_avg were placed in the same cache line,
> I'm thinking writes to util_avg might invalidate the entire
> cache line. This could cause cache misses when capacity is
> read elsewhere, i.e. false sharing?

But it's introduced here and already read/written together. Is this not
premature optimization?
Re: [PATCH v2 02/23] sched/cache: Record per-LLC utilization to guide cache-aware scheduling decisions
Posted by Chen, Yu C 1 week, 1 day ago
On 12/11/2025 12:13 AM, Peter Zijlstra wrote:
> On Wed, Dec 10, 2025 at 11:02:39PM +0900, Chen, Yu C wrote:
>> On 12/9/2025 8:21 PM, Peter Zijlstra wrote:
>>> On Wed, Dec 03, 2025 at 03:07:21PM -0800, Tim Chen wrote:
>>>
>>>> diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
>>>> index bbcfdf12aa6e..0ba4697d74ba 100644
>>>> --- a/include/linux/sched/topology.h
>>>> +++ b/include/linux/sched/topology.h
>>>> @@ -68,6 +68,10 @@ struct sched_domain_shared {
>>>>    	atomic_t	nr_busy_cpus;
>>>>    	int		has_idle_cores;
>>>>    	int		nr_idle_scan;
>>>> +#ifdef CONFIG_SCHED_CACHE
>>>> +	unsigned long	util_avg;
>>>> +	unsigned long	capacity ____cacheline_aligned_in_smp;
>>>
>>> This cacheline annotation confuses me, see below.
>>>
>>>> +#endif
>>>>    };
>>>>    struct sched_domain {
>>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>>> index cb82f558dc5b..b9f336300f14 100644
>>>> --- a/kernel/sched/fair.c
>>>> +++ b/kernel/sched/fair.c
>>>> @@ -9622,6 +9622,29 @@ static inline int task_is_ineligible_on_dst_cpu(struct task_struct *p, int dest_
>>>>    	return 0;
>>>>    }
>>>> +#ifdef CONFIG_SCHED_CACHE
>>>> +/* Called from load balancing paths with rcu_read_lock held */
>>>> +static __maybe_unused bool get_llc_stats(int cpu, unsigned long *util,
>>>> +					 unsigned long *cap)
>>>> +{
>>>> +	struct sched_domain_shared *sd_share;
>>>> +
>>>> +	sd_share = rcu_dereference(per_cpu(sd_llc_shared, cpu));
>>>> +	if (!sd_share)
>>>> +		return false;
>>>> +
>>>> +	*util = READ_ONCE(sd_share->util_avg);
>>>> +	*cap = READ_ONCE(sd_share->capacity);
>>>
>>> You placed capacity on a separate cache line, forcing the two reads
>>> above to touch 2 distinct cache lines. That seems... sub-optimal?
>>>
>>
>> The reason capacity was placed in a separate cache line
>> is that writes to capacity are not very frequent (CPU hotplug
>> should not happen too frequently), while writes to util_avg
>> are relatively frequent.
>> If capacity and util_avg were placed in the same cache line,
>> I'm thinking writes to util_avg might invalidate the entire
>> cache line. This could cause cache misses when capacity is
>> read elsewhere, i.e. false sharing?
> 
> But it's introduced here and already read/written together. Is this not
> premature optimization?

I see. Since they are read together, there could be a pre-load of
adjacent cache lines, I suppose. I'll remove this cache alignment and
check whether there is any performance impact.
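
A minimal sketch of what dropping the annotation would look like
(whether this actually lands in v3 depends on those measurements):

	#ifdef CONFIG_SCHED_CACHE
		/*
		 * Both fields on one cache line: they are read together in
		 * get_llc_stats() and updated together in record_sg_llc_stats().
		 */
		unsigned long	util_avg;
		unsigned long	capacity;
	#endif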

Thanks,
Chenyu