[PATCH v3 07/21] sched/cache: Introduce per CPU's tasks LLC preference counter

Posted by Tim Chen 1 month, 2 weeks ago
The lowest-level sched domain of each CPU is assigned an array in
which each element tracks the number of tasks preferring a given LLC,
indexed from 0 to max_llcs - 1. Since each CPU has its own
lowest-level sd, each CPU ends up with a dedicated set of task
LLC preference counters.

For example, sd->pf[3] = 2 means that two tasks on this
runqueue prefer to run within LLC3.
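
As a sketch of the intended accounting (hypothetical helper and field
names; the actual statistics are added in a subsequent patch), the
counter would be adjusted as tasks enter and leave the runqueue:

	/*
	 * Hypothetical sketch only -- not part of this patch. Assumes a
	 * p->preferred_llc field holding the task's preferred LLC id, or
	 * -1 if it has none. Called with the rq lock held.
	 */
	static inline void account_llc_pref(struct rq *rq,
					    struct task_struct *p, int delta)
	{
		struct sched_domain *sd = rq->sd;

		/* one unsigned compare rejects both -1 and out-of-range ids */
		if (sd && sd->pf && (unsigned int)p->preferred_llc < max_llcs)
			sd->pf[p->preferred_llc] += delta; /* +1 enqueue, -1 dequeue */
	}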

The load balancer can use this information to identify busy
runqueues and migrate tasks to their preferred LLC domains.
This array will be reallocated at runtime during sched domain
rebuild.

Introduce the buffer allocation mechanism here; the statistics
themselves will be calculated in a subsequent patch.

Note: the LLC preference statistics of each CPU are reset on
sched domain rebuild and may undercount temporarily, until the
CPU becomes idle and the count is cleared. This is a trade-off
to avoid complex data synchronization across sched domain rebuilds.

Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Suggested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Co-developed-by: Chen Yu <yu.c.chen@intel.com>
Signed-off-by: Chen Yu <yu.c.chen@intel.com>
Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
---

Notes:
    v2->v3:
    Allocate preferred LLC buffer in rq->sd rather than
    the rq. That way it automagically gets reallocated
    and the old buffer gets recycled during sched domain rebuild.
    (Peter Zijlstra)

 include/linux/sched/topology.h |  4 +++
 kernel/sched/sched.h           |  2 ++
 kernel/sched/topology.c        | 64 +++++++++++++++++++++++++++++++++-
 3 files changed, 69 insertions(+), 1 deletion(-)

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index a4e2fb31f2fd..3aa6c101b2e4 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -102,6 +102,10 @@ struct sched_domain {
 	u64 max_newidle_lb_cost;
 	unsigned long last_decay_max_lb_cost;
 
+#ifdef CONFIG_SCHED_CACHE
+	unsigned int *pf;
+#endif
+
 #ifdef CONFIG_SCHEDSTATS
 	/* sched_balance_rq() stats */
 	unsigned int lb_count[CPU_MAX_IDLE_TYPES];
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 35cea6aa32a4..ac8c7ac1ac0d 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3903,6 +3903,8 @@ static inline void mm_cid_switch_to(struct task_struct *prev, struct task_struct
 #endif /* !CONFIG_SCHED_MM_CID */
 
 #ifdef CONFIG_SCHED_CACHE
+extern int max_llcs;
+
 static inline bool sched_cache_enabled(void)
 {
 	return false;
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index ca46b5cf7f78..dae78b5915a7 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -21,6 +21,7 @@ void sched_domains_mutex_unlock(void)
 static cpumask_var_t sched_domains_tmpmask;
 static cpumask_var_t sched_domains_tmpmask2;
 static int tl_max_llcs;
+int max_llcs;
 
 static int __init sched_debug_setup(char *str)
 {
@@ -628,6 +629,11 @@ static void destroy_sched_domain(struct sched_domain *sd)
 
 	if (sd->shared && atomic_dec_and_test(&sd->shared->ref))
 		kfree(sd->shared);
+
+#ifdef CONFIG_SCHED_CACHE
+	/* only the bottom sd has the pf array */
+	kfree(sd->pf);
+#endif
 	kfree(sd);
 }
 
@@ -747,10 +753,15 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
 	if (sd && sd_degenerate(sd)) {
 		tmp = sd;
 		sd = sd->parent;
-		destroy_sched_domain(tmp);
+
 		if (sd) {
 			struct sched_group *sg = sd->groups;
 
+#ifdef CONFIG_SCHED_CACHE
+			/* move pf to parent as child is being destroyed */
+			sd->pf = tmp->pf;
+			tmp->pf = NULL;
+#endif
 			/*
 			 * sched groups hold the flags of the child sched
 			 * domain for convenience. Clear such flags since
@@ -762,6 +773,8 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
 
 			sd->child = NULL;
 		}
+
+		destroy_sched_domain(tmp);
 	}
 
 	sched_domain_debug(sd, cpu);
@@ -787,6 +800,46 @@ enum s_alloc {
 	sa_none,
 };
 
+#ifdef CONFIG_SCHED_CACHE
+static bool alloc_sd_pref(const struct cpumask *cpu_map,
+			  struct s_data *d)
+{
+	struct sched_domain *sd;
+	unsigned int *pf;
+	int i;
+
+	for_each_cpu(i, cpu_map) {
+		sd = *per_cpu_ptr(d->sd, i);
+		if (!sd)
+			goto err;
+
+		pf = kcalloc(tl_max_llcs, sizeof(unsigned int), GFP_KERNEL);
+		if (!pf)
+			goto err;
+
+		sd->pf = pf;
+	}
+
+	return true;
+err:
+	for_each_cpu(i, cpu_map) {
+		sd = *per_cpu_ptr(d->sd, i);
+		if (sd) {
+			kfree(sd->pf);
+			sd->pf = NULL;
+		}
+	}
+
+	return false;
+}
+#else
+static bool alloc_sd_pref(const struct cpumask *cpu_map,
+			  struct s_data *d)
+{
+	return false;
+}
+#endif
+
 /*
  * Return the canonical balance CPU for this group, this is the first CPU
  * of this group that's also in the balance mask.
@@ -2710,6 +2763,8 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
 		}
 	}
 
+	alloc_sd_pref(cpu_map, &d);
+
 	/* Attach the domains */
 	rcu_read_lock();
 	for_each_cpu(i, cpu_map) {
@@ -2723,6 +2778,13 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
 	}
 	rcu_read_unlock();
 
+	/*
+	 * Ensure we see enlarged sd->pf when we use new llc_ids and
+	 * bigger max_llcs.
+	 */
+	smp_mb();
+	max_llcs = tl_max_llcs;
+
 	if (has_asym)
 		static_branch_inc_cpuslocked(&sched_asym_cpucapacity);
 
-- 
2.32.0
Re: [PATCH v3 07/21] sched/cache: Introduce per CPU's tasks LLC preference counter
Posted by Peter Zijlstra 1 month, 1 week ago
On Tue, Feb 10, 2026 at 02:18:47PM -0800, Tim Chen wrote:
> diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
> index a4e2fb31f2fd..3aa6c101b2e4 100644
> --- a/include/linux/sched/topology.h
> +++ b/include/linux/sched/topology.h
> @@ -102,6 +102,10 @@ struct sched_domain {
>  	u64 max_newidle_lb_cost;
>  	unsigned long last_decay_max_lb_cost;
>  
> +#ifdef CONFIG_SCHED_CACHE
> +	unsigned int *pf;

So I'm all for short names; but perhaps this could be better. When
reading this my brain went page-fault, and then WTF :-)

> +#endif
> +
>  #ifdef CONFIG_SCHEDSTATS
>  	/* sched_balance_rq() stats */
>  	unsigned int lb_count[CPU_MAX_IDLE_TYPES];

> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> index ca46b5cf7f78..dae78b5915a7 100644
> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c

> @@ -2723,6 +2778,13 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
>  	}
>  	rcu_read_unlock();
>  
> +	/*
> +	 * Ensure we see enlarged sd->pf when we use new llc_ids and
> +	 * bigger max_llcs.
> +	 */
> +	smp_mb();
> +	max_llcs = tl_max_llcs;

This seems wrong. This is *after* cpu_attach_domain() which publishes
@sd. How about you do something like:

struct sched_domain {
	...

	unsigned int llc_max;
	unsigned int *llc_counts __counted_by(llc_max);
}

Then you always carry matching information that is published together.

>  	if (has_asym)
>  		static_branch_inc_cpuslocked(&sched_asym_cpucapacity);
>  
> -- 
> 2.32.0
>
Re: [PATCH v3 07/21] sched/cache: Introduce per CPU's tasks LLC preference counter
Posted by Chen, Yu C 1 month, 1 week ago
On 2/20/2026 6:45 PM, Peter Zijlstra wrote:
> On Tue, Feb 10, 2026 at 02:18:47PM -0800, Tim Chen wrote:
>> diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
>> index a4e2fb31f2fd..3aa6c101b2e4 100644
>> --- a/include/linux/sched/topology.h
>> +++ b/include/linux/sched/topology.h
>> @@ -102,6 +102,10 @@ struct sched_domain {
>>   	u64 max_newidle_lb_cost;
>>   	unsigned long last_decay_max_lb_cost;
>>   
>> +#ifdef CONFIG_SCHED_CACHE
>> +	unsigned int *pf;
> 
> So I'm all for short names; but perhaps this could be better. When
> reading this my brain went page-fault, and then WTF :-)
> 

OK, I assume you are suggesting renaming it to llc_counts.

>> +#endif
>> +
>>   #ifdef CONFIG_SCHEDSTATS
>>   	/* sched_balance_rq() stats */
>>   	unsigned int lb_count[CPU_MAX_IDLE_TYPES];
> 
>> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
>> index ca46b5cf7f78..dae78b5915a7 100644
>> --- a/kernel/sched/topology.c
>> +++ b/kernel/sched/topology.c
> 
>> @@ -2723,6 +2778,13 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
>>   	}
>>   	rcu_read_unlock();
>>   
>> +	/*
>> +	 * Ensure we see enlarged sd->pf when we use new llc_ids and
>> +	 * bigger max_llcs.
>> +	 */
>> +	smp_mb();
>> +	max_llcs = tl_max_llcs;
> 
> This seems wrong. This is *after* cpu_attach_domain() which publishes
> @sd. How about you do something like:
> 
> struct sched_domain {
> 	...
> 
> 	unsigned int llc_max;
> 	unsigned int *llc_counts __counted_by(llc_max);
> }
> 
> Then you always carry matching information that is published together.
>

OK, we will change it accordingly.

Additionally, with this change we should be able to safely read
the data in the sched_domain by verifying whether the target llc_id
falls within the valid range (to avoid a race condition):

   CPU0                                CPU1
   :
   ...
   build_sched_domains                 update_sg_lb_stats
                                         for_each_cpu_and(i, sg)
                                           sd=rq[i]->sd
     per_cpu(sd_llc_id,i)=new_llc
                                           llc=llc_id(i)
                                           if(llc<sd->llc_max)
                                             safe read sd->pf[llc]
     alloc_sd_pref(cpu_map)
       sd->llc_counts=kzalloc()
       sd->llc_max=max_llc

Thanks,
Chenyu
Re: [PATCH v3 07/21] sched/cache: Introduce per CPU's tasks LLC preference counter
Posted by Peter Zijlstra 1 month, 1 week ago
On Sat, Feb 21, 2026 at 12:57:38AM +0800, Chen, Yu C wrote:

>                                           llc=llc_id(i)
>                                           if(llc<sd->llc_max)
>                                             safe read sd->pf[llc]

Right, except llc_id() is allowed to return negative, so that would need
to be something like:

	if ((unsigned)llc < sd->llc_max)
		sd->llc_counts[llc]
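
Putting the two suggestions together (the llc_counts/llc_max rename
plus the unsigned bounds check), a reader-side sketch for v4 could
look like this (hypothetical helper name):

	/*
	 * Sketch only, assuming the suggested rename. Since llc_max is
	 * published together with llc_counts in the same sd, any id that
	 * passes the bounds check indexes a valid slot even across sched
	 * domain rebuilds; llc_id() returning -1 is rejected by the
	 * unsigned compare as well.
	 */
	static inline unsigned int sd_llc_pref_count(struct sched_domain *sd,
						     int llc)
	{
		if ((unsigned int)llc < sd->llc_max)
			return READ_ONCE(sd->llc_counts[llc]);
		return 0;
	}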