Cache Aware Scheduling

[PATCH v3 05/21] sched/cache: Assign preferred LLC ID to processes

Posted by Tim Chen 1 month, 2 weeks ago

With cache-aware scheduling enabled, each task is assigned a
preferred LLC ID. This allows quick identification of the LLC domain
where the task prefers to run, similar to numa_preferred_nid in
NUMA balancing.

Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
---

Notes:
    v2->v3:
    Add comments around code handling NUMA balance conflict with cache aware
    scheduling. (Peter Zijlstra)
    
    Check if NUMA balancing is disabled before checking numa_preferred_nid
    (Jianyong Wu)

 include/linux/sched.h |  1 +
 init/init_task.c      |  3 +++
 kernel/sched/fair.c   | 42 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 46 insertions(+)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2817a21ee055..c98bd1c46088 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1411,6 +1411,7 @@ struct task_struct {
 
 #ifdef CONFIG_SCHED_CACHE
 	struct callback_head		cache_work;
+	int				preferred_llc;
 #endif
 
 	struct rseq_data		rseq;
diff --git a/init/init_task.c b/init/init_task.c
index 49b13d7c3985..baa420de2644 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -218,6 +218,9 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = {
 	.numa_group	= NULL,
 	.numa_faults	= NULL,
 #endif
+#ifdef CONFIG_SCHED_CACHE
+	.preferred_llc  = -1,
+#endif
 #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
 	.kasan_depth	= 1,
 #endif
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index bf5f39a01017..0b4ed0f2809d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1273,11 +1273,43 @@ static unsigned long fraction_mm_sched(struct rq *rq,
 	return div64_u64(NICE_0_LOAD * pcpu_sched->runtime, rq->cpu_runtime + 1);
 }
 
+static int get_pref_llc(struct task_struct *p, struct mm_struct *mm)
+{
+	int mm_sched_llc = -1;
+
+	if (!mm)
+		return -1;
+
+	if (mm->sc_stat.cpu != -1) {
+		mm_sched_llc = llc_id(mm->sc_stat.cpu);
+
+#ifdef CONFIG_NUMA_BALANCING
+		/*
+		 * Don't assign preferred LLC if it
+		 * conflicts with NUMA balancing.
+		 * This can happen when sched_setnuma() gets
+		 * called, however it is not much of an issue
+		 * because we expect account_mm_sched() to get
+		 * called fairly regularly -- at a higher rate
+		 * than sched_setnuma() at least -- and thus the
+		 * conflict only exists for a short period of time.
+		 */
+		if (static_branch_likely(&sched_numa_balancing) &&
+		    p->numa_preferred_nid >= 0 &&
+		    cpu_to_node(mm->sc_stat.cpu) != p->numa_preferred_nid)
+			mm_sched_llc = -1;
+#endif
+	}
+
+	return mm_sched_llc;
+}
+
 static inline
 void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
 {
 	struct sched_cache_time *pcpu_sched;
 	struct mm_struct *mm = p->mm;
+	int mm_sched_llc = -1;
 	unsigned long epoch;
 
 	if (!sched_cache_enabled())
@@ -1311,6 +1343,11 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
 		if (mm->sc_stat.cpu != -1)
 			mm->sc_stat.cpu = -1;
 	}
+
+	mm_sched_llc = get_pref_llc(p, mm);
+
+	if (p->preferred_llc != mm_sched_llc)
+		p->preferred_llc = mm_sched_llc;
 }
 
 static void task_tick_cache(struct rq *rq, struct task_struct *p)
@@ -1440,6 +1477,11 @@ void init_sched_mm(struct task_struct *p) { }
 
 static void task_tick_cache(struct rq *rq, struct task_struct *p) { }
 
+static inline int get_pref_llc(struct task_struct *p,
+			       struct mm_struct *mm)
+{
+	return -1;
+}
 #endif
 
 /*
-- 
2.32.0

Re: [PATCH v3 05/21] sched/cache: Assign preferred LLC ID to processes

Posted by Madadi Vineeth Reddy 1 month, 2 weeks ago

On 11/02/26 03:48, Tim Chen wrote:
> With cache-aware scheduling enabled, each task is assigned a
> preferred LLC ID. This allows quick identification of the LLC domain
> where the task prefers to run, similar to numa_preferred_nid in
> NUMA balancing.
> 
> Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
> ---
> 
> Notes:
>     v2->v3:
>     Add comments around code handling NUMA balance conflict with cache aware
>     scheduling. (Peter Zijlstra)
>     
>     Check if NUMA balancing is disabled before checking numa_preferred_nid
>     (Jianyong Wu)
> 
>  include/linux/sched.h |  1 +
>  init/init_task.c      |  3 +++
>  kernel/sched/fair.c   | 42 ++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 46 insertions(+)
> 
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 2817a21ee055..c98bd1c46088 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1411,6 +1411,7 @@ struct task_struct {
>  
>  #ifdef CONFIG_SCHED_CACHE
>  	struct callback_head		cache_work;
> +	int				preferred_llc;
>  #endif
>  
>  	struct rseq_data		rseq;
> diff --git a/init/init_task.c b/init/init_task.c
> index 49b13d7c3985..baa420de2644 100644
> --- a/init/init_task.c
> +++ b/init/init_task.c
> @@ -218,6 +218,9 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = {
>  	.numa_group	= NULL,
>  	.numa_faults	= NULL,
>  #endif
> +#ifdef CONFIG_SCHED_CACHE
> +	.preferred_llc  = -1,
> +#endif
>  #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
>  	.kasan_depth	= 1,
>  #endif
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index bf5f39a01017..0b4ed0f2809d 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -1273,11 +1273,43 @@ static unsigned long fraction_mm_sched(struct rq *rq,
>  	return div64_u64(NICE_0_LOAD * pcpu_sched->runtime, rq->cpu_runtime + 1);
>  }
>  
> +static int get_pref_llc(struct task_struct *p, struct mm_struct *mm)
> +{
> +	int mm_sched_llc = -1;
> +
> +	if (!mm)
> +		return -1;
> +
> +	if (mm->sc_stat.cpu != -1) {
> +		mm_sched_llc = llc_id(mm->sc_stat.cpu);
> +
> +#ifdef CONFIG_NUMA_BALANCING
> +		/*
> +		 * Don't assign preferred LLC if it
> +		 * conflicts with NUMA balancing.
> +		 * This can happen when sched_setnuma() gets
> +		 * called, however it is not much of an issue
> +		 * because we expect account_mm_sched() to get
> +		 * called fairly regularly -- at a higher rate
> +		 * than sched_setnuma() at least -- and thus the
> +		 * conflict only exists for a short period of time.
> +		 */
> +		if (static_branch_likely(&sched_numa_balancing) &&
> +		    p->numa_preferred_nid >= 0 &&
> +		    cpu_to_node(mm->sc_stat.cpu) != p->numa_preferred_nid)
> +			mm_sched_llc = -1;
> +#endif
> +	}
> +
> +	return mm_sched_llc;
> +}
> +
>  static inline
>  void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
>  {
>  	struct sched_cache_time *pcpu_sched;
>  	struct mm_struct *mm = p->mm;
> +	int mm_sched_llc = -1;
>  	unsigned long epoch;
>  
>  	if (!sched_cache_enabled())
> @@ -1311,6 +1343,11 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
>  		if (mm->sc_stat.cpu != -1)
>  			mm->sc_stat.cpu = -1;
>  	}
> +
> +	mm_sched_llc = get_pref_llc(p, mm);
> +
> +	if (p->preferred_llc != mm_sched_llc)
> +		p->preferred_llc = mm_sched_llc;

This writes to p->preferred_llc without using WRITE_ONCE(). If later patches read p->preferred_llc from
load balancing or migration paths on other CPUs, wouldn't this create a data race?

For example:
CPU 0: Task is running, account_mm_sched() writes p->preferred_llc
CPU 1: Load balancer reads p->preferred_llc to make migration decisions

Should this use WRITE_ONCE() and READ_ONCE() at the read sites, unless all accesses are guaranteed to be
under rq->lock?

Thanks,
Vineeth

>  }
>  
>  static void task_tick_cache(struct rq *rq, struct task_struct *p)
> @@ -1440,6 +1477,11 @@ void init_sched_mm(struct task_struct *p) { }
>  
>  static void task_tick_cache(struct rq *rq, struct task_struct *p) { }
>  
> +static inline int get_pref_llc(struct task_struct *p,
> +			       struct mm_struct *mm)
> +{
> +	return -1;
> +}
>  #endif
>  
>  /*

Re: [PATCH v3 05/21] sched/cache: Assign preferred LLC ID to processes

Posted by Chen, Yu C 1 month, 2 weeks ago

On 2/15/2026 2:36 AM, Madadi Vineeth Reddy wrote:
> On 11/02/26 03:48, Tim Chen wrote:

[ ... ]

>> @@ -1311,6 +1343,11 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
>>   		if (mm->sc_stat.cpu != -1)
>>   			mm->sc_stat.cpu = -1;
>>   	}
>> +
>> +	mm_sched_llc = get_pref_llc(p, mm);
>> +
>> +	if (p->preferred_llc != mm_sched_llc)
>> +		p->preferred_llc = mm_sched_llc;
> 
> This writes to p->preferred_llc without using WRITE_ONCE(). If later patches read p->preferred_llc from
> load balancing or migration paths on other CPUs, wouldn't this create a data race?
> 

I suppose you are referring to data inconsistency between CPUs, as
READ/WRITE_ONCE() make sure that the latest data is always read
from/written to memory rather than from/to registers.

> For example:
> CPU 0: Task is running, account_mm_sched() writes p->preferred_llc
> CPU 1: Load balancer reads p->preferred_llc to make migration decisions
> 
> Should this use WRITE_ONCE() and READ_ONCE() at the read sites, unless all accesses are guaranteed to be
> under rq->lock?
> 

Actually, I found that p->preferred_llc is only read during task
enqueue/dequeue in account_llc_enqueue() and account_llc_dequeue(),
which are protected by the rq lock. However, after reviewing the code
again, I noticed that in migrate_degrades_llc() (part of the load
balance logic), we should check the task's preferred_llc instead of the
task's current LLC obtained via task_llc(p). We will therefore switch to
using READ/WRITE_ONCE for accesses to this variable in
migrate_degrades_llc().

thanks,
Chenyu