[v2] Cache aware scheduling

[PATCH v2 05/23] sched/cache: Assign preferred LLC ID to processes

Posted by Tim Chen 2 months ago

With cache-aware scheduling enabled, each task is assigned a
preferred LLC ID. This allows quick identification of the LLC domain
where the task prefers to run, similar to numa_preferred_nid in
NUMA balancing.

Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
---

Notes:
    v1->v2: Align preferred LLC with NUMA balancing's preferred node.

 include/linux/sched.h |  1 +
 init/init_task.c      |  3 +++
 kernel/sched/fair.c   | 18 ++++++++++++++++++
 3 files changed, 22 insertions(+)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 278b529c91df..1ad46220cd04 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1408,6 +1408,7 @@ struct task_struct {
 
 #ifdef CONFIG_SCHED_CACHE
 	struct callback_head		cache_work;
+	int				preferred_llc;
 #endif
 
 #ifdef CONFIG_RSEQ
diff --git a/init/init_task.c b/init/init_task.c
index a55e2189206f..44bae72b5b7d 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -191,6 +191,9 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = {
 	.numa_group	= NULL,
 	.numa_faults	= NULL,
 #endif
+#ifdef CONFIG_SCHED_CACHE
+	.preferred_llc  = -1,
+#endif
 #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
 	.kasan_depth	= 1,
 #endif
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0a3918269906..10cec83f65d5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1300,6 +1300,7 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
 	struct mm_struct *mm = p->mm;
 	struct mm_sched *pcpu_sched;
 	unsigned long epoch;
+	int mm_sched_llc = -1;
 
 	if (!sched_cache_enabled())
 		return;
@@ -1330,6 +1331,23 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
 		if (mm->mm_sched_cpu != -1)
 			mm->mm_sched_cpu = -1;
 	}
+
+	if (mm->mm_sched_cpu != -1) {
+		mm_sched_llc = llc_id(mm->mm_sched_cpu);
+
+#ifdef CONFIG_NUMA_BALANCING
+		/*
+		 * Don't assign preferred LLC if it
+		 * conflicts with NUMA balancing.
+		 */
+		if (p->numa_preferred_nid >= 0 &&
+		    cpu_to_node(mm->mm_sched_cpu) != p->numa_preferred_nid)
+			mm_sched_llc = -1;
+#endif
+	}
+
+	if (p->preferred_llc != mm_sched_llc)
+		p->preferred_llc = mm_sched_llc;
 }
 
 static void task_tick_cache(struct rq *rq, struct task_struct *p)
-- 
2.32.0

Re: [PATCH v2 05/23] sched/cache: Assign preferred LLC ID to processes

Posted by Vern Hao 1 month, 4 weeks ago

On 2025/12/4 07:07, Tim Chen wrote:
> With cache-aware scheduling enabled, each task is assigned a
> preferred LLC ID. This allows quick identification of the LLC domain
> where the task prefers to run, similar to numa_preferred_nid in
> NUMA balancing.
>
> Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
> ---
>
> Notes:
>      v1->v2: Align preferred LLC with NUMA balancing's preferred node.
>
>   include/linux/sched.h |  1 +
>   init/init_task.c      |  3 +++
>   kernel/sched/fair.c   | 18 ++++++++++++++++++
>   3 files changed, 22 insertions(+)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 278b529c91df..1ad46220cd04 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1408,6 +1408,7 @@ struct task_struct {
>   
>   #ifdef CONFIG_SCHED_CACHE
>   	struct callback_head		cache_work;
> +	int				preferred_llc;
>   #endif
>   
>   #ifdef CONFIG_RSEQ
> diff --git a/init/init_task.c b/init/init_task.c
> index a55e2189206f..44bae72b5b7d 100644
> --- a/init/init_task.c
> +++ b/init/init_task.c
> @@ -191,6 +191,9 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = {
>   	.numa_group	= NULL,
>   	.numa_faults	= NULL,
>   #endif
> +#ifdef CONFIG_SCHED_CACHE
> +	.preferred_llc  = -1,
> +#endif
>   #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
>   	.kasan_depth	= 1,
>   #endif
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 0a3918269906..10cec83f65d5 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -1300,6 +1300,7 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
>   	struct mm_struct *mm = p->mm;
>   	struct mm_sched *pcpu_sched;
>   	unsigned long epoch;
> +	int mm_sched_llc = -1;
>   
>   	if (!sched_cache_enabled())
>   		return;
> @@ -1330,6 +1331,23 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
>   		if (mm->mm_sched_cpu != -1)
>   			mm->mm_sched_cpu = -1;
>   	}
> +
> +	if (mm->mm_sched_cpu != -1) {
> +		mm_sched_llc = llc_id(mm->mm_sched_cpu);
> +
> +#ifdef CONFIG_NUMA_BALANCING
> +		/*
> +		 * Don't assign preferred LLC if it
> +		 * conflicts with NUMA balancing.
> +		 */
> +		if (p->numa_preferred_nid >= 0 &&

I wonder if the restriction here shouldn't be so strict. In Mel Gorman's 
patch (e496132ebedd sched/fair: Adjust the allowed NUMA imbalance when 
SD_NUMA spans multiple LLCs), the value of the 'imb_numa_nr' is checked 
to determine if |SD_NUMA| imbalance is allowed. Could we use this same 
check to decide whether or not to perform a cross-numa migration?

> +		    cpu_to_node(mm->mm_sched_cpu) != p->numa_preferred_nid)
> +			mm_sched_llc = -1;
> +#endif
> +	}
> +
> +	if (p->preferred_llc != mm_sched_llc)
> +		p->preferred_llc = mm_sched_llc;
>   }
>   
>   static void task_tick_cache(struct rq *rq, struct task_struct *p)

Re: [PATCH v2 05/23] sched/cache: Assign preferred LLC ID to processes

Posted by Tim Chen 1 month, 3 weeks ago

On Fri, 2025-12-12 at 11:34 +0800, Vern Hao wrote:
> On 2025/12/4 07:07, Tim Chen wrote:
> > With cache-aware scheduling enabled, each task is assigned a
> > preferred LLC ID. This allows quick identification of the LLC domain
> > where the task prefers to run, similar to numa_preferred_nid in
> > NUMA balancing.
> > 
> > Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
> > ---
> > 
> > Notes:
> >      v1->v2: Align preferred LLC with NUMA balancing's preferred node.
> > 
> >   include/linux/sched.h |  1 +
> >   init/init_task.c      |  3 +++
> >   kernel/sched/fair.c   | 18 ++++++++++++++++++
> >   3 files changed, 22 insertions(+)
> > 
> > diff --git a/include/linux/sched.h b/include/linux/sched.h
> > index 278b529c91df..1ad46220cd04 100644
> > --- a/include/linux/sched.h
> > +++ b/include/linux/sched.h
> > @@ -1408,6 +1408,7 @@ struct task_struct {
> >   
> >   #ifdef CONFIG_SCHED_CACHE
> >   	struct callback_head		cache_work;
> > +	int				preferred_llc;
> >   #endif
> >   
> >   #ifdef CONFIG_RSEQ
> > diff --git a/init/init_task.c b/init/init_task.c
> > index a55e2189206f..44bae72b5b7d 100644
> > --- a/init/init_task.c
> > +++ b/init/init_task.c
> > @@ -191,6 +191,9 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = {
> >   	.numa_group	= NULL,
> >   	.numa_faults	= NULL,
> >   #endif
> > +#ifdef CONFIG_SCHED_CACHE
> > +	.preferred_llc  = -1,
> > +#endif
> >   #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
> >   	.kasan_depth	= 1,
> >   #endif
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index 0a3918269906..10cec83f65d5 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -1300,6 +1300,7 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
> >   	struct mm_struct *mm = p->mm;
> >   	struct mm_sched *pcpu_sched;
> >   	unsigned long epoch;
> > +	int mm_sched_llc = -1;
> >   
> >   	if (!sched_cache_enabled())
> >   		return;
> > @@ -1330,6 +1331,23 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
> >   		if (mm->mm_sched_cpu != -1)
> >   			mm->mm_sched_cpu = -1;
> >   	}
> > +
> > +	if (mm->mm_sched_cpu != -1) {
> > +		mm_sched_llc = llc_id(mm->mm_sched_cpu);
> > +
> > +#ifdef CONFIG_NUMA_BALANCING
> > +		/*
> > +		 * Don't assign preferred LLC if it
> > +		 * conflicts with NUMA balancing.
> > +		 */
> > +		if (p->numa_preferred_nid >= 0 &&
> 
> I wonder if the restriction here shouldn't be so strict. In Mel Gorman's 
> patch (e496132ebedd sched/fair: Adjust the allowed NUMA imbalance when 
> SD_NUMA spans multiple LLCs), the value of the 'imb_numa_nr' is checked 
> to determine if |SD_NUMA| imbalance is allowed. Could we use this same 
> check to decide whether or not to perform a cross-numa migration?

If we set the preferred LLC that's in a different node other than the preferred
node, the preferred LLC is going to fight with NUMA balancing and bounce
tasks back and forth between nodes. NUMA locality is going to affect performance
more so we'll let NUMA preference take precedence.

Tim 

> 
> > +		    cpu_to_node(mm->mm_sched_cpu) != p->numa_preferred_nid)
> > +			mm_sched_llc = -1;
> > +#endif
> > +	}
> > +
> > +	if (p->preferred_llc != mm_sched_llc)
> > +		p->preferred_llc = mm_sched_llc;
> >   }
> >   
> >   static void task_tick_cache(struct rq *rq, struct task_struct *p)

Re: [PATCH v2 05/23] sched/cache: Assign preferred LLC ID to processes

Posted by Vern Hao 1 month, 3 weeks ago

On 2025/12/16 03:32, Tim Chen wrote:
> On Fri, 2025-12-12 at 11:34 +0800, Vern Hao wrote:
>> On 2025/12/4 07:07, Tim Chen wrote:
>>> With cache-aware scheduling enabled, each task is assigned a
>>> preferred LLC ID. This allows quick identification of the LLC domain
>>> where the task prefers to run, similar to numa_preferred_nid in
>>> NUMA balancing.
>>>
>>> Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
>>> ---
>>>
>>> Notes:
>>>       v1->v2: Align preferred LLC with NUMA balancing's preferred node.
>>>
>>>    include/linux/sched.h |  1 +
>>>    init/init_task.c      |  3 +++
>>>    kernel/sched/fair.c   | 18 ++++++++++++++++++
>>>    3 files changed, 22 insertions(+)
>>>
>>> diff --git a/include/linux/sched.h b/include/linux/sched.h
>>> index 278b529c91df..1ad46220cd04 100644
>>> --- a/include/linux/sched.h
>>> +++ b/include/linux/sched.h
>>> @@ -1408,6 +1408,7 @@ struct task_struct {
>>>    
>>>    #ifdef CONFIG_SCHED_CACHE
>>>    	struct callback_head		cache_work;
>>> +	int				preferred_llc;
>>>    #endif
>>>    
>>>    #ifdef CONFIG_RSEQ
>>> diff --git a/init/init_task.c b/init/init_task.c
>>> index a55e2189206f..44bae72b5b7d 100644
>>> --- a/init/init_task.c
>>> +++ b/init/init_task.c
>>> @@ -191,6 +191,9 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = {
>>>    	.numa_group	= NULL,
>>>    	.numa_faults	= NULL,
>>>    #endif
>>> +#ifdef CONFIG_SCHED_CACHE
>>> +	.preferred_llc  = -1,
>>> +#endif
>>>    #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
>>>    	.kasan_depth	= 1,
>>>    #endif
>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>> index 0a3918269906..10cec83f65d5 100644
>>> --- a/kernel/sched/fair.c
>>> +++ b/kernel/sched/fair.c
>>> @@ -1300,6 +1300,7 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
>>>    	struct mm_struct *mm = p->mm;
>>>    	struct mm_sched *pcpu_sched;
>>>    	unsigned long epoch;
>>> +	int mm_sched_llc = -1;
>>>    
>>>    	if (!sched_cache_enabled())
>>>    		return;
>>> @@ -1330,6 +1331,23 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
>>>    		if (mm->mm_sched_cpu != -1)
>>>    			mm->mm_sched_cpu = -1;
>>>    	}
>>> +
>>> +	if (mm->mm_sched_cpu != -1) {
>>> +		mm_sched_llc = llc_id(mm->mm_sched_cpu);
>>> +
>>> +#ifdef CONFIG_NUMA_BALANCING
>>> +		/*
>>> +		 * Don't assign preferred LLC if it
>>> +		 * conflicts with NUMA balancing.
>>> +		 */
>>> +		if (p->numa_preferred_nid >= 0 &&
>> I wonder if the restriction here shouldn't be so strict. In Mel Gorman's
>> patch (e496132ebedd sched/fair: Adjust the allowed NUMA imbalance when
>> SD_NUMA spans multiple LLCs), the value of the 'imb_numa_nr' is checked
>> to determine if |SD_NUMA| imbalance is allowed. Could we use this same
>> check to decide whether or not to perform a cross-numa migration?
> If we set the preferred LLC that's in a different node other than the preferred
> node, the preferred LLC is going to fight with NUMA balancing and bounce
> tasks back and forth between nodes. NUMA locality is going to affect performance
> more so we'll let NUMA preference take precedence.

I might not have explained myself clearly. I’m questioning whether we 
need to integrate an imbalance check into the 'sgs->group_type|' is 
’group_has_spare|' scenario,  like Mel’s patch, to refine our llc 
migration decisions.

just like this:  8 cpus in one LLC, LLC-A has 6 tasks,  LLC-B has 2 
tasks, if LLC-A has task_a need to migrate to LLC_B, how to deal it ?

> Tim
>
>>> +		    cpu_to_node(mm->mm_sched_cpu) != p->numa_preferred_nid)
>>> +			mm_sched_llc = -1;
>>> +#endif
>>> +	}
>>> +
>>> +	if (p->preferred_llc != mm_sched_llc)
>>> +		p->preferred_llc = mm_sched_llc;
>>>    }
>>>    
>>>    static void task_tick_cache(struct rq *rq, struct task_struct *p)

Re: [PATCH v2 05/23] sched/cache: Assign preferred LLC ID to processes

Posted by Chen, Yu C 1 month, 2 weeks ago

On 12/19/2025 12:01 PM, Vern Hao wrote:
> 
> On 2025/12/16 03:32, Tim Chen wrote:
>> On Fri, 2025-12-12 at 11:34 +0800, Vern Hao wrote:
>>> On 2025/12/4 07:07, Tim Chen wrote:
>>>> With cache-aware scheduling enabled, each task is assigned a
>>>> preferred LLC ID. This allows quick identification of the LLC domain
>>>> where the task prefers to run, similar to numa_preferred_nid in
>>>> NUMA balancing.
>>>>

[snip]

>>>> +
>>>> +    if (mm->mm_sched_cpu != -1) {
>>>> +        mm_sched_llc = llc_id(mm->mm_sched_cpu);
>>>> +
>>>> +#ifdef CONFIG_NUMA_BALANCING
>>>> +        /*
>>>> +         * Don't assign preferred LLC if it
>>>> +         * conflicts with NUMA balancing.
>>>> +         */
>>>> +        if (p->numa_preferred_nid >= 0 &&
>>> I wonder if the restriction here shouldn't be so strict. In Mel Gorman's
>>> patch (e496132ebedd sched/fair: Adjust the allowed NUMA imbalance when
>>> SD_NUMA spans multiple LLCs), the value of the 'imb_numa_nr' is checked
>>> to determine if |SD_NUMA| imbalance is allowed. Could we use this same
>>> check to decide whether or not to perform a cross-numa migration?
>> If we set the preferred LLC that's in a different node other than the 
>> preferred
>> node, the preferred LLC is going to fight with NUMA balancing and bounce
>> tasks back and forth between nodes. NUMA locality is going to affect 
>> performance
>> more so we'll let NUMA preference take precedence.
> 
> I might not have explained myself clearly. I’m questioning whether we 
> need to integrate an imbalance check into the 'sgs->group_type|' is 
> ’group_has_spare|' scenario,  like Mel’s patch, to refine our llc 
> migration decisions.
> 
> just like this:  8 cpus in one LLC, LLC-A has 6 tasks,  LLC-B has 2 
> tasks, if LLC-A has task_a need to migrate to LLC_B, how to deal it ?
> 

If LLC_B is the preferred LLC of task_a, and if the average utilization of
LLC_B has not reached 50%, task_a will be moved to LLC_B. If LLC_A is the
preferred LLC of task_a, then if LLC_A has not reached 50%, task_a will
not be migrated to LLC_B. There are some comments around can_migrate_llc(),
which describe the decision matrix for migration.

thanks,
Chenyu

Re: [PATCH v2 05/23] sched/cache: Assign preferred LLC ID to processes

Posted by Peter Zijlstra 2 months ago

On Wed, Dec 03, 2025 at 03:07:24PM -0800, Tim Chen wrote:
> With cache-aware scheduling enabled, each task is assigned a
> preferred LLC ID. This allows quick identification of the LLC domain
> where the task prefers to run, similar to numa_preferred_nid in
> NUMA balancing.

> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 0a3918269906..10cec83f65d5 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -1300,6 +1300,7 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
>  	struct mm_struct *mm = p->mm;
>  	struct mm_sched *pcpu_sched;
>  	unsigned long epoch;
> +	int mm_sched_llc = -1;
>  
>  	if (!sched_cache_enabled())
>  		return;
> @@ -1330,6 +1331,23 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
>  		if (mm->mm_sched_cpu != -1)
>  			mm->mm_sched_cpu = -1;
>  	}
> +
> +	if (mm->mm_sched_cpu != -1) {
> +		mm_sched_llc = llc_id(mm->mm_sched_cpu);
> +
> +#ifdef CONFIG_NUMA_BALANCING
> +		/*
> +		 * Don't assign preferred LLC if it
> +		 * conflicts with NUMA balancing.
> +		 */
> +		if (p->numa_preferred_nid >= 0 &&
> +		    cpu_to_node(mm->mm_sched_cpu) != p->numa_preferred_nid)
> +			mm_sched_llc = -1;
> +#endif
> +	}
> +
> +	if (p->preferred_llc != mm_sched_llc)
> +		p->preferred_llc = mm_sched_llc;
>  }

This can of course still happen when sched_setnuma() gets called. I'm
thinking it is not much of an issue because we expect this thing to get
called fairly regularly -- at a higher rate than sched_setnuma() at
least -- and thus the conflict only exists for a short period of time?

If so, that would make for a good comment.

Additionally, we could of course search for the busiest LLC inside the
node, instead of setting -1. Again, that could live as a comment for
future work.

Re: [PATCH v2 05/23] sched/cache: Assign preferred LLC ID to processes

Posted by Tim Chen 2 months ago

On Tue, 2025-12-09 at 13:11 +0100, Peter Zijlstra wrote:
> On Wed, Dec 03, 2025 at 03:07:24PM -0800, Tim Chen wrote:
> > With cache-aware scheduling enabled, each task is assigned a
> > preferred LLC ID. This allows quick identification of the LLC domain
> > where the task prefers to run, similar to numa_preferred_nid in
> > NUMA balancing.
> 
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index 0a3918269906..10cec83f65d5 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -1300,6 +1300,7 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
> >  	struct mm_struct *mm = p->mm;
> >  	struct mm_sched *pcpu_sched;
> >  	unsigned long epoch;
> > +	int mm_sched_llc = -1;
> >  
> >  	if (!sched_cache_enabled())
> >  		return;
> > @@ -1330,6 +1331,23 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
> >  		if (mm->mm_sched_cpu != -1)
> >  			mm->mm_sched_cpu = -1;
> >  	}
> > +
> > +	if (mm->mm_sched_cpu != -1) {
> > +		mm_sched_llc = llc_id(mm->mm_sched_cpu);
> > +
> > +#ifdef CONFIG_NUMA_BALANCING
> > +		/*
> > +		 * Don't assign preferred LLC if it
> > +		 * conflicts with NUMA balancing.
> > +		 */
> > +		if (p->numa_preferred_nid >= 0 &&
> > +		    cpu_to_node(mm->mm_sched_cpu) != p->numa_preferred_nid)
> > +			mm_sched_llc = -1;
> > +#endif
> > +	}
> > +
> > +	if (p->preferred_llc != mm_sched_llc)
> > +		p->preferred_llc = mm_sched_llc;
> >  }
> 
> This can of course still happen when sched_setnuma() gets called. I'm
> thinking it is not much of an issue because we expect this thing to get
> called fairly regularly -- at a higher rate than sched_setnuma() at
> least -- and thus the conflict only exists for a short period of time?
> 
> If so, that would make for a good comment.

Sure.  Will do.

> 
> Additionally, we could of course search for the busiest LLC inside the
> node, instead of setting -1. Again, that could live as a comment for
> future work.

A potential issue with scanning only the preferred node of a single task 
is that tasks within the same process may have different preferred nodes.
For example, task 1 may prefer one node, while tasks 2…n prefer another. 
If we base the busiest-LLC scan solely on task 1’s preference, we may 
ignore the preferences of tasks 2…n. Consequently, constraining 
the preferred LLC according to task 1’s node can interfere with 
NUMA balancing for the rest of the process. This problem does not 
arise when all tasks being aggregated belong to the same numa_group, 
since they will share the same preferred node.

Tim