[PATCH v3 08/21] sched/cache: Calculate the percpu sd task LLC preference

Tim Chen posted 21 patches 1 month, 2 weeks ago
[PATCH v3 08/21] sched/cache: Calculate the percpu sd task LLC preference
Posted by Tim Chen 1 month, 2 weeks ago
Calculate the number of tasks' LLC preferences for each runqueue.
This statistic is computed during task enqueue and dequeue
operations, and is used by cache-aware load balancing.

Co-developed-by: Chen Yu <yu.c.chen@intel.com>
Signed-off-by: Chen Yu <yu.c.chen@intel.com>
Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
---

Notes:
    v2->v3: Move the max_llcs check from patch 4 to this patch.
    This clarifies the rationale for the
    max_llcs check and makes review easier (Peter Zijlstra).

 kernel/sched/fair.c | 56 +++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 54 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6ad9ad2f918f..4a98aa866d65 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1199,28 +1199,80 @@ static int llc_id(int cpu)
 	return per_cpu(sd_llc_id, cpu);
 }
 
+static inline bool valid_llc_id(int id)
+{
+	if (unlikely(id < 0 || id >= max_llcs))
+		return false;
+
+	return true;
+}
+
+static inline bool valid_llc_buf(struct sched_domain *sd,
+				 int id)
+{
+	/*
+	 * The check for sd and its corresponding pf is to
+	 * confirm that the sd->pf[] has been allocated in
+	 * build_sched_domains() after the assignment of
+	 * per_cpu(sd_llc_id, i). This is used to avoid
+	 * the race condition.
+	 */
+	if (unlikely(!sd || !sd->pf))
+		return false;
+
+	return valid_llc_id(id);
+}
+
 static void account_llc_enqueue(struct rq *rq, struct task_struct *p)
 {
+	struct sched_domain *sd;
 	int pref_llc;
 
 	pref_llc = p->preferred_llc;
-	if (pref_llc < 0)
+	if (!valid_llc_id(pref_llc))
 		return;
 
 	rq->nr_llc_running++;
 	rq->nr_pref_llc_running += (pref_llc == task_llc(p));
+
+	scoped_guard (rcu) {
+		sd = rcu_dereference(rq->sd);
+		if (valid_llc_buf(sd, pref_llc))
+			sd->pf[pref_llc]++;
+	}
 }
 
 static void account_llc_dequeue(struct rq *rq, struct task_struct *p)
 {
+	struct sched_domain *sd;
 	int pref_llc;
 
 	pref_llc = p->preferred_llc;
-	if (pref_llc < 0)
+	if (!valid_llc_id(pref_llc))
 		return;
 
 	rq->nr_llc_running--;
 	rq->nr_pref_llc_running -= (pref_llc == task_llc(p));
+
+	scoped_guard (rcu) {
+		sd = rcu_dereference(rq->sd);
+		if (valid_llc_buf(sd, pref_llc)) {
+			/*
+			 * There is a race condition between dequeue
+			 * and CPU hotplug. After a task has been enqueued
+			 * on CPUx, a CPU hotplug event occurs, and all online
+			 * CPUs (including CPUx) rebuild their sched_domains
+			 * and reset statistics to zero (including sd->pf).
+			 * This can cause temporary undercount and we have to
+			 * check for such underflow in sd->pf.
+			 *
+			 * This undercount is temporary and accurate accounting
+			 * will resume once the rq has a chance to be idle.
+			 */
+			if (sd->pf[pref_llc])
+				sd->pf[pref_llc]--;
+		}
+	}
 }
 
 void mm_init_sched(struct mm_struct *mm,
-- 
2.32.0
Re: [PATCH v3 08/21] sched/cache: Calculate the percpu sd task LLC preference
Posted by Peter Zijlstra 1 month, 1 week ago
On Tue, Feb 10, 2026 at 02:18:48PM -0800, Tim Chen wrote:
>  static void account_llc_enqueue(struct rq *rq, struct task_struct *p)
>  {
> +	struct sched_domain *sd;
>  	int pref_llc;
>  
>  	pref_llc = p->preferred_llc;
> -	if (pref_llc < 0)
> +	if (!valid_llc_id(pref_llc))
>  		return;
>  
>  	rq->nr_llc_running++;
>  	rq->nr_pref_llc_running += (pref_llc == task_llc(p));
> +
> +	scoped_guard (rcu) {
> +		sd = rcu_dereference(rq->sd);
> +		if (valid_llc_buf(sd, pref_llc))
> +			sd->pf[pref_llc]++;
> +	}
>  }
>  
>  static void account_llc_dequeue(struct rq *rq, struct task_struct *p)
>  {
> +	struct sched_domain *sd;
>  	int pref_llc;
>  
>  	pref_llc = p->preferred_llc;
> -	if (pref_llc < 0)
> +	if (!valid_llc_id(pref_llc))
>  		return;
>  
>  	rq->nr_llc_running--;
>  	rq->nr_pref_llc_running -= (pref_llc == task_llc(p));
> +
> +	scoped_guard (rcu) {
> +		sd = rcu_dereference(rq->sd);
> +		if (valid_llc_buf(sd, pref_llc)) {
> +			/*
> +			 * There is a race condition between dequeue
> +			 * and CPU hotplug. After a task has been enqueued
> +			 * on CPUx, a CPU hotplug event occurs, and all online
> +			 * CPUs (including CPUx) rebuild their sched_domains
> +			 * and reset statistics to zero (including sd->pf).
> +			 * This can cause temporary undercount and we have to
> +			 * check for such underflow in sd->pf.
> +			 *
> +			 * This undercount is temporary and accurate accounting
> +			 * will resume once the rq has a chance to be idle.
> +			 */
> +			if (sd->pf[pref_llc])
> +				sd->pf[pref_llc]--;
> +		}
> +	}
>  }

FWIW, enqueue/dequeue must be with rq->lock held, and thus preemption
disabled and IRQs off. That RCU section is completely pointless.
Re: [PATCH v3 08/21] sched/cache: Calculate the percpu sd task LLC preference
Posted by Peter Zijlstra 1 month, 1 week ago
On Fri, Feb 20, 2026 at 12:02:22PM +0100, Peter Zijlstra wrote:
> On Tue, Feb 10, 2026 at 02:18:48PM -0800, Tim Chen wrote:
> >  static void account_llc_enqueue(struct rq *rq, struct task_struct *p)
> >  {
> > +	struct sched_domain *sd;
> >  	int pref_llc;
> >  
> >  	pref_llc = p->preferred_llc;
> > -	if (pref_llc < 0)
> > +	if (!valid_llc_id(pref_llc))
> >  		return;
> >  
> >  	rq->nr_llc_running++;
> >  	rq->nr_pref_llc_running += (pref_llc == task_llc(p));
> > +
> > +	scoped_guard (rcu) {
> > +		sd = rcu_dereference(rq->sd);
> > +		if (valid_llc_buf(sd, pref_llc))
> > +			sd->pf[pref_llc]++;
> > +	}
> >  }
> >  
> >  static void account_llc_dequeue(struct rq *rq, struct task_struct *p)
> >  {
> > +	struct sched_domain *sd;
> >  	int pref_llc;
> >  
> >  	pref_llc = p->preferred_llc;
> > -	if (pref_llc < 0)
> > +	if (!valid_llc_id(pref_llc))
> >  		return;
> >  
> >  	rq->nr_llc_running--;
> >  	rq->nr_pref_llc_running -= (pref_llc == task_llc(p));
> > +
> > +	scoped_guard (rcu) {
> > +		sd = rcu_dereference(rq->sd);
> > +		if (valid_llc_buf(sd, pref_llc)) {
> > +			/*
> > +			 * There is a race condition between dequeue
> > +			 * and CPU hotplug. After a task has been enqueued
> > +			 * on CPUx, a CPU hotplug event occurs, and all online
> > +			 * CPUs (including CPUx) rebuild their sched_domains
> > +			 * and reset statistics to zero (including sd->pf).
> > +			 * This can cause temporary undercount and we have to
> > +			 * check for such underflow in sd->pf.
> > +			 *
> > +			 * This undercount is temporary and accurate accounting
> > +			 * will resume once the rq has a chance to be idle.
> > +			 */
> > +			if (sd->pf[pref_llc])
> > +				sd->pf[pref_llc]--;
> > +		}
> > +	}
> >  }
> 
> FWIW, enqueue/dequeue must be with rq->lock held, and thus preemption
> disabled and IRQs off. That RCU section is completely pointless.

That is, use rcu_dereference_all() and observe the warning go away.
Re: [PATCH v3 08/21] sched/cache: Calculate the percpu sd task LLC preference
Posted by Chen, Yu C 1 month, 1 week ago
On 2/20/2026 10:02 PM, Peter Zijlstra wrote:
> On Fri, Feb 20, 2026 at 12:02:22PM +0100, Peter Zijlstra wrote:
>> On Tue, Feb 10, 2026 at 02:18:48PM -0800, Tim Chen wrote:
>>>   static void account_llc_enqueue(struct rq *rq, struct task_struct *p)
>>>   {
>>> +	struct sched_domain *sd;
>>>   	int pref_llc;
>>>   
>>>   	pref_llc = p->preferred_llc;
>>> -	if (pref_llc < 0)
>>> +	if (!valid_llc_id(pref_llc))
>>>   		return;
>>>   
>>>   	rq->nr_llc_running++;
>>>   	rq->nr_pref_llc_running += (pref_llc == task_llc(p));
>>> +
>>> +	scoped_guard (rcu) {
>>> +		sd = rcu_dereference(rq->sd);
>>> +		if (valid_llc_buf(sd, pref_llc))
>>> +			sd->pf[pref_llc]++;
>>> +	}
>>>   }
>>>   
>>>   static void account_llc_dequeue(struct rq *rq, struct task_struct *p)
>>>   {
>>> +	struct sched_domain *sd;
>>>   	int pref_llc;
>>>   
>>>   	pref_llc = p->preferred_llc;
>>> -	if (pref_llc < 0)
>>> +	if (!valid_llc_id(pref_llc))
>>>   		return;
>>>   
>>>   	rq->nr_llc_running--;
>>>   	rq->nr_pref_llc_running -= (pref_llc == task_llc(p));
>>> +
>>> +	scoped_guard (rcu) {
>>> +		sd = rcu_dereference(rq->sd);
>>> +		if (valid_llc_buf(sd, pref_llc)) {
>>> +			/*
>>> +			 * There is a race condition between dequeue
>>> +			 * and CPU hotplug. After a task has been enqueued
>>> +			 * on CPUx, a CPU hotplug event occurs, and all online
>>> +			 * CPUs (including CPUx) rebuild their sched_domains
>>> +			 * and reset statistics to zero (including sd->pf).
>>> +			 * This can cause temporary undercount and we have to
>>> +			 * check for such underflow in sd->pf.
>>> +			 *
>>> +			 * This undercount is temporary and accurate accounting
>>> +			 * will resume once the rq has a chance to be idle.
>>> +			 */
>>> +			if (sd->pf[pref_llc])
>>> +				sd->pf[pref_llc]--;
>>> +		}
>>> +	}
>>>   }
>>
>> FWIW, enqueue/dequeue must be with rq->lock held, and thus preemption
>> disabled and IRQs off. That RCU section is completely pointless.
> 
> That is, use rcu_dereference_all() and observe the warning go away.

OK we will remove rcu_read_lock() and use rcu_dereference_all() directly.

thanks,
Chenyu