For each runqueue, track the number of tasks with an LLC preference
and how many of them are running on their preferred LLC. This mirrors
nr_numa_running and nr_preferred_running for NUMA balancing, and will
be used by cache-aware load balancing in later patches.
Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
---
Notes:
v1->v2: Invoke task_of() once and reuse its result afterwards.
(Peter Zijlstra)
Remove the hacky reset_llc_stats() and introduce a sched_llc_active flag
to properly pair the enqueue/dequeue statistics updates (Peter Zijlstra, K Prateek Nayak)
include/linux/sched.h | 2 ++
init/init_task.c | 1 +
kernel/sched/core.c | 5 ++++
kernel/sched/fair.c | 60 ++++++++++++++++++++++++++++++++++++++++---
kernel/sched/sched.h | 6 +++++
5 files changed, 71 insertions(+), 3 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1ad46220cd04..466ba8b7398c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1408,6 +1408,8 @@ struct task_struct {
#ifdef CONFIG_SCHED_CACHE
struct callback_head cache_work;
+ /*the p is currently refcounted in a rq's preferred llc stats*/
+ bool sched_llc_active;
int preferred_llc;
#endif
diff --git a/init/init_task.c b/init/init_task.c
index 44bae72b5b7d..ee78837b0aa2 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -192,6 +192,7 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = {
.numa_faults = NULL,
#endif
#ifdef CONFIG_SCHED_CACHE
+ .sched_llc_active = false,
.preferred_llc = -1,
#endif
#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e8bdf03a4b7f..48626c81ba8e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -531,6 +531,11 @@ void __trace_set_current_state(int state_value)
}
EXPORT_SYMBOL(__trace_set_current_state);
+int task_llc(const struct task_struct *p)
+{
+ return per_cpu(sd_llc_id, task_cpu(p));
+}
+
/*
* Serialization rules:
*
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 10cec83f65d5..d46a70a9d9fb 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1223,6 +1223,43 @@ static int llc_id(int cpu)
return llc;
}
+static void account_llc_enqueue(struct rq *rq, struct task_struct *p)
+{
+ int pref_llc;
+
+ if (!sched_cache_enabled())
+ return;
+
+ pref_llc = p->preferred_llc;
+ if (pref_llc < 0)
+ return;
+
+ rq->nr_llc_running++;
+ rq->nr_pref_llc_running += (pref_llc == task_llc(p));
+ p->sched_llc_active = true;
+}
+
+static void account_llc_dequeue(struct rq *rq, struct task_struct *p)
+{
+ int pref_llc;
+
+ /*
+ * Borrow the uc_se->active from uclamp_rq_inc_id(),
+ * uclamp_rq_dec_id() to avoid the unbalanced calculation
+ * of rq statistics.
+ */
+ if (unlikely(!p->sched_llc_active))
+ return;
+
+ pref_llc = p->preferred_llc;
+ if (pref_llc < 0)
+ return;
+
+ rq->nr_llc_running--;
+ rq->nr_pref_llc_running -= (pref_llc == task_llc(p));
+ p->sched_llc_active = false;
+}
+
void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *_pcpu_sched)
{
unsigned long epoch;
@@ -1294,6 +1331,8 @@ static unsigned long __no_profile fraction_mm_sched(struct rq *rq, struct mm_sch
return div64_u64(NICE_0_LOAD * pcpu_sched->runtime, rq->cpu_runtime + 1);
}
+static unsigned int task_running_on_cpu(int cpu, struct task_struct *p);
+
static inline
void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
{
@@ -1346,8 +1385,13 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
#endif
}
- if (p->preferred_llc != mm_sched_llc)
+ /* task not on rq accounted later in account_entity_enqueue() */
+ if (task_running_on_cpu(rq->cpu, p) &&
+ p->preferred_llc != mm_sched_llc) {
+ account_llc_dequeue(rq, p);
p->preferred_llc = mm_sched_llc;
+ account_llc_enqueue(rq, p);
+ }
}
static void task_tick_cache(struct rq *rq, struct task_struct *p)
@@ -1475,6 +1519,10 @@ void init_sched_mm(struct task_struct *p) { }
static void task_tick_cache(struct rq *rq, struct task_struct *p) { }
+static void account_llc_enqueue(struct rq *rq, struct task_struct *p) {}
+
+static void account_llc_dequeue(struct rq *rq, struct task_struct *p) {}
+
#endif
/*
@@ -3965,9 +4013,11 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
update_load_add(&cfs_rq->load, se->load.weight);
if (entity_is_task(se)) {
+ struct task_struct *p = task_of(se);
struct rq *rq = rq_of(cfs_rq);
- account_numa_enqueue(rq, task_of(se));
+ account_numa_enqueue(rq, p);
+ account_llc_enqueue(rq, p);
list_add(&se->group_node, &rq->cfs_tasks);
}
cfs_rq->nr_queued++;
@@ -3978,7 +4028,11 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
update_load_sub(&cfs_rq->load, se->load.weight);
if (entity_is_task(se)) {
- account_numa_dequeue(rq_of(cfs_rq), task_of(se));
+ struct task_struct *p = task_of(se);
+ struct rq *rq = rq_of(cfs_rq);
+
+ account_numa_dequeue(rq, p);
+ account_llc_dequeue(rq, p);
list_del_init(&se->group_node);
}
cfs_rq->nr_queued--;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 728737641847..ee8b70647835 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1126,6 +1126,10 @@ struct rq {
unsigned int nr_preferred_running;
unsigned int numa_migrate_on;
#endif
+#ifdef CONFIG_SCHED_CACHE
+ unsigned int nr_pref_llc_running;
+ unsigned int nr_llc_running;
+#endif
#ifdef CONFIG_NO_HZ_COMMON
unsigned long last_blocked_load_update_tick;
unsigned int has_blocked_load;
@@ -1980,6 +1984,8 @@ init_numa_balancing(u64 clone_flags, struct task_struct *p)
#endif /* !CONFIG_NUMA_BALANCING */
+int task_llc(const struct task_struct *p);
+
static inline void
queue_balance_callback(struct rq *rq,
struct balance_callback *head,
--
2.32.0
On 2025/12/4 07:07, Tim Chen wrote:
> @@ -1346,8 +1385,13 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
> #endif
> }
>
> - if (p->preferred_llc != mm_sched_llc)
> + /* task not on rq accounted later in account_entity_enqueue() */
> + if (task_running_on_cpu(rq->cpu, p) &&
> + p->preferred_llc != mm_sched_llc) {
#ifdef CONFIG_NUMA_BALANCING
                /*
                 * Don't assign preferred LLC if it
                 * conflicts with NUMA balancing.
                 */
                if (p->numa_preferred_nid >= 0 &&
                    cpu_to_node(mm->mm_sched_cpu) != p->numa_preferred_nid)
                        mm_sched_llc = -1;
#endif
        }

        /* task not on rq accounted later in account_entity_enqueue() */
        if (task_running_on_cpu(rq->cpu, p) &&
            p->preferred_llc != mm_sched_llc) {
                account_llc_dequeue(rq, p);
                p->preferred_llc = mm_sched_llc;
                account_llc_enqueue(rq, p);
        }
I am a little concerned that there might be cases where both
p->preferred_llc and mm_sched_llc are equal to -1 at this point.
Is it necessary to add a check here?
> + account_llc_dequeue(rq, p);
> p->preferred_llc = mm_sched_llc;
> + account_llc_enqueue(rq, p);
> + }
On 12/17/2025 6:04 PM, Vern Hao wrote:
>
> On 2025/12/4 07:07, Tim Chen wrote:
>> @@ -1346,8 +1385,13 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
>> #endif
>> }
>> - if (p->preferred_llc != mm_sched_llc)
>> + /* task not on rq accounted later in account_entity_enqueue() */
>> + if (task_running_on_cpu(rq->cpu, p) &&
>> + p->preferred_llc != mm_sched_llc) {
>> #ifdef CONFIG_NUMA_BALANCING
>> /*
>> * Don't assign preferred LLC if it
>> * conflicts with NUMA balancing.
>> */
>> if (p->numa_preferred_nid >= 0 &&
>> cpu_to_node(mm->mm_sched_cpu) != p->numa_preferred_nid)
>> mm_sched_llc = -1;
>> #endif
>> }
>>
>> /* task not on rq accounted later in account_entity_enqueue() */
>> if (task_running_on_cpu(rq->cpu, p) &&
>> p->preferred_llc != mm_sched_llc) {
>> account_llc_dequeue(rq, p);
>> p->preferred_llc = mm_sched_llc;
>> account_llc_enqueue(rq, p);
>>
>> }
>>
> I am a little concerned that there might be cases where both
> p->preferred_llc and mm_sched_llc are equal to -1 at this point.
> Is it necessary to add a check here?
>
Are you concerned about a mismatch in the per-runqueue statistics
nr_pref_llc_running, nr_pref_llc, and nr_llc_running? That should not
be an issue, because account_llc_dequeue() and account_llc_enqueue() are
always invoked together in account_mm_sched(). If p->preferred_llc ==
mm_sched_llc == -1, neither of them is invoked, so the accounting
stays paired.
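To make the pairing explicit, here is a stripped-down view of that path
(simplified from the hunk quoted above; the sched_cache_enabled() and
sched_llc_active handling is omitted):

        /* account_mm_sched(), simplified: */
        if (task_running_on_cpu(rq->cpu, p) &&
            p->preferred_llc != mm_sched_llc) {     /* both -1: nothing to do */
                /* drops the old preference; returns early if it was < 0 */
                account_llc_dequeue(rq, p);
                p->preferred_llc = mm_sched_llc;
                /* accounts the new preference; returns early if it is < 0 */
                account_llc_enqueue(rq, p);
        }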
Please let me know if I understand your comments correctly.
thanks,
Chenyu
On Wed, Dec 03, 2025 at 03:07:25PM -0800, Tim Chen wrote:
> #ifdef CONFIG_SCHED_CACHE
> struct callback_head cache_work;
> + /*the p is currently refcounted in a rq's preferred llc stats*/
Shall we have spaces after and before the comment marks?
Also, comment confuses me, I don't see get_task_struct() /
put_task_struct() usage. Did you mean something else with refcount?
> + bool sched_llc_active;
> int preferred_llc;
> #endif
> +static void account_llc_dequeue(struct rq *rq, struct task_struct *p)
> +{
> + int pref_llc;
> +
> + /*
> + * Borrow the uc_se->active from uclamp_rq_inc_id(),
> + * uclamp_rq_dec_id() to avoid the unbalanced calculation
> + * of rq statistics.
> + */
> + if (unlikely(!p->sched_llc_active))
> + return;
Another very confusing comment; what? Also, can you please explain (in
the new comment) how we get here without having llc_active set?
> +
> + pref_llc = p->preferred_llc;
> + if (pref_llc < 0)
> + return;
> +
> + rq->nr_llc_running--;
> + rq->nr_pref_llc_running -= (pref_llc == task_llc(p));
> + p->sched_llc_active = false;
> +}
On Tue, 2025-12-09 at 13:16 +0100, Peter Zijlstra wrote:
> On Wed, Dec 03, 2025 at 03:07:25PM -0800, Tim Chen wrote:
>
>
> > #ifdef CONFIG_SCHED_CACHE
> > struct callback_head cache_work;
> > + /*the p is currently refcounted in a rq's preferred llc stats*/
>
> Shall we have spaces after and before the comment marks?
>
> Also, comment confuses me, I don't see get_task_struct() /
> put_task_struct() usage. Did you mean something else with refcount?
It is the accounting of the number of tasks preferring a certain LLC
on a runqueue, updated at enqueue/dequeue or when a task's LLC
preference changes, via account_llc_enqueue() and account_llc_dequeue().
How about changing the comment to
/* LLC preference accounting should be done in dequeue */
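That is, the struct task_struct hunk would then read roughly:

#ifdef CONFIG_SCHED_CACHE
        struct callback_head            cache_work;
        /* LLC preference accounting should be done in dequeue */
        bool                            sched_llc_active;
        int                             preferred_llc;
#endif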
>
> > + bool sched_llc_active;
> > int preferred_llc;
> > #endif
>
> > +static void account_llc_dequeue(struct rq *rq, struct task_struct *p)
> > +{
> > + int pref_llc;
> > +
> > + /*
> > + * Borrow the uc_se->active from uclamp_rq_inc_id(),
> > + * uclamp_rq_dec_id() to avoid the unbalanced calculation
> > + * of rq statistics.
> > + */
> > + if (unlikely(!p->sched_llc_active))
> > + return;
>
> Another very confusing comment; what? Also, can you please explain (in
> the new comment) how we get here without having llc_active set?
The comment meant to say that we are using a mechanism similar to the
uc_se->active accounting in uclamp_rq_inc_id()/uclamp_rq_dec_id(). I
agree that it confuses more than it clarifies.
How about the following comment to make things clearer:
/*
* Cache aware scheduling was active when the task was enqueued.
* Admin has disabled cache aware scheduling before task was dequeued
* but the accounting has to be kept straight in case cache aware scheduling
* is re-enabled.
*/
>
> > +
> > + pref_llc = p->preferred_llc;
> > + if (pref_llc < 0)
> > + return;
> > +
> > + rq->nr_llc_running--;
> > + rq->nr_pref_llc_running -= (pref_llc == task_llc(p));
> > + p->sched_llc_active = false;
> > +}
On Tue, Dec 09, 2025 at 02:55:21PM -0800, Tim Chen wrote:
> > > +static void account_llc_dequeue(struct rq *rq, struct task_struct *p)
> > > +{
> > > + int pref_llc;
> > > +
> > > + /*
> > > + * Borrow the uc_se->active from uclamp_rq_inc_id(),
> > > + * uclamp_rq_dec_id() to avoid the unbalanced calculation
> > > + * of rq statistics.
> > > + */
> > > + if (unlikely(!p->sched_llc_active))
> > > + return;
> >
> > Another very confusing comment; what? Also, can you please explain (in
> > the new comment) how we get here without having llc_active set?
>
> The comment meant to say that we are using a similar mechanism as
> accounting done in uc_se->active from uclamp_rq_inc_id(). I agree that
> it confuses more than making things clearer.
>
> How about the following comment to make things clearer:
>
> /*
> * Cache aware scheduling was active when the task was enqueued.
> * Admin has disabled cache aware scheduling before task was dequeued
> * but the accounting has to be kept straight in case cache aware scheduling
> * is re-enabled.
> */
Is having that sched_cache_enabled() test worth it?
account_numa_{en,de}queue() don't seem to have any of this.
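For reference, account_numa_enqueue() is roughly:

static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
{
        rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE);
        rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
}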
> > > + pref_llc = p->preferred_llc;
> > > + if (pref_llc < 0)
> > > + return;
> > > +
> > > + rq->nr_llc_running--;
> > > + rq->nr_pref_llc_running -= (pref_llc == task_llc(p));
> > > + p->sched_llc_active = false;
> > > +}
On 12/10/2025 5:42 PM, Peter Zijlstra wrote:
> On Tue, Dec 09, 2025 at 02:55:21PM -0800, Tim Chen wrote:
>
>>>> +static void account_llc_dequeue(struct rq *rq, struct task_struct *p)
>>>> +{
>>>> + int pref_llc;
>>>> +
>>>> + /*
>>>> + * Borrow the uc_se->active from uclamp_rq_inc_id(),
>>>> + * uclamp_rq_dec_id() to avoid the unbalanced calculation
>>>> + * of rq statistics.
>>>> + */
>>>> + if (unlikely(!p->sched_llc_active))
>>>> + return;
>>>
>>> Another very confusing comment; what? Also, can you please explain (in
>>> the new comment) how we get here without having llc_active set?
>>
>> The comment meant to say that we are using a similar mechanism as
>> accounting done in uc_se->active from uclamp_rq_inc_id(). I agree that
>> it confuses more than making things clearer.
>>
>> How about the following comment to make things clearer:
>>
>> /*
>> * Cache aware scheduling was active when the task was enqueued.
>> * Admin has disabled cache aware scheduling before task was dequeued
>> * but the accounting has to be kept straight in case cache aware scheduling
>> * is re-enabled.
>> */
>
> Is having that sched_cache_enabled() test worth it?
> account_numa_{en,de}queue() don't seem to have any of this.
>
>
OK, I think we can remove the sched_cache_enabled() check and
make account_llc_{en,de}queue() depend on CONFIG_SCHED_CACHE,
so that sched_llc_active can be removed.
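Something like the below (untested sketch, mirroring account_numa_{en,de}queue()):

#ifdef CONFIG_SCHED_CACHE
static void account_llc_enqueue(struct rq *rq, struct task_struct *p)
{
        /* No LLC preference recorded for this task, nothing to count. */
        if (p->preferred_llc < 0)
                return;

        rq->nr_llc_running++;
        rq->nr_pref_llc_running += (p->preferred_llc == task_llc(p));
}

static void account_llc_dequeue(struct rq *rq, struct task_struct *p)
{
        if (p->preferred_llc < 0)
                return;

        rq->nr_llc_running--;
        rq->nr_pref_llc_running -= (p->preferred_llc == task_llc(p));
}
#else
static void account_llc_enqueue(struct rq *rq, struct task_struct *p) {}
static void account_llc_dequeue(struct rq *rq, struct task_struct *p) {}
#endif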
thanks,
Chenyu