Introduce an index mapping between CPUs and their LLCs. This provides
a continuous per LLC index needed for cache-aware load balancing in
later patches.
The existing per_cpu llc_id usually points to the first CPU of the
LLC domain, which is sparse and unsuitable as an array index. Using
llc_id directly would waste memory.
With the new mapping, CPUs in the same LLC share a continuous id:
per_cpu(llc_id, CPU=0...15) = 0
per_cpu(llc_id, CPU=16...31) = 1
per_cpu(llc_id, CPU=32...47) = 2
...
Co-developed-by: Chen Yu <yu.c.chen@intel.com>
Signed-off-by: Chen Yu <yu.c.chen@intel.com>
Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
---
Notes:
v1->v2:
Convert the static LLC id to be allocated sequentially as LLCs are
discovered, and replace the old sd_llc_id. (Peter Zijlstra)
kernel/sched/fair.c | 9 ++++++-
kernel/sched/sched.h | 1 +
kernel/sched/topology.c | 60 +++++++++++++++++++++++++++++++++++++++--
3 files changed, 67 insertions(+), 3 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 710ed9943d27..0a3918269906 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1210,10 +1210,17 @@ __read_mostly unsigned int llc_imb_pct = 20;
static int llc_id(int cpu)
{
+ int llc;
+
if (cpu < 0)
return -1;
- return per_cpu(sd_llc_id, cpu);
+ llc = per_cpu(sd_llc_id, cpu);
+ /* avoid race with cpu hotplug */
+ if (unlikely(llc >= max_llcs))
+ return -1;
+
+ return llc;
}
void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *_pcpu_sched)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index bf72c5bab506..728737641847 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2075,6 +2075,7 @@ DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
extern struct static_key_false sched_asym_cpucapacity;
extern struct static_key_false sched_cluster_active;
+extern int max_llcs;
static __always_inline bool sched_asym_cpucap_active(void)
{
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 444bdfdab731..f25d950ab015 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -17,6 +17,8 @@ void sched_domains_mutex_unlock(void)
mutex_unlock(&sched_domains_mutex);
}
+int max_llcs;
+
/* Protected by sched_domains_mutex: */
static cpumask_var_t sched_domains_tmpmask;
static cpumask_var_t sched_domains_tmpmask2;
@@ -668,6 +670,55 @@ DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
DEFINE_STATIC_KEY_FALSE(sched_cluster_active);
+/*
+ * Assign continuous llc id for the CPU, and return
+ * the assigned llc id.
+ */
+static int update_llc_id(struct sched_domain *sd,
+ int cpu)
+{
+ int id = per_cpu(sd_llc_id, cpu), i;
+
+ if (id >= 0)
+ return id;
+
+ if (sd) {
+ /* Look for any assigned id and reuse it.*/
+ for_each_cpu(i, sched_domain_span(sd)) {
+ id = per_cpu(sd_llc_id, i);
+
+ if (id >= 0) {
+ per_cpu(sd_llc_id, cpu) = id;
+ return id;
+ }
+ }
+ }
+
+ /*
+ * When 1. there is no id assigned to this LLC domain,
+ * or 2. the sd is NULL, we reach here.
+ * Consider the following scenario,
+ * CPU0~CPU95 are in the node0, CPU96~CPU191 are
+ * in the node1. During bootup, maxcpus=96 is
+ * appended.
+ * case 1: When running cpu_attach_domain(CPU24)
+ * during boot up, CPU24 is the first CPU in its
+ * non-NULL LLC domain. However,
+ * its corresponding llc id has not been assigned yet.
+ *
+ * case 2: After boot up, the CPU100 is brought up
+ * via sysfs manually. As a result, CPU100 has only a
+ * Numa domain attached, because CPU100 is the only CPU
+ * of a sched domain, all its bottom domains are degenerated.
+ * The LLC domain pointer sd is NULL for CPU100.
+ *
+ * For both cases, we want to increase the number of LLCs.
+ */
+ per_cpu(sd_llc_id, cpu) = max_llcs++;
+
+ return per_cpu(sd_llc_id, cpu);
+}
+
static void update_top_cache_domain(int cpu)
{
struct sched_domain_shared *sds = NULL;
@@ -677,14 +728,13 @@ static void update_top_cache_domain(int cpu)
sd = highest_flag_domain(cpu, SD_SHARE_LLC);
if (sd) {
- id = cpumask_first(sched_domain_span(sd));
size = cpumask_weight(sched_domain_span(sd));
sds = sd->shared;
}
rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
per_cpu(sd_llc_size, cpu) = size;
- per_cpu(sd_llc_id, cpu) = id;
+ id = update_llc_id(sd, cpu);
rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
sd = lowest_flag_domain(cpu, SD_CLUSTER);
@@ -2488,6 +2538,12 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
bool has_asym = false;
bool has_cluster = false;
+ /* first scan of LLCs */
+ if (!max_llcs) {
+ for_each_possible_cpu(i)
+ per_cpu(sd_llc_id, i) = -1;
+ }
+
if (WARN_ON(cpumask_empty(cpu_map)))
goto error;
--
2.32.0
On Wed, Dec 03, 2025 at 03:07:23PM -0800, Tim Chen wrote:
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 710ed9943d27..0a3918269906 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -1210,10 +1210,17 @@ __read_mostly unsigned int llc_imb_pct = 20;
>
> static int llc_id(int cpu)
> {
> + int llc;
> +
> if (cpu < 0)
> return -1;
>
> + llc = per_cpu(sd_llc_id, cpu);
> + /* avoid race with cpu hotplug */
> + if (unlikely(llc >= max_llcs))
> + return -1;
> +
> + return llc;
> }
>
> void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *_pcpu_sched)
> @@ -668,6 +670,55 @@ DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
> DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
> DEFINE_STATIC_KEY_FALSE(sched_cluster_active);
>
> +/*
> + * Assign continuous llc id for the CPU, and return
> + * the assigned llc id.
> + */
> +static int update_llc_id(struct sched_domain *sd,
> + int cpu)
> +{
> + int id = per_cpu(sd_llc_id, cpu), i;
> +
> + if (id >= 0)
> + return id;
> +
> + if (sd) {
> + /* Look for any assigned id and reuse it.*/
> + for_each_cpu(i, sched_domain_span(sd)) {
> + id = per_cpu(sd_llc_id, i);
> +
> + if (id >= 0) {
> + per_cpu(sd_llc_id, cpu) = id;
> + return id;
> + }
> + }
> + }
> +
> + /*
> + * When 1. there is no id assigned to this LLC domain,
> + * or 2. the sd is NULL, we reach here.
> + * Consider the following scenario,
> + * CPU0~CPU95 are in the node0, CPU96~CPU191 are
> + * in the node1. During bootup, maxcpus=96 is
> + * appended.
> + * case 1: When running cpu_attach_domain(CPU24)
> + * during boot up, CPU24 is the first CPU in its
> + * non-NULL LLC domain. However,
> + * its corresponding llc id has not been assigned yet.
> + *
> + * case 2: After boot up, the CPU100 is brought up
> + * via sysfs manually. As a result, CPU100 has only a
> + * Numa domain attached, because CPU100 is the only CPU
> + * of a sched domain, all its bottom domains are degenerated.
> + * The LLC domain pointer sd is NULL for CPU100.
> + *
> + * For both cases, we want to increase the number of LLCs.
> + */
> + per_cpu(sd_llc_id, cpu) = max_llcs++;
> +
> + return per_cpu(sd_llc_id, cpu);
> +}
I'm not sure I follow. So partition_sched_domains() first calls
detach_destroy_domains() on the old set, and then build_sched_domains()
on the new set.
So detach_destroy_domain() will do:
cpu_attach_domain(NULL,..);
That is, it will explicitly attach the NULL sched_domain to a CPU. At
which point I feel update_llc_id() should be returning -1, no?
Then later, build_sched_domains() will set a !NULL sched_domain, at
which point update_llc_id() can set a real value.
This should then also get rid of that weird max_llcs check in llc_id(),
right?
On Tue, 2025-12-09 at 12:58 +0100, Peter Zijlstra wrote:
> On Wed, Dec 03, 2025 at 03:07:23PM -0800, Tim Chen wrote:
>
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index 710ed9943d27..0a3918269906 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -1210,10 +1210,17 @@ __read_mostly unsigned int llc_imb_pct = 20;
> >
> > static int llc_id(int cpu)
> > {
> > + int llc;
> > +
> > if (cpu < 0)
> > return -1;
> >
> > + llc = per_cpu(sd_llc_id, cpu);
> > + /* avoid race with cpu hotplug */
> > + if (unlikely(llc >= max_llcs))
> > + return -1;
> > +
> > + return llc;
> > }
> >
> > void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *_pcpu_sched)
>
> > @@ -668,6 +670,55 @@ DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
> > DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
> > DEFINE_STATIC_KEY_FALSE(sched_cluster_active);
> >
> > +/*
> > + * Assign continuous llc id for the CPU, and return
> > + * the assigned llc id.
> > + */
> > +static int update_llc_id(struct sched_domain *sd,
> > + int cpu)
> > +{
> > + int id = per_cpu(sd_llc_id, cpu), i;
> > +
> > + if (id >= 0)
> > + return id;
> > +
> > + if (sd) {
> > + /* Look for any assigned id and reuse it.*/
> > + for_each_cpu(i, sched_domain_span(sd)) {
> > + id = per_cpu(sd_llc_id, i);
> > +
> > + if (id >= 0) {
> > + per_cpu(sd_llc_id, cpu) = id;
> > + return id;
> > + }
> > + }
> > + }
> > +
> > + /*
> > + * When 1. there is no id assigned to this LLC domain,
> > + * or 2. the sd is NULL, we reach here.
> > + * Consider the following scenario,
> > + * CPU0~CPU95 are in the node0, CPU96~CPU191 are
> > + * in the node1. During bootup, maxcpus=96 is
> > + * appended.
> > + * case 1: When running cpu_attach_domain(CPU24)
> > + * during boot up, CPU24 is the first CPU in its
> > + * non-NULL LLC domain. However,
> > + * its corresponding llc id has not been assigned yet.
> > + *
> > + * case 2: After boot up, the CPU100 is brought up
> > + * via sysfs manually. As a result, CPU100 has only a
> > + * Numa domain attached, because CPU100 is the only CPU
> > + * of a sched domain, all its bottom domains are degenerated.
> > + * The LLC domain pointer sd is NULL for CPU100.
> > + *
> > + * For both cases, we want to increase the number of LLCs.
> > + */
> > + per_cpu(sd_llc_id, cpu) = max_llcs++;
> > +
> > + return per_cpu(sd_llc_id, cpu);
> > +}
>
> I'm not sure I follow. So partition_sched_domains() first calls
> detach_destroy_domains() on the old set, and then build_sched_domains()
> on the new set.
>
> Do detach_destroy_domain() will do:
>
> cpu_attach_domain(NULL,..);
>
> That is, it will explicitly attach the NULL sched_domain to a CPU. At
> which point I feel update_llc_id() should be returning -1, no?
>
> Then later, build_sched_domains() will set a !NULL sched_domain, at
> which point update_llc_id() can set a real value.
>
> This should then also get rid of that weird max_llcs check in llc_id(),
> right?
Thanks for pointing this out. Yes, we should take care of the
attachment of NULL sd. Will update the code accordingly.
Tim
On 12/16/2025 4:49 AM, Tim Chen wrote:
> On Tue, 2025-12-09 at 12:58 +0100, Peter Zijlstra wrote:
>> On Wed, Dec 03, 2025 at 03:07:23PM -0800, Tim Chen wrote:
>>
>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>> index 710ed9943d27..0a3918269906 100644
>>> --- a/kernel/sched/fair.c
>>> +++ b/kernel/sched/fair.c
>>> @@ -1210,10 +1210,17 @@ __read_mostly unsigned int llc_imb_pct = 20;
>>>
>>> static int llc_id(int cpu)
>>> {
>>> + int llc;
>>> +
>>> if (cpu < 0)
>>> return -1;
>>>
>>> + llc = per_cpu(sd_llc_id, cpu);
>>> + /* avoid race with cpu hotplug */
>>> + if (unlikely(llc >= max_llcs))
>>> + return -1;
>>> +
>>> + return llc;
>>> }
>>>
>>> void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *_pcpu_sched)
>>
>>> @@ -668,6 +670,55 @@ DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
>>> DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
>>> DEFINE_STATIC_KEY_FALSE(sched_cluster_active);
>>>
>>> +/*
>>> + * Assign continuous llc id for the CPU, and return
>>> + * the assigned llc id.
>>> + */
>>> +static int update_llc_id(struct sched_domain *sd,
>>> + int cpu)
>>> +{
>>> + int id = per_cpu(sd_llc_id, cpu), i;
>>> +
>>> + if (id >= 0)
>>> + return id;
>>> +
>>> + if (sd) {
>>> + /* Look for any assigned id and reuse it.*/
>>> + for_each_cpu(i, sched_domain_span(sd)) {
>>> + id = per_cpu(sd_llc_id, i);
>>> +
>>> + if (id >= 0) {
>>> + per_cpu(sd_llc_id, cpu) = id;
>>> + return id;
>>> + }
>>> + }
>>> + }
>>> +
>>> + /*
>>> + * When 1. there is no id assigned to this LLC domain,
>>> + * or 2. the sd is NULL, we reach here.
>>> + * Consider the following scenario,
>>> + * CPU0~CPU95 are in the node0, CPU96~CPU191 are
>>> + * in the node1. During bootup, maxcpus=96 is
>>> + * appended.
>>> + * case 1: When running cpu_attach_domain(CPU24)
>>> + * during boot up, CPU24 is the first CPU in its
>>> + * non-NULL LLC domain. However,
>>> + * its corresponding llc id has not been assigned yet.
>>> + *
>>> + * case 2: After boot up, the CPU100 is brought up
>>> + * via sysfs manually. As a result, CPU100 has only a
>>> + * Numa domain attached, because CPU100 is the only CPU
>>> + * of a sched domain, all its bottom domains are degenerated.
>>> + * The LLC domain pointer sd is NULL for CPU100.
>>> + *
>>> + * For both cases, we want to increase the number of LLCs.
>>> + */
>>> + per_cpu(sd_llc_id, cpu) = max_llcs++;
>>> +
>>> + return per_cpu(sd_llc_id, cpu);
>>> +}
>>
>> I'm not sure I follow. So partition_sched_domains() first calls
>> detach_destroy_domains() on the old set, and then build_sched_domains()
>> on the new set.
>>
>> Do detach_destroy_domain() will do:
>>
>> cpu_attach_domain(NULL,..);
>>
>> That is, it will explicitly attach the NULL sched_domain to a CPU. At
>> which point I feel update_llc_id() should be returning -1, no?
>>
>> Then later, build_sched_domains() will set a !NULL sched_domain, at
>> which point update_llc_id() can set a real value.
>>
>> This should then also get rid of that weird max_llcs check in llc_id(),
>> right?
The check for max_llcs was intended to prevent out-of-bounds access
to rq->nr_pref_llc[] at multiple points in the code.
Since dst_llc = llc_id(env->dst_cpu); — and while the LLC ID for the
CPU is updated in update_llc_id(), this update occurs before we reallocate
the nr_pref_llc buffer — dst_llc may end up exceeding the bounds of the
original nr_pref_llc buffer.
For this reason, we added a check if (dst_llc >= max_llcs) in llc_id()
when attempting to access rq->nr_pref_llc[dst_llc].
However, I agree that the max_llcs check seems not to be properly integrated
into the current patch: it should instead be placed in the 7th patch, as
this would better illustrate the rationale for the max_llcs check here:
sched/cache: Introduce per runqueue task LLC preference counter
In the 7th patch, we actually increment new_max_llcs rather than
max_llcs — meaning max_llcs always represents the "old" number of LLCs.
As a result, there is a race window between extending the rq->nr_pref_llc
buffer and updating max_llcs.
@@ -714,7 +827,7 @@ static int update_llc_id(struct sched_domain *sd,
*
* For both cases, we want to increase the number of LLCs.
*/
- per_cpu(sd_llc_id, cpu) = max_llcs++;
+ per_cpu(sd_llc_id, cpu) = new_max_llcs++;
return per_cpu(sd_llc_id, cpu);
}
> Thanks for pointing this out. Yes, we should take care of the
> attachment of NULL sd. Will update the code accordingly.
>
My understanding is that, if the sd is NULL, it is either because
update_llc_id() was invoked by detach_destroy_domain() for the old set,
or because of case 2 mentioned in the above comments:
Say, CPU0-CPU95 are online during bootup, the boot command line is
maxcpus=96.
Later after bootup, the user wants to bring up CPU100; the LLC domain for
CPU100 is NULL in this case (due to sd degeneration), and a new LLC should be
detected.
That is to say, when we reach update_llc_id(), there could be 2 reasons
for NULL sd. For the detach_destroy_domain() case, update_llc_id()
should return a valid id without increasing the max_llcs, because of
if (id >= 0)
return id;
And for the latter, the max_llcs should be increased.
Let me double check on this.
thanks,
Chenyu
> Tim
On Tue, 2025-12-16 at 13:31 +0800, Chen, Yu C wrote:
> On 12/16/2025 4:49 AM, Tim Chen wrote:
> > On Tue, 2025-12-09 at 12:58 +0100, Peter Zijlstra wrote:
> > > On Wed, Dec 03, 2025 at 03:07:23PM -0800, Tim Chen wrote:
> > >
> > > > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > > > index 710ed9943d27..0a3918269906 100644
> > > > --- a/kernel/sched/fair.c
> > > > +++ b/kernel/sched/fair.c
> > > > @@ -1210,10 +1210,17 @@ __read_mostly unsigned int llc_imb_pct = 20;
> > > >
> > > > static int llc_id(int cpu)
> > > > {
> > > > + int llc;
> > > > +
> > > > if (cpu < 0)
> > > > return -1;
> > > >
> > > > + llc = per_cpu(sd_llc_id, cpu);
> > > > + /* avoid race with cpu hotplug */
> > > > + if (unlikely(llc >= max_llcs))
> > > > + return -1;
> > > > +
> > > > + return llc;
> > > > }
> > > >
> > > > void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *_pcpu_sched)
> > >
> > > > @@ -668,6 +670,55 @@ DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
> > > > DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
> > > > DEFINE_STATIC_KEY_FALSE(sched_cluster_active);
> > > >
> > > > +/*
> > > > + * Assign continuous llc id for the CPU, and return
> > > > + * the assigned llc id.
> > > > + */
> > > > +static int update_llc_id(struct sched_domain *sd,
> > > > + int cpu)
> > > > +{
> > > > + int id = per_cpu(sd_llc_id, cpu), i;
> > > > +
> > > > + if (id >= 0)
> > > > + return id;
> > > > +
> > > > + if (sd) {
> > > > + /* Look for any assigned id and reuse it.*/
> > > > + for_each_cpu(i, sched_domain_span(sd)) {
> > > > + id = per_cpu(sd_llc_id, i);
> > > > +
> > > > + if (id >= 0) {
> > > > + per_cpu(sd_llc_id, cpu) = id;
> > > > + return id;
> > > > + }
> > > > + }
> > > > + }
> > > > +
> > > > + /*
> > > > + * When 1. there is no id assigned to this LLC domain,
> > > > + * or 2. the sd is NULL, we reach here.
> > > > + * Consider the following scenario,
> > > > + * CPU0~CPU95 are in the node0, CPU96~CPU191 are
> > > > + * in the node1. During bootup, maxcpus=96 is
> > > > + * appended.
> > > > + * case 1: When running cpu_attach_domain(CPU24)
> > > > + * during boot up, CPU24 is the first CPU in its
> > > > + * non-NULL LLC domain. However,
> > > > + * its corresponding llc id has not been assigned yet.
> > > > + *
> > > > + * case 2: After boot up, the CPU100 is brought up
> > > > + * via sysfs manually. As a result, CPU100 has only a
> > > > + * Numa domain attached, because CPU100 is the only CPU
> > > > + * of a sched domain, all its bottom domains are degenerated.
> > > > + * The LLC domain pointer sd is NULL for CPU100.
> > > > + *
> > > > + * For both cases, we want to increase the number of LLCs.
> > > > + */
> > > > + per_cpu(sd_llc_id, cpu) = max_llcs++;
> > > > +
> > > > + return per_cpu(sd_llc_id, cpu);
> > > > +}
> > >
> > > I'm not sure I follow. So partition_sched_domains() first calls
> > > detach_destroy_domains() on the old set, and then build_sched_domains()
> > > on the new set.
> > >
> > > Do detach_destroy_domain() will do:
> > >
> > > cpu_attach_domain(NULL,..);
> > >
> > > That is, it will explicitly attach the NULL sched_domain to a CPU. At
> > > which point I feel update_llc_id() should be returning -1, no?
> > >
> > > Then later, build_sched_domains() will set a !NULL sched_domain, at
> > > which point update_llc_id() can set a real value.
> > >
> > > This should then also get rid of that weird max_llcs check in llc_id(),
> > > right?
>
> The check for max_llcs was intended to prevent out-of-bounds access
> to rq->nr_pref_llc[] at multiple points in the code.
> Since dst_llc = llc_id(env->dst_cpu); — and while the LLC ID for the
> CPU is updated in update_llc_id(), this update occurs before we reallocate
> the nr_pref_llc buffer — dst_llc may end up exceeding the bounds of the
> original nr_pref_llc buffer.
>
> For this reason, we added a check if (dst_llc > max_llc) in llc_id()
> when attempting to access rq->nr_pref_llc[dst_llc].
>
> However, I agree that the max_llc check seems to not properly integrated
> into the current patch: it should instead be placed in the 7th patch, as
> this would better illustrate the rationale for the max_llc check here:
> sched/cache: Introduce per runqueue task LLC preference counter
>
> In the 7th patch, we actually increment new_max_llcs rather than
> max_llcs — meaning max_llcs always represents the "old" number of LLCs.
> As a result, there is a race window between extending the rq->nr_pref_llc
> buffer and updating max_llcs.
>
>
> @@ -714,7 +827,7 @@ static int update_llc_id(struct sched_domain *sd,
> *
> * For both cases, we want to increase the number of LLCs.
> */
> - per_cpu(sd_llc_id, cpu) = max_llcs++;
> + per_cpu(sd_llc_id, cpu) = new_max_llcs++;
>
> return per_cpu(sd_llc_id, cpu);
> }
>
>
> > Thanks for pointing this out. Yes, we should take care of the
> > attachment of NULL sd. Will update the code accordingly.
> >
>
> My understanding is that, if the sd is NULL, it is either because invoked
> by detach_destroy_domain() for the old set, or by case 2 mentioned in
> above comments:
> Say, CPU0-CPU95 are online during bootup, the boot command line is
> maxcpus=96.
> Later after bootup, the user wants to bring up CPU100, the LLC domain for
> CPU100 is NULL in this case(due to sd generation), and a new LLC should be
> detected.
>
> That is to say, when we reach update_llc_id(), there could be 2 reasons
> for NULL sd. For the detach_destroy_domain() case, update_llc_id()
> should return a valid id without increasing the max_llcs, because of
> if (id >= 0)
> return id;
> And for the latter, the max_llcs should be increased.
> Let me double check on this.
The issue is we could offline all CPUs in a LLC and online them later.
In the current code, we will assign their ids all to -1. So on attach
of CPUs again, we'll be assigning a new LLC. I think the proper thing
to do is not to assign llc id of the offlined cpu (the case where sd == NULL)
and keep the original llc id assigned. Then we should be okay and not
increase max_llcs.
Tim
>
> thanks,
> Chenyu
>
>
> > Tim
On 12/17/2025 3:53 AM, Tim Chen wrote:
> On Tue, 2025-12-16 at 13:31 +0800, Chen, Yu C wrote:
>> On 12/16/2025 4:49 AM, Tim Chen wrote:
>>> On Tue, 2025-12-09 at 12:58 +0100, Peter Zijlstra wrote:
>>>> On Wed, Dec 03, 2025 at 03:07:23PM -0800, Tim Chen wrote:
>>>>
>>>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>>>> index 710ed9943d27..0a3918269906 100644
>>>>> --- a/kernel/sched/fair.c
>>>>> +++ b/kernel/sched/fair.c
>>>>> @@ -1210,10 +1210,17 @@ __read_mostly unsigned int llc_imb_pct = 20;
>>>>>
>>>>> static int llc_id(int cpu)
>>>>> {
>>>>> + int llc;
>>>>> +
>>>>> if (cpu < 0)
>>>>> return -1;
>>>>>
>>>>> + llc = per_cpu(sd_llc_id, cpu);
>>>>> + /* avoid race with cpu hotplug */
>>>>> + if (unlikely(llc >= max_llcs))
>>>>> + return -1;
>>>>> +
>>>>> + return llc;
>>>>> }
>>>>>
>>>>> void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *_pcpu_sched)
>>>>
>>>>> @@ -668,6 +670,55 @@ DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
>>>>> DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
>>>>> DEFINE_STATIC_KEY_FALSE(sched_cluster_active);
>>>>>
>>>>> +/*
>>>>> + * Assign continuous llc id for the CPU, and return
>>>>> + * the assigned llc id.
>>>>> + */
>>>>> +static int update_llc_id(struct sched_domain *sd,
>>>>> + int cpu)
>>>>> +{
>>>>> + int id = per_cpu(sd_llc_id, cpu), i;
>>>>> +
>>>>> + if (id >= 0)
>>>>> + return id;
>>>>> +
>>>>> + if (sd) {
>>>>> + /* Look for any assigned id and reuse it.*/
>>>>> + for_each_cpu(i, sched_domain_span(sd)) {
>>>>> + id = per_cpu(sd_llc_id, i);
>>>>> +
>>>>> + if (id >= 0) {
>>>>> + per_cpu(sd_llc_id, cpu) = id;
>>>>> + return id;
>>>>> + }
>>>>> + }
>>>>> + }
>>>>> +
>>>>> + /*
>>>>> + * When 1. there is no id assigned to this LLC domain,
>>>>> + * or 2. the sd is NULL, we reach here.
>>>>> + * Consider the following scenario,
>>>>> + * CPU0~CPU95 are in the node0, CPU96~CPU191 are
>>>>> + * in the node1. During bootup, maxcpus=96 is
>>>>> + * appended.
>>>>> + * case 1: When running cpu_attach_domain(CPU24)
>>>>> + * during boot up, CPU24 is the first CPU in its
>>>>> + * non-NULL LLC domain. However,
>>>>> + * its corresponding llc id has not been assigned yet.
>>>>> + *
>>>>> + * case 2: After boot up, the CPU100 is brought up
>>>>> + * via sysfs manually. As a result, CPU100 has only a
>>>>> + * Numa domain attached, because CPU100 is the only CPU
>>>>> + * of a sched domain, all its bottom domains are degenerated.
>>>>> + * The LLC domain pointer sd is NULL for CPU100.
>>>>> + *
>>>>> + * For both cases, we want to increase the number of LLCs.
>>>>> + */
>>>>> + per_cpu(sd_llc_id, cpu) = max_llcs++;
>>>>> +
>>>>> + return per_cpu(sd_llc_id, cpu);
>>>>> +}
>>>>
>>>> I'm not sure I follow. So partition_sched_domains() first calls
>>>> detach_destroy_domains() on the old set, and then build_sched_domains()
>>>> on the new set.
>>>>
>>>> Do detach_destroy_domain() will do:
>>>>
>>>> cpu_attach_domain(NULL,..);
>>>>
>>>> That is, it will explicitly attach the NULL sched_domain to a CPU. At
>>>> which point I feel update_llc_id() should be returning -1, no?
>>>>
>>>> Then later, build_sched_domains() will set a !NULL sched_domain, at
>>>> which point update_llc_id() can set a real value.
>>>>
>>>> This should then also get rid of that weird max_llcs check in llc_id(),
>>>> right?
>>
>> The check for max_llcs was intended to prevent out-of-bounds access
>> to rq->nr_pref_llc[] at multiple points in the code.
>> Since dst_llc = llc_id(env->dst_cpu); — and while the LLC ID for the
>> CPU is updated in update_llc_id(), this update occurs before we reallocate
>> the nr_pref_llc buffer — dst_llc may end up exceeding the bounds of the
>> original nr_pref_llc buffer.
>>
>> For this reason, we added a check if (dst_llc > max_llc) in llc_id()
>> when attempting to access rq->nr_pref_llc[dst_llc].
>>
>> However, I agree that the max_llc check seems to not properly integrated
>> into the current patch: it should instead be placed in the 7th patch, as
>> this would better illustrate the rationale for the max_llc check here:
>> sched/cache: Introduce per runqueue task LLC preference counter
>>
>> In the 7th patch, we actually increment new_max_llcs rather than
>> max_llcs — meaning max_llcs always represents the "old" number of LLCs.
>> As a result, there is a race window between extending the rq->nr_pref_llc
>> buffer and updating max_llcs.
>>
>>
>> @@ -714,7 +827,7 @@ static int update_llc_id(struct sched_domain *sd,
>> *
>> * For both cases, we want to increase the number of LLCs.
>> */
>> - per_cpu(sd_llc_id, cpu) = max_llcs++;
>> + per_cpu(sd_llc_id, cpu) = new_max_llcs++;
>>
>> return per_cpu(sd_llc_id, cpu);
>> }
>>
>>
>>> Thanks for pointing this out. Yes, we should take care of the
>>> attachment of NULL sd. Will update the code accordingly.
>>>
>>
>> My understanding is that, if the sd is NULL, it is either because invoked
>> by detach_destroy_domain() for the old set, or by case 2 mentioned in
>> above comments:
>> Say, CPU0-CPU95 are online during bootup, the boot command line is
>> maxcpus=96.
>> Later after bootup, the user wants to bring up CPU100, the LLC domain for
>> CPU100 is NULL in this case(due to sd generation), and a new LLC should be
>> detected.
>>
>> That is to say, when we reach update_llc_id(), there could be 2 reasons
>> for NULL sd. For the detach_destroy_domain() case, update_llc_id()
>> should return a valid id without increasing the max_llcs, because of
>> if (id >= 0)
>> return id;
>> And for the latter, the max_llcs should be increased.
>> Let me double check on this.
>
> The issue is we could offline all CPUs in a LLC and online them later.
> In the current code, we will assign their ids all to -1.
I suppose we don't reset the ids in the current implementation; only
the first scan of LLCs will reset/initialize the ids to -1 in
build_sched_domains()?
if (!max_llcs) { //max_llcs is initialized to 0 during bootup
for_each_possible_cpu(i)
per_cpu(sd_llc_id, i) = -1;
}
> So on attach
> of CPUs again, we'll be assigning a new LLC. I think the proper thing
> to do is not to assign llc id of the offlined cpu (the case where sd == NULL)
> and keep the original llc id assigned. Then we should be okay and not
> increase max_llcs.
>
This is the current implementation, because we don't assign new ids to
CPUs that already have an id (no matter whether they are offline or online).
thanks,
Chenyu
© 2016 - 2025 Red Hat, Inc.