From: Chen Yu <yu.c.chen@intel.com>
Introduce an index mapping between CPUs and their LLCs. This provides
a continuous per LLC index needed for cache-aware load balancing in
later patches.
The existing per_cpu llc_id usually points to the first CPU of the
LLC domain, which is sparse and unsuitable as an array index. Using
llc_id directly would waste memory.
With the new mapping, CPUs in the same LLC share a continuous id:
per_cpu(llc_id, CPU=0...15) = 0
per_cpu(llc_id, CPU=16...31) = 1
per_cpu(llc_id, CPU=32...47) = 2
...
Once a CPU has been assigned an llc_id, this ID persists even when
the CPU is taken offline and brought back online, which can facilitate
the management of the ID.
Co-developed-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
Co-developed-by: K Prateek Nayak <kprateek.nayak@amd.com>
Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com>
Signed-off-by: Chen Yu <yu.c.chen@intel.com>
---
Notes:
v2->v3:
Allocate the LLC id according to the topology level data directly, rather
than calculating from the sched domain. This simplifies the code.
(Peter Zijlstra, K Prateek Nayak)
kernel/sched/topology.c | 47 ++++++++++++++++++++++++++++++++++++++---
1 file changed, 44 insertions(+), 3 deletions(-)
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index cf643a5ddedd..ca46b5cf7f78 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -20,6 +20,7 @@ void sched_domains_mutex_unlock(void)
/* Protected by sched_domains_mutex: */
static cpumask_var_t sched_domains_tmpmask;
static cpumask_var_t sched_domains_tmpmask2;
+static int tl_max_llcs;
static int __init sched_debug_setup(char *str)
{
@@ -658,7 +659,7 @@ static void destroy_sched_domains(struct sched_domain *sd)
*/
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
DEFINE_PER_CPU(int, sd_llc_size);
-DEFINE_PER_CPU(int, sd_llc_id);
+DEFINE_PER_CPU(int, sd_llc_id) = -1;
DEFINE_PER_CPU(int, sd_share_id);
DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
@@ -684,7 +685,6 @@ static void update_top_cache_domain(int cpu)
rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
per_cpu(sd_llc_size, cpu) = size;
- per_cpu(sd_llc_id, cpu) = id;
rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
sd = lowest_flag_domain(cpu, SD_CLUSTER);
@@ -2567,10 +2567,18 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
/* Set up domains for CPUs specified by the cpu_map: */
for_each_cpu(i, cpu_map) {
- struct sched_domain_topology_level *tl;
+ struct sched_domain_topology_level *tl, *tl_llc = NULL;
+ int lid;
sd = NULL;
for_each_sd_topology(tl) {
+ int flags = 0;
+
+ if (tl->sd_flags)
+ flags = (*tl->sd_flags)();
+
+ if (flags & SD_SHARE_LLC)
+ tl_llc = tl;
sd = build_sched_domain(tl, cpu_map, attr, sd, i);
@@ -2581,6 +2589,39 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
if (cpumask_equal(cpu_map, sched_domain_span(sd)))
break;
}
+
+ lid = per_cpu(sd_llc_id, i);
+ if (lid == -1) {
+ int j;
+
+ /*
+ * Assign the llc_id to the CPUs that do not
+ * have an LLC.
+ */
+ if (!tl_llc) {
+ per_cpu(sd_llc_id, i) = tl_max_llcs++;
+
+ continue;
+ }
+
+ /* try to reuse the llc_id of its siblings */
+ for_each_cpu(j, tl_llc->mask(tl_llc, i)) {
+ if (i == j)
+ continue;
+
+ lid = per_cpu(sd_llc_id, j);
+
+ if (lid != -1) {
+ per_cpu(sd_llc_id, i) = lid;
+
+ break;
+ }
+ }
+
+ /* a new LLC is detected */
+ if (lid == -1)
+ per_cpu(sd_llc_id, i) = tl_max_llcs++;
+ }
}
if (WARN_ON(!topology_span_sane(cpu_map)))
--
2.32.0
On Tue, Feb 10, 2026 at 02:18:44PM -0800, Tim Chen wrote:
> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> index cf643a5ddedd..ca46b5cf7f78 100644
> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c
> @@ -20,6 +20,7 @@ void sched_domains_mutex_unlock(void)
> /* Protected by sched_domains_mutex: */
> static cpumask_var_t sched_domains_tmpmask;
> static cpumask_var_t sched_domains_tmpmask2;
> +static int tl_max_llcs;
>
> static int __init sched_debug_setup(char *str)
> {
> @@ -658,7 +659,7 @@ static void destroy_sched_domains(struct sched_domain *sd)
> */
> DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
> DEFINE_PER_CPU(int, sd_llc_size);
> -DEFINE_PER_CPU(int, sd_llc_id);
> +DEFINE_PER_CPU(int, sd_llc_id) = -1;
> DEFINE_PER_CPU(int, sd_share_id);
> DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
> DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
> @@ -684,7 +685,6 @@ static void update_top_cache_domain(int cpu)
>
> rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
> per_cpu(sd_llc_size, cpu) = size;
> - per_cpu(sd_llc_id, cpu) = id;
> rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
>
> sd = lowest_flag_domain(cpu, SD_CLUSTER);
> @@ -2567,10 +2567,18 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
>
> /* Set up domains for CPUs specified by the cpu_map: */
> for_each_cpu(i, cpu_map) {
> - struct sched_domain_topology_level *tl;
> + struct sched_domain_topology_level *tl, *tl_llc = NULL;
> + int lid;
>
> sd = NULL;
> for_each_sd_topology(tl) {
> + int flags = 0;
> +
> + if (tl->sd_flags)
> + flags = (*tl->sd_flags)();
> +
> + if (flags & SD_SHARE_LLC)
> + tl_llc = tl;
>
> sd = build_sched_domain(tl, cpu_map, attr, sd, i);
>
> @@ -2581,6 +2589,39 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
> if (cpumask_equal(cpu_map, sched_domain_span(sd)))
> break;
> }
> +
> + lid = per_cpu(sd_llc_id, i);
> + if (lid == -1) {
> + int j;
> +
> + /*
> + * Assign the llc_id to the CPUs that do not
> + * have an LLC.
> + */
Where does this happen? Is this for things like Atom that don't have an
L3 and so we don't set up a LLC domain?
> + if (!tl_llc) {
> + per_cpu(sd_llc_id, i) = tl_max_llcs++;
> +
> + continue;
> + }
> +
> + /* try to reuse the llc_id of its siblings */
> + for_each_cpu(j, tl_llc->mask(tl_llc, i)) {
> + if (i == j)
> + continue;
> +
> + lid = per_cpu(sd_llc_id, j);
> +
> + if (lid != -1) {
> + per_cpu(sd_llc_id, i) = lid;
> +
> + break;
> + }
> + }
> +
> + /* a new LLC is detected */
> + if (lid == -1)
> + per_cpu(sd_llc_id, i) = tl_max_llcs++;
> + }
> }
>
> if (WARN_ON(!topology_span_sane(cpu_map)))
> --
> 2.32.0
>
On 2/19/2026 10:59 PM, Peter Zijlstra wrote:
> On Tue, Feb 10, 2026 at 02:18:44PM -0800, Tim Chen wrote:
>
>> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
>> index cf643a5ddedd..ca46b5cf7f78 100644
>> --- a/kernel/sched/topology.c
>> +++ b/kernel/sched/topology.c
>> @@ -20,6 +20,7 @@ void sched_domains_mutex_unlock(void)
>> /* Protected by sched_domains_mutex: */
>> static cpumask_var_t sched_domains_tmpmask;
>> static cpumask_var_t sched_domains_tmpmask2;
>> +static int tl_max_llcs;
>>
>> static int __init sched_debug_setup(char *str)
>> {
>> @@ -658,7 +659,7 @@ static void destroy_sched_domains(struct sched_domain *sd)
>> */
>> DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
>> DEFINE_PER_CPU(int, sd_llc_size);
>> -DEFINE_PER_CPU(int, sd_llc_id);
>> +DEFINE_PER_CPU(int, sd_llc_id) = -1;
>> DEFINE_PER_CPU(int, sd_share_id);
>> DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
>> DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
>> @@ -684,7 +685,6 @@ static void update_top_cache_domain(int cpu)
>>
>> rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
>> per_cpu(sd_llc_size, cpu) = size;
>> - per_cpu(sd_llc_id, cpu) = id;
>> rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
>>
>> sd = lowest_flag_domain(cpu, SD_CLUSTER);
>> @@ -2567,10 +2567,18 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
>>
>> /* Set up domains for CPUs specified by the cpu_map: */
>> for_each_cpu(i, cpu_map) {
>> - struct sched_domain_topology_level *tl;
>> + struct sched_domain_topology_level *tl, *tl_llc = NULL;
>> + int lid;
>>
>> sd = NULL;
>> for_each_sd_topology(tl) {
>> + int flags = 0;
>> +
>> + if (tl->sd_flags)
>> + flags = (*tl->sd_flags)();
>> +
>> + if (flags & SD_SHARE_LLC)
>> + tl_llc = tl;
>>
>> sd = build_sched_domain(tl, cpu_map, attr, sd, i);
>>
>> @@ -2581,6 +2589,39 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
>> if (cpumask_equal(cpu_map, sched_domain_span(sd)))
>> break;
>> }
>> +
>> + lid = per_cpu(sd_llc_id, i);
>> + if (lid == -1) {
>> + int j;
>> +
>> + /*
>> + * Assign the llc_id to the CPUs that do not
>> + * have an LLC.
>> + */
>
> Where does this happen? Is this for things like Atom that don't have an
> L3 and so we don't set up a LLC domain?
>
Yes, for some hybrid platforms, some CPUs on those platforms might not
have L3,
Tim might correct me if I’m wrong. Above code is derived from the
update_top_cache_domain(),
if there is no sched domain with SD_SHARE_LLC, per_cpu(sd_llc_id, cpu)
is set to the
CPU number directly.
thanks,
Chenyu
On Thu, 2026-02-19 at 23:20 +0800, Chen, Yu C wrote:
> On 2/19/2026 10:59 PM, Peter Zijlstra wrote:
> > On Tue, Feb 10, 2026 at 02:18:44PM -0800, Tim Chen wrote:
> >
> > > diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> > > index cf643a5ddedd..ca46b5cf7f78 100644
> > > --- a/kernel/sched/topology.c
> > > +++ b/kernel/sched/topology.c
> > > @@ -20,6 +20,7 @@ void sched_domains_mutex_unlock(void)
> > > /* Protected by sched_domains_mutex: */
> > > static cpumask_var_t sched_domains_tmpmask;
> > > static cpumask_var_t sched_domains_tmpmask2;
> > > +static int tl_max_llcs;
> > >
> > > static int __init sched_debug_setup(char *str)
> > > {
> > > @@ -658,7 +659,7 @@ static void destroy_sched_domains(struct sched_domain *sd)
> > > */
> > > DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
> > > DEFINE_PER_CPU(int, sd_llc_size);
> > > -DEFINE_PER_CPU(int, sd_llc_id);
> > > +DEFINE_PER_CPU(int, sd_llc_id) = -1;
> > > DEFINE_PER_CPU(int, sd_share_id);
> > > DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
> > > DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
> > > @@ -684,7 +685,6 @@ static void update_top_cache_domain(int cpu)
> > >
> > > rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
> > > per_cpu(sd_llc_size, cpu) = size;
> > > - per_cpu(sd_llc_id, cpu) = id;
> > > rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
> > >
> > > sd = lowest_flag_domain(cpu, SD_CLUSTER);
> > > @@ -2567,10 +2567,18 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
> > >
> > > /* Set up domains for CPUs specified by the cpu_map: */
> > > for_each_cpu(i, cpu_map) {
> > > - struct sched_domain_topology_level *tl;
> > > + struct sched_domain_topology_level *tl, *tl_llc = NULL;
> > > + int lid;
> > >
> > > sd = NULL;
> > > for_each_sd_topology(tl) {
> > > + int flags = 0;
> > > +
> > > + if (tl->sd_flags)
> > > + flags = (*tl->sd_flags)();
> > > +
> > > + if (flags & SD_SHARE_LLC)
> > > + tl_llc = tl;
> > >
> > > sd = build_sched_domain(tl, cpu_map, attr, sd, i);
> > >
> > > @@ -2581,6 +2589,39 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
> > > if (cpumask_equal(cpu_map, sched_domain_span(sd)))
> > > break;
> > > }
> > > +
> > > + lid = per_cpu(sd_llc_id, i);
> > > + if (lid == -1) {
> > > + int j;
> > > +
> > > + /*
> > > + * Assign the llc_id to the CPUs that do not
> > > + * have an LLC.
> > > + */
> >
> > Where does this happen? Is this for things like Atom that don't have an
> > L3 and so we don't set up a LLC domain?
> >
>
> Yes, for some hybrid platforms, some CPUs on that platforms might not
> have L3,
> Tim might correct me if I’m wrong. Above code is derived from the
> update_top_cache_domain(),
> if there is no sched domain with SD_SHARE_LLC, per_cpu(sd_llc_id, cpu)
> is set to the
> CPU number directly.
>
That's correct. One example is Meteor Lake where some Atom CPUs don't have
L3 but have only L2. And some Ampere CPUs also have no shared L3.
https://www.spinics.net/lists/kernel/msg5863118.html
This also reminded me that if we rely on cpu_coregroup_mask for LLC id
assignment, we may be missing out on such platforms, which need to treat
L2 as the last level cache. So we may need to fallback to cpu_clustergroup_mask
or cpu_smt_mask where applicable.
Tim
> thanks,
> Chenyu
>
On Thu, 2026-02-19 at 11:20 -0800, Tim Chen wrote:
> On Thu, 2026-02-19 at 23:20 +0800, Chen, Yu C wrote:
> > On 2/19/2026 10:59 PM, Peter Zijlstra wrote:
> > > On Tue, Feb 10, 2026 at 02:18:44PM -0800, Tim Chen wrote:
> > >
> > > > diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> > > > index cf643a5ddedd..ca46b5cf7f78 100644
> > > > --- a/kernel/sched/topology.c
> > > > +++ b/kernel/sched/topology.c
> > > > @@ -20,6 +20,7 @@ void sched_domains_mutex_unlock(void)
> > > > /* Protected by sched_domains_mutex: */
> > > > static cpumask_var_t sched_domains_tmpmask;
> > > > static cpumask_var_t sched_domains_tmpmask2;
> > > > +static int tl_max_llcs;
> > > >
> > > > static int __init sched_debug_setup(char *str)
> > > > {
> > > > @@ -658,7 +659,7 @@ static void destroy_sched_domains(struct sched_domain *sd)
> > > > */
> > > > DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
> > > > DEFINE_PER_CPU(int, sd_llc_size);
> > > > -DEFINE_PER_CPU(int, sd_llc_id);
> > > > +DEFINE_PER_CPU(int, sd_llc_id) = -1;
> > > > DEFINE_PER_CPU(int, sd_share_id);
> > > > DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
> > > > DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
> > > > @@ -684,7 +685,6 @@ static void update_top_cache_domain(int cpu)
> > > >
> > > > rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
> > > > per_cpu(sd_llc_size, cpu) = size;
> > > > - per_cpu(sd_llc_id, cpu) = id;
> > > > rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
> > > >
> > > > sd = lowest_flag_domain(cpu, SD_CLUSTER);
> > > > @@ -2567,10 +2567,18 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
> > > >
> > > > /* Set up domains for CPUs specified by the cpu_map: */
> > > > for_each_cpu(i, cpu_map) {
> > > > - struct sched_domain_topology_level *tl;
> > > > + struct sched_domain_topology_level *tl, *tl_llc = NULL;
> > > > + int lid;
> > > >
> > > > sd = NULL;
> > > > for_each_sd_topology(tl) {
> > > > + int flags = 0;
> > > > +
> > > > + if (tl->sd_flags)
> > > > + flags = (*tl->sd_flags)();
> > > > +
> > > > + if (flags & SD_SHARE_LLC)
> > > > + tl_llc = tl;
> > > >
> > > > sd = build_sched_domain(tl, cpu_map, attr, sd, i);
> > > >
> > > > @@ -2581,6 +2589,39 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
> > > > if (cpumask_equal(cpu_map, sched_domain_span(sd)))
> > > > break;
> > > > }
> > > > +
> > > > + lid = per_cpu(sd_llc_id, i);
> > > > + if (lid == -1) {
> > > > + int j;
> > > > +
> > > > + /*
> > > > + * Assign the llc_id to the CPUs that do not
> > > > + * have an LLC.
> > > > + */
> > >
> > > Where does this happen? Is this for things like Atom that don't have an
> > > L3 and so we don't set up a LLC domain?
> > >
> >
> > Yes, for some hybrid platforms, some CPUs on that platforms might not
> > have L3,
> > Tim might correct me if I’m wrong. Above code is derived from the
> > update_top_cache_domain(),
> > if there is no sched domain with SD_SHARE_LLC, per_cpu(sd_llc_id, cpu)
> > is set to the
> > CPU number directly.
> >
>
> That's correct. One example is Meteor Lake where some Atom CPUs don't have
> L3 but have only L2. And some Ampere CPUs also have no shared L3.
>
> https://www.spinics.net/lists/kernel/msg5863118.html
>
> This also reminded me that if we rely on cpu_coregroup_mask for LLC id
> assignment, we may be missing out such platforms which need to treat
> L2 as the last level cache. So we may need to fallback to cpu_clustergroup_mask
> or cpu_smt_mask where applicable.
On further inspection of the code, cpu_coregroup_mask will just be the same
as cpu_clustergroup_mask for that case so we should be okay.
Tim
>
> Tim
>
> > thanks,
> > Chenyu
> >
On 2/20/2026 5:04 AM, Tim Chen wrote:
> On Thu, 2026-02-19 at 11:20 -0800, Tim Chen wrote:
>> On Thu, 2026-02-19 at 23:20 +0800, Chen, Yu C wrote:
>>> On 2/19/2026 10:59 PM, Peter Zijlstra wrote:
>>>> On Tue, Feb 10, 2026 at 02:18:44PM -0800, Tim Chen wrote:
[ ... ]
>>>>> +
>>>>> + lid = per_cpu(sd_llc_id, i);
>>>>> + if (lid == -1) {
>>>>> + int j;
>>>>> +
>>>>> + /*
>>>>> + * Assign the llc_id to the CPUs that do not
>>>>> + * have an LLC.
>>>>> + */
>>>>
>>>> Where does this happen? Is this for things like Atom that don't have an
>>>> L3 and so we don't set up a LLC domain?
>>>>
>>>
>>> Yes, for some hybrid platforms, some CPUs on that platforms might not
>>> have L3,
>>> Tim might correct me if I’m wrong. Above code is derived from the
>>> update_top_cache_domain(),
>>> if there is no sched domain with SD_SHARE_LLC, per_cpu(sd_llc_id, cpu)
>>> is set to the
>>> CPU number directly.
>>>
>>
>> That's correct. One example is Meteor Lake where some Atom CPUs don't have
>> L3 but have only L2. And some Ampere CPUs also have no shared L3.
>>
>> https://www.spinics.net/lists/kernel/msg5863118.html
>>
>> This also reminded me that if we rely on cpu_coregroup_mask for LLC id
>> assignment, we may be missing out such platforms which need to treat
>> L2 as the last level cache. So we may need to fallback to cpu_clustergroup_mask
>> or cpu_smt_mask where applicable.
>
> On further inspection of the code, cpu_coregroup_mask will just be the same
> as cpu_clustergroup_mask for that case so we should be okay.
>
OK, I assume this is true for Intel platforms because the llc_id will
be set to l2_id if there is no L3 cache:
c->topo.llc_id = (l3_id == BAD_APICID) ? l2_id : l3_id;
I suppose AMD platforms should not be impacted because I have not seen
any non-L3 platforms (for AMD).
For non-x86 platforms, cpu_coregroup_mask() will be converted to the
cluster mask if no LLC is present.
thanks,
Chenyu
On Tue, Feb 10, 2026 at 02:18:44PM -0800, Tim Chen wrote: > From: Chen Yu <yu.c.chen@intel.com> > > Introduce an index mapping between CPUs and their LLCs. This provides > a continuous per LLC index needed for cache-aware load balancing in > later patches. > > The existing per_cpu llc_id usually points to the first CPU of the > LLC domain, which is sparse and unsuitable as an array index. Using > llc_id directly would waste memory. > > With the new mapping, CPUs in the same LLC share a continuous id: > > per_cpu(llc_id, CPU=0...15) = 0 > per_cpu(llc_id, CPU=16...31) = 1 > per_cpu(llc_id, CPU=32...47) = 2 > ... > > Once a CPU has been assigned an llc_id, this ID persists even when > the CPU is taken offline and brought back online, which can facilitate > the management of the ID. > > Co-developed-by: Tim Chen <tim.c.chen@linux.intel.com> > Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com> > Co-developed-by: K Prateek Nayak <kprateek.nayak@amd.com> > Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com> > Signed-off-by: Chen Yu <yu.c.chen@intel.com> Note that Tim is the one sending this email, so his SOB should be last. It is also fine to have a SOB occur multiple times in a chain. Please double check all these SOB chains, because I think this isn't the first one that isn't right (possibly the very first patch already has problems).
On Thu, 2026-02-19 at 12:35 +0100, Peter Zijlstra wrote: > On Tue, Feb 10, 2026 at 02:18:44PM -0800, Tim Chen wrote: > > From: Chen Yu <yu.c.chen@intel.com> > > > > Introduce an index mapping between CPUs and their LLCs. This provides > > a continuous per LLC index needed for cache-aware load balancing in > > later patches. > > > > The existing per_cpu llc_id usually points to the first CPU of the > > LLC domain, which is sparse and unsuitable as an array index. Using > > llc_id directly would waste memory. > > > > With the new mapping, CPUs in the same LLC share a continuous id: > > > > per_cpu(llc_id, CPU=0...15) = 0 > > per_cpu(llc_id, CPU=16...31) = 1 > > per_cpu(llc_id, CPU=32...47) = 2 > > ... > > > > Once a CPU has been assigned an llc_id, this ID persists even when > > the CPU is taken offline and brought back online, which can facilitate > > the management of the ID. > > > > Co-developed-by: Tim Chen <tim.c.chen@linux.intel.com> > > Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com> > > Co-developed-by: K Prateek Nayak <kprateek.nayak@amd.com> > > Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com> > > Signed-off-by: Chen Yu <yu.c.chen@intel.com> > > Note that Tim is the one sending this email, so his SOB should be last. > It is also fine to have a SOB occur multiple times in a chain. > > Please double check all these SOB chains, because I think this isn't the > first one that isn't right (possibly the very first patch already has > problems). Sorry about that. Will correct this on the next version. Tim
Hello Tim, Chenyu,
On 2/11/2026 3:48 AM, Tim Chen wrote:
> From: Chen Yu <yu.c.chen@intel.com>
>
> Introduce an index mapping between CPUs and their LLCs. This provides
> a continuous per LLC index needed for cache-aware load balancing in
> later patches.
>
> The existing per_cpu llc_id usually points to the first CPU of the
> LLC domain, which is sparse and unsuitable as an array index. Using
> llc_id directly would waste memory.
>
> With the new mapping, CPUs in the same LLC share a continuous id:
>
> per_cpu(llc_id, CPU=0...15) = 0
> per_cpu(llc_id, CPU=16...31) = 1
> per_cpu(llc_id, CPU=32...47) = 2
> ...
>
> Once a CPU has been assigned an llc_id, this ID persists even when
> the CPU is taken offline and brought back online, which can facilitate
> the management of the ID.
>
> Co-developed-by: Tim Chen <tim.c.chen@linux.intel.com>
> Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
> Co-developed-by: K Prateek Nayak <kprateek.nayak@amd.com>
> Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com>
> Signed-off-by: Chen Yu <yu.c.chen@intel.com>
> ---
>
> Notes:
> v2->v3:
> Allocate the LLC id according to the topology level data directly, rather
> than calculating from the sched domain. This simplifies the code.
> (Peter Zijlstra, K Prateek Nayak)
>
> kernel/sched/topology.c | 47 ++++++++++++++++++++++++++++++++++++++---
> 1 file changed, 44 insertions(+), 3 deletions(-)
>
> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> index cf643a5ddedd..ca46b5cf7f78 100644
> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c
> @@ -20,6 +20,7 @@ void sched_domains_mutex_unlock(void)
> /* Protected by sched_domains_mutex: */
> static cpumask_var_t sched_domains_tmpmask;
> static cpumask_var_t sched_domains_tmpmask2;
> +static int tl_max_llcs;
>
> static int __init sched_debug_setup(char *str)
> {
> @@ -658,7 +659,7 @@ static void destroy_sched_domains(struct sched_domain *sd)
> */
> DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
> DEFINE_PER_CPU(int, sd_llc_size);
> -DEFINE_PER_CPU(int, sd_llc_id);
> +DEFINE_PER_CPU(int, sd_llc_id) = -1;
> DEFINE_PER_CPU(int, sd_share_id);
> DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
> DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
> @@ -684,7 +685,6 @@ static void update_top_cache_domain(int cpu)
>
> rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
> per_cpu(sd_llc_size, cpu) = size;
> - per_cpu(sd_llc_id, cpu) = id;
> rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
>
> sd = lowest_flag_domain(cpu, SD_CLUSTER);
> @@ -2567,10 +2567,18 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
>
> /* Set up domains for CPUs specified by the cpu_map: */
> for_each_cpu(i, cpu_map) {
> - struct sched_domain_topology_level *tl;
> + struct sched_domain_topology_level *tl, *tl_llc = NULL;
> + int lid;
>
> sd = NULL;
> for_each_sd_topology(tl) {
> + int flags = 0;
> +
> + if (tl->sd_flags)
> + flags = (*tl->sd_flags)();
> +
> + if (flags & SD_SHARE_LLC)
> + tl_llc = tl;
nit. This loop breaks out when sched_domain_span(sd) covers the entire
cpu_map and it might have not reached the topmost SD_SHARE_LLC domain
yet. Is that cause for any concern?
>
> sd = build_sched_domain(tl, cpu_map, attr, sd, i);
>
> @@ -2581,6 +2589,39 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
> if (cpumask_equal(cpu_map, sched_domain_span(sd)))
> break;
> }
> +
> + lid = per_cpu(sd_llc_id, i);
> + if (lid == -1) {
> + int j;
> +
> + /*
> + * Assign the llc_id to the CPUs that do not
> + * have an LLC.
> + */
> + if (!tl_llc) {
> + per_cpu(sd_llc_id, i) = tl_max_llcs++;
> +
> + continue;
> + }
> +
> + /* try to reuse the llc_id of its siblings */
> + for_each_cpu(j, tl_llc->mask(tl_llc, i)) {
My only large concern that remains is the fact that offline CPUs are
taken out of the tl->mask(), which can lead to interesting cases where
CPUs in the same LLC can have different llc_id:
o Boot with maxcpus=1
o Run:
for i in {1..$NRCPUS}; do
echo 1 > /sys/devices/system/cpu/cpu$i/online;
echo 0 > /sys/devices/system/cpu/cpu$i/online;
done
o Finally run:
echo 1 | tee /sys/devices/system/cpu/cpu*/online;
Once all CPUs are online, only the CPUs in the boot CPU's LLC will have
the same llc_id. Every other CPU will have a unique llc_id, which might
make the system behave unexpectedly.
I'm wondering if we can do something like below on top of this patch:
(Only build tested; Prepared on top of this patch in Tim's tree)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c6efa71cf500..aee1be89ab4c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8268,6 +8268,8 @@ static void cpuset_cpu_active(void)
static void cpuset_cpu_inactive(unsigned int cpu)
{
if (!cpuhp_tasks_frozen) {
+ /* XXX: Is this the right spot? */
+ sched_domains_free_llc_id(cpu);
cpuset_update_active_cpus();
} else {
num_cpus_frozen++;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index de5b701c3950..31a8910297c7 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3903,6 +3903,7 @@ static inline bool sched_cache_enabled(void)
}
#endif
extern void init_sched_mm(struct task_struct *p);
+void sched_domains_free_llc_id(int cpu);
extern u64 avg_vruntime(struct cfs_rq *cfs_rq);
extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se);
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index ca46b5cf7f78..04c1ab489ee2 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -18,6 +18,7 @@ void sched_domains_mutex_unlock(void)
}
/* Protected by sched_domains_mutex: */
+static cpumask_var_t sched_domains_llc_id_allocmask;
static cpumask_var_t sched_domains_tmpmask;
static cpumask_var_t sched_domains_tmpmask2;
static int tl_max_llcs;
@@ -2543,6 +2544,53 @@ static bool topology_span_sane(const struct cpumask *cpu_map)
return true;
}
+static int __sched_domains_alloc_llc_id(void)
+{
+ int lid;
+
+ lockdep_assert_held(&sched_domains_mutex);
+
+ lid = cpumask_first_zero(sched_domains_llc_id_allocmask);
+ if (lid >= tl_max_llcs)
+ tl_max_llcs++;
+
+ /*
+ * llc_id space should never grow larger than the
+ * possible number of CPUs in the system.
+ */
+ if (!unlikely(WARN_ON_ONCE(lid >= nr_cpumask_bits)))
+ cpumask_set_cpu(lid, sched_domains_llc_id_allocmask);
+ return lid;
+}
+
+static void __sched_domains_free_llc_id(int cpu)
+{
+ int i, lid;
+
+ lockdep_assert_held(&sched_domains_mutex);
+
+ lid = per_cpu(sd_llc_id, cpu);
+ if (lid == -1)
+ return;
+
+ per_cpu(sd_llc_id, cpu) = -1;
+
+ for_each_online_cpu(i) {
+ /* An online CPU owns the llc_id. */
+ if (per_cpu(sd_llc_id, i) == lid)
+ return;
+ }
+
+ cpumask_clear_cpu(lid, sched_domains_llc_id_allocmask);
+}
+
+void sched_domains_free_llc_id(int cpu)
+{
+ sched_domains_mutex_lock();
+ __sched_domains_free_llc_id(cpu);
+ sched_domains_mutex_unlock();
+}
+
/*
* Build sched domains for a given set of CPUs and attach the sched domains
* to the individual CPUs
@@ -2599,7 +2647,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
* have an LLC.
*/
if (!tl_llc) {
- per_cpu(sd_llc_id, i) = tl_max_llcs++;
+ per_cpu(sd_llc_id, i) = __sched_domains_alloc_llc_id();
continue;
}
@@ -2620,7 +2668,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
/* a new LLC is detected */
if (lid == -1)
- per_cpu(sd_llc_id, i) = tl_max_llcs++;
+ per_cpu(sd_llc_id, i) = __sched_domains_alloc_llc_id();
}
}
@@ -2798,6 +2846,7 @@ int __init sched_init_domains(const struct cpumask *cpu_map)
{
int err;
+ zalloc_cpumask_var(&sched_domains_llc_id_allocmask, GFP_KERNEL);
zalloc_cpumask_var(&sched_domains_tmpmask, GFP_KERNEL);
zalloc_cpumask_var(&sched_domains_tmpmask2, GFP_KERNEL);
zalloc_cpumask_var(&fallback_doms, GFP_KERNEL);
---
It doesn't compact tl_max_llcs, but it should promote reuse of llc_id if
all CPUs of a LLC go offline. I know it is a ridiculous scenario but it
is possible nonetheless.
I'll let Peter and Valentin be the judge of additional space and
complexity needed for these bits :-)
> + if (i == j)
> + continue;
> +
> + lid = per_cpu(sd_llc_id, j);
> +
> + if (lid != -1) {
> + per_cpu(sd_llc_id, i) = lid;
> +
> + break;
> + }
> + }
> +
> + /* a new LLC is detected */
> + if (lid == -1)
> + per_cpu(sd_llc_id, i) = tl_max_llcs++;
> + }
> }
>
> if (WARN_ON(!topology_span_sane(cpu_map)))
--
Thanks and Regards,
Prateek
On Mon, Feb 16, 2026 at 01:14:20PM +0530, K Prateek Nayak wrote:
> > @@ -2581,6 +2589,39 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
> > if (cpumask_equal(cpu_map, sched_domain_span(sd)))
> > break;
> > }
> > +
> > + lid = per_cpu(sd_llc_id, i);
> > + if (lid == -1) {
> > + int j;
> > +
> > + /*
> > + * Assign the llc_id to the CPUs that do not
> > + * have an LLC.
> > + */
> > + if (!tl_llc) {
> > + per_cpu(sd_llc_id, i) = tl_max_llcs++;
> > +
> > + continue;
> > + }
> > +
> > + /* try to reuse the llc_id of its siblings */
> > + for_each_cpu(j, tl_llc->mask(tl_llc, i)) {
>
>
> My only large concern that remains is the fact that offline CPUs are
> taken out of the tl->mask() which can lead to interesting cases where
> CPUs on same LLC can have different llc_id:
>
> o Boot with maxcpus=1
>
> o Run:
>
> for i in {1..$NRCPUS}; do
> echo 1 > /sys/devices/system/cpu/cpu$i/online;
> echo 0 > /sys/devices/system/cpu/cpu$i/online;
> done
Lol, cute ;-)
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index c6efa71cf500..aee1be89ab4c 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -8268,6 +8268,8 @@ static void cpuset_cpu_active(void)
> static void cpuset_cpu_inactive(unsigned int cpu)
> {
> if (!cpuhp_tasks_frozen) {
> + /* XXX: Is this the right spot? */
> + sched_domains_free_llc_id(cpu);
> cpuset_update_active_cpus();
> } else {
> num_cpus_frozen++;
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index de5b701c3950..31a8910297c7 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -3903,6 +3903,7 @@ static inline bool sched_cache_enabled(void)
> }
> #endif
> extern void init_sched_mm(struct task_struct *p);
> +void sched_domains_free_llc_id(int cpu);
>
> extern u64 avg_vruntime(struct cfs_rq *cfs_rq);
> extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se);
> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> index ca46b5cf7f78..04c1ab489ee2 100644
> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c
> @@ -18,6 +18,7 @@ void sched_domains_mutex_unlock(void)
> }
>
> /* Protected by sched_domains_mutex: */
> +static cpumask_var_t sched_domains_llc_id_allocmask;
> static cpumask_var_t sched_domains_tmpmask;
> static cpumask_var_t sched_domains_tmpmask2;
> static int tl_max_llcs;
> @@ -2543,6 +2544,53 @@ static bool topology_span_sane(const struct cpumask *cpu_map)
> return true;
> }
>
> +static int __sched_domains_alloc_llc_id(void)
> +{
> + int lid;
> +
> + lockdep_assert_held(&sched_domains_mutex);
> +
> + lid = cpumask_first_zero(sched_domains_llc_id_allocmask);
> + if (lid >= tl_max_llcs)
> + tl_max_llcs++;
Urgh,. should we not rather track the max lid?
Also, we allocate max_llc sized data structures, if this thing is
'variable' we must also always store a copy of the 'lid' size of the
time of allocation.
> +
> + /*
> + * llc_id space should never grow larger than the
> + * possible number of CPUs in the system.
> + */
> + if (!unlikely(WARN_ON_ONCE(lid >= nr_cpumask_bits)))
> + cpumask_set_cpu(lid, sched_domains_llc_id_allocmask);
__cpumask_set_cpu()
Since you're serializing everything with that sched_domains_mutex, this
need not be an atomic op.
> + return lid;
> +}
> +
> +static void __sched_domains_free_llc_id(int cpu)
> +{
> + int i, lid;
> +
> + lockdep_assert_held(&sched_domains_mutex);
> +
> + lid = per_cpu(sd_llc_id, cpu);
> + if (lid == -1)
> + return;
> +
> + per_cpu(sd_llc_id, cpu) = -1;
> +
> + for_each_online_cpu(i) {
> + /* An online CPU owns the llc_id. */
> + if (per_cpu(sd_llc_id, i) == lid)
> + return;
> + }
> +
> + cpumask_clear_cpu(lid, sched_domains_llc_id_allocmask);
__cpumask_clear_cpu()
> +}
So this deals with Madadi's issue I suppose.
> +void sched_domains_free_llc_id(int cpu)
> +{
> + sched_domains_mutex_lock();
> + __sched_domains_free_llc_id(cpu);
> + sched_domains_mutex_unlock();
> +}
> +
> /*
> * Build sched domains for a given set of CPUs and attach the sched domains
> * to the individual CPUs
> @@ -2599,7 +2647,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
> * have an LLC.
> */
> if (!tl_llc) {
> - per_cpu(sd_llc_id, i) = tl_max_llcs++;
> + per_cpu(sd_llc_id, i) = __sched_domains_alloc_llc_id();
>
> continue;
> }
> @@ -2620,7 +2668,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
>
> /* a new LLC is detected */
> if (lid == -1)
> - per_cpu(sd_llc_id, i) = tl_max_llcs++;
> + per_cpu(sd_llc_id, i) = __sched_domains_alloc_llc_id();
> }
> }
>
> @@ -2798,6 +2846,7 @@ int __init sched_init_domains(const struct cpumask *cpu_map)
> {
> int err;
>
> + zalloc_cpumask_var(&sched_domains_llc_id_allocmask, GFP_KERNEL);
> zalloc_cpumask_var(&sched_domains_tmpmask, GFP_KERNEL);
> zalloc_cpumask_var(&sched_domains_tmpmask2, GFP_KERNEL);
> zalloc_cpumask_var(&fallback_doms, GFP_KERNEL);
> ---
>
> It doesn't compact tl_max_llcs, but it should promote reuse of llc_id if
> all CPUs of a LLC go offline. I know it is a ridiculous scenario but it
> is possible nonetheless.
>
> I'll let Peter and Valentin be the judge of additional space and
> complexity needed for these bits :-)
It appears straight forward enough I suppose.
Hi Peter,
On 2/19/2026 11:40 PM, Peter Zijlstra wrote:
> On Mon, Feb 16, 2026 at 01:14:20PM +0530, K Prateek Nayak wrote:
[ ... ]
>> +static int __sched_domains_alloc_llc_id(void)
>> +{
>> + int lid;
>> +
>> + lockdep_assert_held(&sched_domains_mutex);
>> +
>> + lid = cpumask_first_zero(sched_domains_llc_id_allocmask);
>> + if (lid >= tl_max_llcs)
>> + tl_max_llcs++;
>
> Urgh,. should we not rather track the max lid?
>
Do you mean we should not always increment the max lid,
but instead decrease it when an llc_id is released?
I think Tim has adjusted the code to shrink tl_max_llcs
when an llc_id is released:
https://lore.kernel.org/all/acc7a5c96e8235bf11af640798ce1b60bcaa8196.camel@linux.intel.com/
> Also, we allocate max_llc sized data structures, if this thing is
> 'variable' we must also always store a copy of the 'lid' size of the
> time of allocation.
>
Do you mean we should save the latest llc_max in the sched_domain
and publish it during sd attachment, as suggested at:
https://lore.kernel.org/all/20260220104533.GO1395266@noisy.programming.kicks-ass.net/
thanks,
Chenyu
On Fri, Feb 20, 2026 at 11:53:31PM +0800, Chen, Yu C wrote:
> Hi Peter,
>
> On 2/19/2026 11:40 PM, Peter Zijlstra wrote:
> > On Mon, Feb 16, 2026 at 01:14:20PM +0530, K Prateek Nayak wrote:
>
> [ ... ]
>
> > > +static int __sched_domains_alloc_llc_id(void)
> > > +{
> > > + int lid;
> > > +
> > > + lockdep_assert_held(&sched_domains_mutex);
> > > +
> > > + lid = cpumask_first_zero(sched_domains_llc_id_allocmask);
> > > + if (lid >= tl_max_llcs)
> > > + tl_max_llcs++;
> >
> > Urgh,. should we not rather track the max lid?
> >
>
> Do you mean we should not always increment the max lid,
> but instead decrease it when an llc_id is released?
> I think Tim has adjusted the code to shrink tl_max_llcs
> when an llc_id is released:
> https://lore.kernel.org/all/acc7a5c96e8235bf11af640798ce1b60bcaa8196.camel@linux.intel.com/
You can only shrink when the max lid is released. Since lid is an array
index, something like max_lid = weight(mask) would be terribly broken.
But what I was getting at is that the code as presented there is rather
non-obvious. Yes, if the lid is higher, it cannot be more than one
higher than the previous value, but something like:
lid = cpumask_first_zero();
BUG_ON(lid >= nr_cpu_ids);
max_lid = max(max_lid, lid);
Is way simpler to follow since it doesn't have that hidden assumption.
Then, if you want to allow shrinking, then the clear side could do
something like:
__cpumask_clear(lid, mask);
if (lid == max_lid)
max_lid = cpumask_last(mask);
or something like that.
> > Also, we allocate max_llc sized data structures, if this thing is
> > 'variable' we must also always store a copy of the 'lid' size of the
> > time of allocation.
> >
>
> Do you mean we should save the latest llc_max in the sched_domain
> and publish it during sd attachment, as suggested at:
>
> https://lore.kernel.org/all/20260220104533.GO1395266@noisy.programming.kicks-ass.net/
Yeah, having it separated like it is now feels super fragile.
On 2/21/2026 12:03 AM, Peter Zijlstra wrote:
> On Fri, Feb 20, 2026 at 11:53:31PM +0800, Chen, Yu C wrote:
>> Hi Peter,
>>
>> On 2/19/2026 11:40 PM, Peter Zijlstra wrote:
>>> On Mon, Feb 16, 2026 at 01:14:20PM +0530, K Prateek Nayak wrote:
>>
>> [ ... ]
>>
>>>> +static int __sched_domains_alloc_llc_id(void)
>>>> +{
>>>> + int lid;
>>>> +
>>>> + lockdep_assert_held(&sched_domains_mutex);
>>>> +
>>>> + lid = cpumask_first_zero(sched_domains_llc_id_allocmask);
>>>> + if (lid >= tl_max_llcs)
>>>> + tl_max_llcs++;
>>>
>>> Urgh,. should we not rather track the max lid?
>>>
>>
>> Do you mean we should not always increment the max lid,
>> but instead decrease it when an llc_id is released?
>> I think Tim has adjusted the code to shrink tl_max_llcs
>> when an llc_id is released:
>> https://lore.kernel.org/all/acc7a5c96e8235bf11af640798ce1b60bcaa8196.camel@linux.intel.com/
>
> You can only shrink when the max lid is released. Since lid is an array
> index, something like max_lid = weight(mask) would be terribly broken.
>
> But what I was getting at is that the code as presented there is rather
> non-obvious. Yes, if the lid is higher, it cannot be more than one
> higher than the previous value, but something like:
>
> lid = cpumask_first_zero();
> BUG_ON(lid >= nr_cpu_ids);
> max_lid = max(max_lid, lid);
>
> Is way simpler to follow since it doesn't have that hidden assumption.
>
> Then, if you want to allow shrinking, then the clear side could do
> something like:
>
> __cpumask_clear(lid, mask);
> if (lid == max_lid)
> max_lid = cpumask_last(mask);
>
> or something like that.
>
Got it, we will adjust the code accordingly.
thanks,
Chenyu
On Sat, 2026-02-21 at 00:10 +0800, Chen, Yu C wrote:
> On 2/21/2026 12:03 AM, Peter Zijlstra wrote:
> > On Fri, Feb 20, 2026 at 11:53:31PM +0800, Chen, Yu C wrote:
> > > Hi Peter,
> > >
> > > On 2/19/2026 11:40 PM, Peter Zijlstra wrote:
> > > > On Mon, Feb 16, 2026 at 01:14:20PM +0530, K Prateek Nayak wrote:
> > >
> > > [ ... ]
> > >
> > > > > +static int __sched_domains_alloc_llc_id(void)
> > > > > +{
> > > > > + int lid;
> > > > > +
> > > > > + lockdep_assert_held(&sched_domains_mutex);
> > > > > +
> > > > > + lid = cpumask_first_zero(sched_domains_llc_id_allocmask);
> > > > > + if (lid >= tl_max_llcs)
> > > > > + tl_max_llcs++;
> > > >
> > > > Urgh,. should we not rather track the max lid?
> > > >
> > >
> > > Do you mean we should not always increment the max lid,
> > > but instead decrease it when an llc_id is released?
> > > I think Tim has adjusted the code to shrink tl_max_llcs
> > > when an llc_id is released:
> > > https://lore.kernel.org/all/acc7a5c96e8235bf11af640798ce1b60bcaa8196.camel@linux.intel.com/
> >
> > You can only shrink when the max lid is released. Since lid is an array
> > index, something like max_lid = weight(mask) would be terribly broken.
> >
> > But what I was getting at is that the code as presented there is rather
> > non-obvious. Yes, if the lid is higher, it cannot be more than one
> > higher than the previous value, but something like:
> >
> > lid = cpumask_first_zero();
> > BUG_ON(lid >= nr_cpu_ids);
> > max_lid = max(max_lid, lid);
> >
> > Is way simpler to follow since it doesn't have that hidden assumption.
> >
> > Then, if you want to allow shrinking, then the clear side could do
> > something like:
> >
> > __cpumask_clear(lid, mask);
> > if (lid == max_lid)
> > max_lid = cpumask_last(mask);
> >
> > or something like that.
> >
>
> Got it, we will adjust the code accordingly.
>
How about modifying this patch like the following:
Thanks.
Tim
---
diff --git a/init/Kconfig b/init/Kconfig
index f4b2649f8401..da405c00e9e3 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -994,6 +994,7 @@ config SCHED_CACHE
bool "Cache aware load balance"
default y
depends on SMP
+ depends on SCHED_MC
help
When enabled, the scheduler will attempt to aggregate tasks from
the same process onto a single Last Level Cache (LLC) domain when
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c464e370576f..e34b5842caa4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8372,6 +8372,8 @@ int sched_cpu_deactivate(unsigned int cpu)
*/
synchronize_rcu();
+ sched_domains_free_llc_id(cpu);
+
sched_set_rq_offline(rq, cpu);
scx_rq_deactivate(rq);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index f4785f84b1f1..3096adc13074 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3932,6 +3932,13 @@ static inline bool sched_cache_enabled(void)
extern void sched_cache_active_set_unlocked(void);
#endif
+
+#ifdef CONFIG_SMP
+void sched_domains_free_llc_id(int cpu);
+#else /* !CONFIG_SMP: */
+static inline void sched_domains_free_llc_id(int cpu) { }
+#endif /* !CONFIG_SMP */
+
extern void init_sched_mm(struct task_struct *p);
extern u64 avg_vruntime(struct cfs_rq *cfs_rq);
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index e86dea1b9e86..f3bc6636170f 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -18,6 +18,7 @@ void sched_domains_mutex_unlock(void)
}
/* Protected by sched_domains_mutex: */
+static cpumask_var_t sched_domains_llc_id_allocmask;
static cpumask_var_t sched_domains_tmpmask;
static cpumask_var_t sched_domains_tmpmask2;
static int tl_max_llcs;
@@ -2660,6 +2661,61 @@ static bool topology_span_sane(const struct cpumask *cpu_map)
return true;
}
+#ifdef CONFIG_SMP
+static int __sched_domains_alloc_llc_id(void)
+{
+ int lid, max_lid;
+
+ lockdep_assert_held(&sched_domains_mutex);
+
+ lid = cpumask_first_zero(sched_domains_llc_id_allocmask);
+ /*
+ * llc_id space should never grow larger than the
+ * possible number of CPUs in the system.
+ */
+ BUG_ON(lid >= nr_cpu_ids);
+ max_lid = cpumask_last(sched_domains_llc_id_allocmask);
+ /* size is one more than max index */
+ tl_max_llcs = max(lid, max_lid) + 1;
+
+ return lid;
+}
+
+static void __sched_domains_free_llc_id(int cpu)
+{
+ int i, lid, last_lid;
+
+ lockdep_assert_held(&sched_domains_mutex);
+
+ lid = per_cpu(sd_llc_id, cpu);
+ if (lid == -1)
+ return;
+
+ BUG_ON(lid >= nr_cpu_ids);
+ per_cpu(sd_llc_id, cpu) = -1;
+
+ for_each_cpu(i, cpu_coregroup_mask(cpu)) {
+ /* An online CPU owns the llc_id. */
+ if (per_cpu(sd_llc_id, i) == lid)
+ return;
+ }
+
+ cpumask_clear_cpu(lid, sched_domains_llc_id_allocmask);
+
+ last_lid = cpumask_last(sched_domains_llc_id_allocmask);
+ /* shrink max LLC size to save memory */
+ if (last_lid < tl_max_llcs - 1)
+ tl_max_llcs = last_lid + 1;
+}
+
+void sched_domains_free_llc_id(int cpu)
+{
+ sched_domains_mutex_lock();
+ __sched_domains_free_llc_id(cpu);
+ sched_domains_mutex_unlock();
+}
+#endif /* CONFIG_SMP */
+
/*
* Build sched domains for a given set of CPUs and attach the sched domains
* to the individual CPUs
@@ -2685,18 +2741,11 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
/* Set up domains for CPUs specified by the cpu_map: */
for_each_cpu(i, cpu_map) {
- struct sched_domain_topology_level *tl, *tl_llc = NULL;
+ struct sched_domain_topology_level *tl;
int lid;
sd = NULL;
for_each_sd_topology(tl) {
- int flags = 0;
-
- if (tl->sd_flags)
- flags = (*tl->sd_flags)();
-
- if (flags & SD_SHARE_LLC)
- tl_llc = tl;
sd = build_sched_domain(tl, cpu_map, attr, sd, i);
@@ -2708,22 +2757,14 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
break;
}
+#ifdef CONFIG_SMP
lid = per_cpu(sd_llc_id, i);
if (lid == -1) {
int j;
- /*
- * Assign the llc_id to the CPUs that do not
- * have an LLC.
- */
- if (!tl_llc) {
- per_cpu(sd_llc_id, i) = tl_max_llcs++;
-
- continue;
- }
-
+ j = cpumask_first(cpu_coregroup_mask(i));
/* try to reuse the llc_id of its siblings */
- for_each_cpu(j, tl_llc->mask(tl_llc, i)) {
+ for (; j < nr_cpu_ids; j = cpumask_next(j, cpu_coregroup_mask(i))) {
if (i == j)
continue;
@@ -2738,8 +2779,9 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
/* a new LLC is detected */
if (lid == -1)
- per_cpu(sd_llc_id, i) = tl_max_llcs++;
+ per_cpu(sd_llc_id, i) = __sched_domains_alloc_llc_id();
}
+#endif /* CONFIG_SMP */
}
if (WARN_ON(!topology_span_sane(cpu_map)))
@@ -2939,6 +2981,7 @@ int __init sched_init_domains(const struct cpumask *cpu_map)
{
int err;
+ zalloc_cpumask_var(&sched_domains_llc_id_allocmask, GFP_KERNEL);
zalloc_cpumask_var(&sched_domains_tmpmask, GFP_KERNEL);
zalloc_cpumask_var(&sched_domains_tmpmask2, GFP_KERNEL);
zalloc_cpumask_var(&fallback_doms, GFP_KERNEL);
On Fri, Feb 20, 2026 at 11:24:11AM -0800, Tim Chen wrote:
> +static int __sched_domains_alloc_llc_id(void)
> +{
> + int lid, max_lid;
> +
> + lockdep_assert_held(&sched_domains_mutex);
> +
> + lid = cpumask_first_zero(sched_domains_llc_id_allocmask);
> + /*
> + * llc_id space should never grow larger than the
> + * possible number of CPUs in the system.
> + */
> + BUG_ON(lid >= nr_cpu_ids);
__cpumask_set_cpu(lid, sched_domains_llc_id_allocmask);
> + max_lid = cpumask_last(sched_domains_llc_id_allocmask);
> + /* size is one more than max index */
> + tl_max_llcs = max(lid, max_lid) + 1;
> +
> + return lid;
> +}
> +
> +static void __sched_domains_free_llc_id(int cpu)
> +{
> + int i, lid, last_lid;
> +
> + lockdep_assert_held(&sched_domains_mutex);
> +
> + lid = per_cpu(sd_llc_id, cpu);
> + if (lid == -1)
> + return;
> +
> + BUG_ON(lid >= nr_cpu_ids);
> + per_cpu(sd_llc_id, cpu) = -1;
> +
> + for_each_cpu(i, cpu_coregroup_mask(cpu)) {
> + /* An online CPU owns the llc_id. */
> + if (per_cpu(sd_llc_id, i) == lid)
> + return;
> + }
> +
__cpumask_clear_cpu(lid, sched_domains_llc_id_allocmask);
> +
> + last_lid = cpumask_last(sched_domains_llc_id_allocmask);
> + /* shrink max LLC size to save memory */
> + if (last_lid < tl_max_llcs - 1)
> + tl_max_llcs = last_lid + 1;
> +}
Might be simpler to just track max_lid, and do the +1 at the alloc site?
On Fri, 2026-02-20 at 20:30 +0100, Peter Zijlstra wrote:
> On Fri, Feb 20, 2026 at 11:24:11AM -0800, Tim Chen wrote:
>
> > +static int __sched_domains_alloc_llc_id(void)
> > +{
> > + int lid, max_lid;
> > +
> > + lockdep_assert_held(&sched_domains_mutex);
> > +
> > + lid = cpumask_first_zero(sched_domains_llc_id_allocmask);
> > + /*
> > + * llc_id space should never grow larger than the
> > + * possible number of CPUs in the system.
> > + */
> > + BUG_ON(lid >= nr_cpu_ids);
>
> __cpumask_set_cpu(lid, sched_domains_llc_id_allocmask);
Ah yes, fat fingers delete one line too many.
>
> > + max_lid = cpumask_last(sched_domains_llc_id_allocmask);
> > + /* size is one more than max index */
> > + tl_max_llcs = max(lid, max_lid) + 1;
> > +
> > + return lid;
> > +}
> > +
> > +static void __sched_domains_free_llc_id(int cpu)
> > +{
> > + int i, lid, last_lid;
> > +
> > + lockdep_assert_held(&sched_domains_mutex);
> > +
> > + lid = per_cpu(sd_llc_id, cpu);
> > + if (lid == -1)
> > + return;
> > +
> > + BUG_ON(lid >= nr_cpu_ids);
> > + per_cpu(sd_llc_id, cpu) = -1;
> > +
> > + for_each_cpu(i, cpu_coregroup_mask(cpu)) {
> > + /* An online CPU owns the llc_id. */
> > + if (per_cpu(sd_llc_id, i) == lid)
> > + return;
> > + }
> > +
> __cpumask_clear_cpu(lid, sched_domains_llc_id_allocmask);
> > +
> > + last_lid = cpumask_last(sched_domains_llc_id_allocmask);
> > + /* shrink max LLC size to save memory */
> > + if (last_lid < tl_max_llcs - 1)
> > + tl_max_llcs = last_lid + 1;
> > +}
>
> Might be simpler to just track max_lid, and do the +1 at the alloc site?
>
Sure, will do. Will also update the code to validate lid value accordingly.
Tim
Hi Prateek,
On 2/16/2026 3:44 PM, K Prateek Nayak wrote:
> Hello Tim, Chenyu,
>
> On 2/11/2026 3:48 AM, Tim Chen wrote:
>> From: Chen Yu <yu.c.chen@intel.com>
>>
>> Introduce an index mapping between CPUs and their LLCs. This provides
>> a continuous per LLC index needed for cache-aware load balancing in
>> later patches.
>>
>> The existing per_cpu llc_id usually points to the first CPU of the
>> LLC domain, which is sparse and unsuitable as an array index. Using
>> llc_id directly would waste memory.
>>
>> With the new mapping, CPUs in the same LLC share a continuous id:
>>
>> per_cpu(llc_id, CPU=0...15) = 0
>> per_cpu(llc_id, CPU=16...31) = 1
>> per_cpu(llc_id, CPU=32...47) = 2
>> ...
>>
>> Once a CPU has been assigned an llc_id, this ID persists even when
>> the CPU is taken offline and brought back online, which can facilitate
>> the management of the ID.
>>
>> Co-developed-by: Tim Chen <tim.c.chen@linux.intel.com>
>> Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
>> Co-developed-by: K Prateek Nayak <kprateek.nayak@amd.com>
>> Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com>
>> Signed-off-by: Chen Yu <yu.c.chen@intel.com>
>> ---
>>
>> Notes:
>> v2->v3:
>> Allocate the LLC id according to the topology level data directly, rather
>> than calculating from the sched domain. This simplifies the code.
>> (Peter Zijlstra, K Prateek Nayak)
>>
>> kernel/sched/topology.c | 47 ++++++++++++++++++++++++++++++++++++++---
>> 1 file changed, 44 insertions(+), 3 deletions(-)
>>
>> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
>> index cf643a5ddedd..ca46b5cf7f78 100644
>> --- a/kernel/sched/topology.c
>> +++ b/kernel/sched/topology.c
>> @@ -20,6 +20,7 @@ void sched_domains_mutex_unlock(void)
>> /* Protected by sched_domains_mutex: */
>> static cpumask_var_t sched_domains_tmpmask;
>> static cpumask_var_t sched_domains_tmpmask2;
>> +static int tl_max_llcs;
>>
>> static int __init sched_debug_setup(char *str)
>> {
>> @@ -658,7 +659,7 @@ static void destroy_sched_domains(struct sched_domain *sd)
>> */
>> DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
>> DEFINE_PER_CPU(int, sd_llc_size);
>> -DEFINE_PER_CPU(int, sd_llc_id);
>> +DEFINE_PER_CPU(int, sd_llc_id) = -1;
>> DEFINE_PER_CPU(int, sd_share_id);
>> DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
>> DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
>> @@ -684,7 +685,6 @@ static void update_top_cache_domain(int cpu)
>>
>> rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
>> per_cpu(sd_llc_size, cpu) = size;
>> - per_cpu(sd_llc_id, cpu) = id;
>> rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
>>
>> sd = lowest_flag_domain(cpu, SD_CLUSTER);
>> @@ -2567,10 +2567,18 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
>>
>> /* Set up domains for CPUs specified by the cpu_map: */
>> for_each_cpu(i, cpu_map) {
>> - struct sched_domain_topology_level *tl;
>> + struct sched_domain_topology_level *tl, *tl_llc = NULL;
>> + int lid;
>>
>> sd = NULL;
>> for_each_sd_topology(tl) {
>> + int flags = 0;
>> +
>> + if (tl->sd_flags)
>> + flags = (*tl->sd_flags)();
>> +
>> + if (flags & SD_SHARE_LLC)
>> + tl_llc = tl;
>
> nit. This loop breaks out when sched_domain_span(sd) covers the entire
> cpu_map and it might have not reached the topmost SD_SHARE_LLC domain
> yet. Is that cause for any concern?
>
Could you please elaborate a little more on this? If it covers the
entire cpu_map shouldn't it stop going up to its parent domain?
Do you mean, sd_llc_1 and its parent sd_llc_2 could cover the same cpu_map,
and we should let tl_llc be assigned to sd_llc_2 (with sd_llc_1 degenerated?)
>>
>> sd = build_sched_domain(tl, cpu_map, attr, sd, i);
>>
>> @@ -2581,6 +2589,39 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
>> if (cpumask_equal(cpu_map, sched_domain_span(sd)))
>> break;
>> }
>> +
>> + lid = per_cpu(sd_llc_id, i);
>> + if (lid == -1) {
>> + int j;
>> +
>> + /*
>> + * Assign the llc_id to the CPUs that do not
>> + * have an LLC.
>> + */
>> + if (!tl_llc) {
>> + per_cpu(sd_llc_id, i) = tl_max_llcs++;
>> +
>> + continue;
>> + }
>> +
>> + /* try to reuse the llc_id of its siblings */
>> + for_each_cpu(j, tl_llc->mask(tl_llc, i)) {
>
>
> My only large concern that remains is the fact that offline CPUs are
> taken out of the tl->mask() which can lead to interesting cases where
> CPUs on same LLC can have different llc_id:
>
> o Boot with maxcpus=1
>
> o Run:
>
> for i in {1..$NRCPUS}; do
> echo 1 > /sys/devices/system/cpu/cpu$i/online;
> echo 0 > /sys/devices/system/cpu/cpu$i/online;
> done
>
> o Finally run:
>
> echo 1 | tee /sys/devices/system/cpu/cpu*/online;
>
>
> Once all CPUs are online, only the CPUs in boot CPU's LLC will have
> the same llc_id. Every other CPU will have a unique llc_id which might
> make the system behave unexpectedly.
>
You are right, I did not realize that the tl->mask would be unreliable
for detecting offline CPUs, and this case is brilliant for exposing
the bug in current code, nice catch!
> I'm wondering if we can do something like below on top of this patch:
>
> (Only build tested; Prepared on top of this patch in Tim's tree)
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index c6efa71cf500..aee1be89ab4c 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -8268,6 +8268,8 @@ static void cpuset_cpu_active(void)
> static void cpuset_cpu_inactive(unsigned int cpu)
> {
> if (!cpuhp_tasks_frozen) {
> + /* XXX: Is this the right spot? */
> + sched_domains_free_llc_id(cpu);
> cpuset_update_active_cpus();
> } else {
> num_cpus_frozen++;
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index de5b701c3950..31a8910297c7 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -3903,6 +3903,7 @@ static inline bool sched_cache_enabled(void)
> }
> #endif
> extern void init_sched_mm(struct task_struct *p);
> +void sched_domains_free_llc_id(int cpu);
>
> extern u64 avg_vruntime(struct cfs_rq *cfs_rq);
> extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se);
> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> index ca46b5cf7f78..04c1ab489ee2 100644
> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c
> @@ -18,6 +18,7 @@ void sched_domains_mutex_unlock(void)
> }
>
> /* Protected by sched_domains_mutex: */
> +static cpumask_var_t sched_domains_llc_id_allocmask;
> static cpumask_var_t sched_domains_tmpmask;
> static cpumask_var_t sched_domains_tmpmask2;
> static int tl_max_llcs;
> @@ -2543,6 +2544,53 @@ static bool topology_span_sane(const struct cpumask *cpu_map)
> return true;
> }
>
> +static int __sched_domains_alloc_llc_id(void)
> +{
> + int lid;
> +
> + lockdep_assert_held(&sched_domains_mutex);
> +
> + lid = cpumask_first_zero(sched_domains_llc_id_allocmask);
> + if (lid >= tl_max_llcs)
> + tl_max_llcs++;
> +
> + /*
> + * llc_id space should never grow larger than the
> + * possible number of CPUs in the system.
> + */
> + if (!unlikely(WARN_ON_ONCE(lid >= nr_cpumask_bits)))
> + cpumask_set_cpu(lid, sched_domains_llc_id_allocmask);
> + return lid;
> +}
> +
> +static void __sched_domains_free_llc_id(int cpu)
> +{
> + int i, lid;
> +
> + lockdep_assert_held(&sched_domains_mutex);
> +
> + lid = per_cpu(sd_llc_id, cpu);
> + if (lid == -1)
> + return;
> +
> + per_cpu(sd_llc_id, cpu) = -1;
> +
> + for_each_online_cpu(i) {
> + /* An online CPU owns the llc_id. */
> + if (per_cpu(sd_llc_id, i) == lid)
> + return;
> + }
> +
> + cpumask_clear_cpu(lid, sched_domains_llc_id_allocmask);
> +}
> +
> +void sched_domains_free_llc_id(int cpu)
> +{
> + sched_domains_mutex_lock();
> + __sched_domains_free_llc_id(cpu);
> + sched_domains_mutex_unlock();
> +}
> +
> /*
> * Build sched domains for a given set of CPUs and attach the sched domains
> * to the individual CPUs
> @@ -2599,7 +2647,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
> * have an LLC.
> */
> if (!tl_llc) {
> - per_cpu(sd_llc_id, i) = tl_max_llcs++;
> + per_cpu(sd_llc_id, i) = __sched_domains_alloc_llc_id();
>
> continue;
> }
> @@ -2620,7 +2668,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
>
> /* a new LLC is detected */
> if (lid == -1)
> - per_cpu(sd_llc_id, i) = tl_max_llcs++;
> + per_cpu(sd_llc_id, i) = __sched_domains_alloc_llc_id();
> }
> }
>
> @@ -2798,6 +2846,7 @@ int __init sched_init_domains(const struct cpumask *cpu_map)
> {
> int err;
>
> + zalloc_cpumask_var(&sched_domains_llc_id_allocmask, GFP_KERNEL);
> zalloc_cpumask_var(&sched_domains_tmpmask, GFP_KERNEL);
> zalloc_cpumask_var(&sched_domains_tmpmask2, GFP_KERNEL);
> zalloc_cpumask_var(&fallback_doms, GFP_KERNEL);
> ---
>
> It doesn't compact tl_max_llcs, but it should promote reuse of llc_id if
> all CPUs of a LLC go offline. I know it is a ridiculous scenario but it
> is possible nonetheless.
>
> I'll let Peter and Valentin be the judge of additional space and
> complexity needed for these bits :-)
>
Smart approach! Dynamically reallocating the llc_id should be feasible,
as it releases the llc_id when the last CPU of that LLC is offlined. My
only concern is data synchronization issues arising from the reuse of
llc_id during load balancing - I’ll audit the logic to check for any race
conditions. Alternatively, what if we introduce a tl->static_mask? It would
be similar to tl->mask, but would not remove CPUs from static_mask when
they are offlined. This way, we can always find and reuse the llc_id of CPUs in
that LLC (even if all CPUs in the LLC have been offlined at some point,
provided they were once online), and we would thus maintain a static llc_id.
Anyway, let me do some testing on your proposal as well as the static_mask idea,
and I'll reply to this thread later. Thanks for the insights!
thanks,
Chenyu
Hello Chenyu,
On 2/17/2026 11:37 AM, Chen, Yu C wrote:
> Hi Prateek,
>
> On 2/16/2026 3:44 PM, K Prateek Nayak wrote:
>> Hello Tim, Chenyu,
>>
>> On 2/11/2026 3:48 AM, Tim Chen wrote:
>>> From: Chen Yu <yu.c.chen@intel.com>
>>>
>>> Introduce an index mapping between CPUs and their LLCs. This provides
>>> a continuous per LLC index needed for cache-aware load balancing in
>>> later patches.
>>>
>>> The existing per_cpu llc_id usually points to the first CPU of the
>>> LLC domain, which is sparse and unsuitable as an array index. Using
>>> llc_id directly would waste memory.
>>>
>>> With the new mapping, CPUs in the same LLC share a continuous id:
>>>
>>> per_cpu(llc_id, CPU=0...15) = 0
>>> per_cpu(llc_id, CPU=16...31) = 1
>>> per_cpu(llc_id, CPU=32...47) = 2
>>> ...
>>>
>>> Once a CPU has been assigned an llc_id, this ID persists even when
>>> the CPU is taken offline and brought back online, which can facilitate
>>> the management of the ID.
>>>
>>> Co-developed-by: Tim Chen <tim.c.chen@linux.intel.com>
>>> Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
>>> Co-developed-by: K Prateek Nayak <kprateek.nayak@amd.com>
>>> Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com>
>>> Signed-off-by: Chen Yu <yu.c.chen@intel.com>
>>> ---
>>>
>>> Notes:
>>> v2->v3:
>>> Allocate the LLC id according to the topology level data directly, rather
>>> than calculating from the sched domain. This simplifies the code.
>>> (Peter Zijlstra, K Prateek Nayak)
>>>
>>> kernel/sched/topology.c | 47 ++++++++++++++++++++++++++++++++++++++---
>>> 1 file changed, 44 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
>>> index cf643a5ddedd..ca46b5cf7f78 100644
>>> --- a/kernel/sched/topology.c
>>> +++ b/kernel/sched/topology.c
>>> @@ -20,6 +20,7 @@ void sched_domains_mutex_unlock(void)
>>> /* Protected by sched_domains_mutex: */
>>> static cpumask_var_t sched_domains_tmpmask;
>>> static cpumask_var_t sched_domains_tmpmask2;
>>> +static int tl_max_llcs;
>>> static int __init sched_debug_setup(char *str)
>>> {
>>> @@ -658,7 +659,7 @@ static void destroy_sched_domains(struct sched_domain *sd)
>>> */
>>> DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
>>> DEFINE_PER_CPU(int, sd_llc_size);
>>> -DEFINE_PER_CPU(int, sd_llc_id);
>>> +DEFINE_PER_CPU(int, sd_llc_id) = -1;
>>> DEFINE_PER_CPU(int, sd_share_id);
>>> DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
>>> DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
>>> @@ -684,7 +685,6 @@ static void update_top_cache_domain(int cpu)
>>> rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
>>> per_cpu(sd_llc_size, cpu) = size;
>>> - per_cpu(sd_llc_id, cpu) = id;
>>> rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
>>> sd = lowest_flag_domain(cpu, SD_CLUSTER);
>>> @@ -2567,10 +2567,18 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
>>> /* Set up domains for CPUs specified by the cpu_map: */
>>> for_each_cpu(i, cpu_map) {
>>> - struct sched_domain_topology_level *tl;
>>> + struct sched_domain_topology_level *tl, *tl_llc = NULL;
>>> + int lid;
>>> sd = NULL;
>>> for_each_sd_topology(tl) {
>>> + int flags = 0;
>>> +
>>> + if (tl->sd_flags)
>>> + flags = (*tl->sd_flags)();
>>> +
>>> + if (flags & SD_SHARE_LLC)
>>> + tl_llc = tl;
>>
>> nit. This loop breaks out when sched_domain_span(sd) covers the entire
>> cpu_map and it might have not reached the topmost SD_SHARE_LLC domain
>> yet. Is that cause for any concern?
>>
>
> Could you please elaborate a little more on this? If it covers the
> entire cpu_map shouldn't it stop going up to its parent domain?
> Do you mean, sd_llc_1 and its parent sd_llc_2 could cover the same cpu_map,
> and we should let tl_llc to assigned to sd_llc_2 (sd_llc_1 be degenerated? )
I'm not sure if this is technically possible but assume following
topology:
[ LLC: 8-15 ]
[ SMT: 8,9 ][ SMT: 10,11 ] ... [ SMT: 14,15 ]
and the following series of events:
o All CPUs in LLC are offline to begin with (maxcpus = 1 like scenario).
o CPUs 10-15 are onlined first.
o CPU8 is put in a separate root partition and brought online.
(XXX: I'm not 100% sure if this is possible in this order)
o build_sched_domains() will bail out at SMT domain since the cpumap
is covered by tl->mask() and tl_llc = tl_smt.
o llc_id calculation uses the tl_smt->mask() which will not contain
CPUs 10-15 and CPU8 will get a unique LLC id even though there are
other online CPUs in the LLC with a different llc_id (!!!)
Instead, if we traversed to tl_mc, we would have seen all the online
CPUs in the MC and reused the llc_id from them. Might not be an issue on
its own but if this root partition is removed later, CPU8 will continue
to have the unique llc_id even after merging into the same MC domain.
[..snip..]
>>
>> It doesn't compact tl_max_llcs, but it should promote reuse of llc_id if
>> all CPUs of a LLC go offline. I know it is a ridiculous scenario but it
>> is possible nonetheless.
>>
>> I'll let Peter and Valentin be the judge of additional space and
>> complexity needed for these bits :-)
>>
>
> Smart approach! Dynamically reallocating the llc_id should be feasible,
> as it releases the llc_id when the last CPU of that LLC is offlined. My
> only concern is data synchronization issues arising from the reuse of
> llc_id during load balancing - I’ll audit the logic to check for any race
> conditions. Alternatively, what if we introduce a tl->static_mask? It would
> be similar to tl->mask, but would not remove CPUs from static_mask when they
> are offlined. This way, we can always find and reuse the llc_id of CPUs in
> that LLC (even if all CPUs in the LLC have been offlined at some point,
> provided they were once online), and we would thus maintain a static llc_id.
That is possible but it would require a larger arch/ wide audit to add
support for. Might be less complex to handle in the generic layer but
again I'll let Peter and Valentin comment on this part :-)
>
> Anyway, let do some testings on your proposal as well as static_mask things,
> and I'll reply to this thread later. Thanks for the insights!
Thanks a ton! Much appreciated.
--
Thanks and Regards,
Prateek
On Tue, Feb 17, 2026 at 01:39:45PM +0530, K Prateek Nayak wrote: > I'm not sure if this is technically possible but assume following > topology: > > [ LLC: 8-15 ] > [ SMT: 8,9 ][ SMT: 10,11 ] ... [ SMT: 14,15 ] > > and the following series of events: > > o All CPUs in LLC are offline to begin with (maxcpus = 1 like scenario). > > o CPUs 10-15 are onlined first. > > o CPU8 is put in a separate root partition and brought online. > (XXX: I'm not 100% sure if this is possible in this order) > > o build_sched_domains() will bail out at SMT domain since the cpumap > is covered by tl->mask() and tl_llc = tl_smt. > > o llc_id calculation uses the tl_smt->mask() which will not contain > CPUs 10-15 and CPU8 will get a unique LLC id even though there are > other online CPUs in the LLC with a different llc_id (!!!) Yeah, so partitions (including isol_cpus) could wreck things here, since this is purely about the sched_domains. You can create N single CPU partitions (isol_cpus does this) and end up with the same 'problem' that online one at a time loop did. Except this time it would not be 'wrong'. Since they are single CPU domains, you also don't get load-balancing, so who cares I suppose. But it will inflate max_lid. But suppose you create N/2 partitions (where N is the number of CPUs in the physical LLC), then you get many individual 'LLC's and load-balancing inside them. I suppose this is correct, although it does inflate max_lid somewhat beyond what you would normally expect. However, most of that space would be wasted, since you're not actually allowed to migrate to them.
On 2/19/2026 11:48 PM, Peter Zijlstra wrote: > On Tue, Feb 17, 2026 at 01:39:45PM +0530, K Prateek Nayak wrote: >> I'm not sure if this is technically possible but assume following >> topology: >> >> [ LLC: 8-15 ] >> [ SMT: 8,9 ][ SMT: 10,11 ] ... [ SMT: 14,15 ] >> >> and the following series of events: >> >> o All CPUs in LLC are offline to begin with (maxcpus = 1 like scenario). >> >> o CPUs 10-15 are onlined first. >> >> o CPU8 is put in a separate root partition and brought online. >> (XXX: I'm not 100% sure if this is possible in this order) >> >> o build_sched_domains() will bail out at SMT domain since the cpumap >> is covered by tl->mask() and tl_llc = tl_smt. >> >> o llc_id calculation uses the tl_smt->mask() which will not contain >> CPUs 10-15 and CPU8 will get a unique LLC id even though there are >> other online CPUs in the LLC with a different llc_id (!!!) > > Yeah, so partitions (including isol_cpus) could wreck things here, since > this is purely about the sched_domains. > > You can create N single CPU partitions (isol_cpus does this) and end up > with the same 'problem' that online one at a time loop did. Except this > time it would not be 'wrong'. Since they are single CPU domains, you > also don't get load-balancing, so who cares I suppose. But it will > inflate max_lid. > > But suppose you create N/2 partitions (where N is the number of CPUs in > the physical LLC), then you get many individual 'LLC's and > load-balancing inside them. I suppose this is correct, although it does > inflate max_lid somewhat beyond what you would normally expect. > > However, most of that space would be wasted, since you're not actually > allowed to migrate to them. > Besides wasting space, after removing CPUs from all N/2 partitions and merging them into the root partition, each CPU would still have a distinct llc_id from the other CPUs in the same LLC domain, because we do not reassign llc_id values to CPUs in current version. 
This issue should be resolved by switching to the new dynamic llc_id allocation and release method. thanks, Chenyu
On Tue, 2026-02-17 at 13:39 +0530, K Prateek Nayak wrote:
> Hello Chenyu,
>
>
[...snip...]
> > > > */
> > > > DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
> > > > DEFINE_PER_CPU(int, sd_llc_size);
> > > > -DEFINE_PER_CPU(int, sd_llc_id);
> > > > +DEFINE_PER_CPU(int, sd_llc_id) = -1;
> > > > DEFINE_PER_CPU(int, sd_share_id);
> > > > DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
> > > > DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
> > > > @@ -684,7 +685,6 @@ static void update_top_cache_domain(int cpu)
> > > > rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
> > > > per_cpu(sd_llc_size, cpu) = size;
> > > > - per_cpu(sd_llc_id, cpu) = id;
> > > > rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
> > > > sd = lowest_flag_domain(cpu, SD_CLUSTER);
> > > > @@ -2567,10 +2567,18 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
> > > > /* Set up domains for CPUs specified by the cpu_map: */
> > > > for_each_cpu(i, cpu_map) {
> > > > - struct sched_domain_topology_level *tl;
> > > > + struct sched_domain_topology_level *tl, *tl_llc = NULL;
> > > > + int lid;
> > > > sd = NULL;
> > > > for_each_sd_topology(tl) {
> > > > + int flags = 0;
> > > > +
> > > > + if (tl->sd_flags)
> > > > + flags = (*tl->sd_flags)();
> > > > +
> > > > + if (flags & SD_SHARE_LLC)
> > > > + tl_llc = tl;
> > >
> > > nit. This loop breaks out when sched_domain_span(sd) covers the entire
> > > cpu_map and it might have not reached the topmost SD_SHARE_LLC domain
> > > yet. Is that cause for any concern?
> > >
> >
> > Could you please elaborate a little more on this? If it covers the
> > entire cpu_map shouldn't it stop going up to its parent domain?
> > Do you mean, sd_llc_1 and its parent sd_llc_2 could cover the same cpu_map,
> > and we should let tl_llc to assigned to sd_llc_2 (sd_llc_1 be degenerated? )
>
> I'm not sure if this is technically possible but assume following
> topology:
>
> [ LLC: 8-15 ]
> [ SMT: 8,9 ][ SMT: 10,11 ] ... [ SMT: 14,15 ]
>
> and the following series of events:
>
> o All CPUs in LLC are offline to begin with (maxcpus = 1 like scenario).
>
> o CPUs 10-15 are onlined first.
>
> o CPU8 is put in a separate root partition and brought online.
> (XXX: I'm not 100% sure if this is possible in this order)
>
> o build_sched_domains() will bail out at SMT domain since the cpumap
> is covered by tl->mask() and tl_llc = tl_smt.
>
> o llc_id calculation uses the tl_smt->mask() which will not contain
> CPUs 10-15 and CPU8 will get a unique LLC id even though there are
> other online CPUs in the LLC with a different llc_id (!!!)
>
>
> Instead, if we traversed to tl_mc, we would have seen all the online
> CPUs in the MC and reused the llc_id from them. Might not be an issue on
> its own but if this root partition is removed later, CPU8 will continue
> to have the unique llc_id even after merging into the same MC domain.
There is really no reason to reuse the llc_id as far as cache aware scheduling
goes in its v3 revision (see my reply to Madadi on this patch).
I am thinking that if we just simply rebuild LLC id across sched domain
rebuilds, that is probably the cleanest solution. There could be some races
in cpus_share_cache() as llc_id gets reassigned for some CPUs when they
come online/offline. But we also have similar races in the current mainline code.
Worst it can do is some temporary sub-optimal scheduling task placement.
Thoughts?
Tim
>
> [..snip..]
>
> > >
> > > It doesn't compact tl_max_llcs, but it should promote reuse of llc_id if
> > > all CPUs of a LLC go offline. I know it is a ridiculous scenario but it
> > > is possible nonetheless.
> > >
> > > I'll let Peter and Valentin be the judge of additional space and
> > > complexity needed for these bits :-)
> > >
> >
> > Smart approach! Dynamically reallocating the llc_id should be feasible,
> > as it releases the llc_id when the last CPU of that LLC is offlined. My
> > only concern is data synchronization issues arising from the reuse of
> > llc_id during load balancing - I’ll audit the logic to check for any race
> > conditions. Alternatively, what if we introduce a tl->static_mask? It would
> > be similar to tl->mask, but would not remove CPUs from static_mask when they
> > are offlined. This way, we can always find and reuse the llc_id of CPUs in
> > that LLC (even if all CPUs in the LLC have been offlined at some point,
> > provided they were once online), and we would thus maintain a static llc_id.
>
> That is possible but it would require a larger arch/ wide audit to add
> support for. Might be less complex to handle in the generic layer but
> again I'll let Peter and Valentin comment on this part :-)
>
> >
> > Anyway, let do some testings on your proposal as well as static_mask things,
> > and I'll reply to this thread later. Thanks for the insights!
>
> Thanks a ton! Much appreciated.
Hello Tim,
On 2/18/2026 4:42 AM, Tim Chen wrote:
> On Tue, 2026-02-17 at 13:39 +0530, K Prateek Nayak wrote:
>> Hello Chenyu,
>>
>>
>
> [...snip...]
>
>
>>>>> */
>>>>> DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
>>>>> DEFINE_PER_CPU(int, sd_llc_size);
>>>>> -DEFINE_PER_CPU(int, sd_llc_id);
>>>>> +DEFINE_PER_CPU(int, sd_llc_id) = -1;
>>>>> DEFINE_PER_CPU(int, sd_share_id);
>>>>> DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
>>>>> DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
>>>>> @@ -684,7 +685,6 @@ static void update_top_cache_domain(int cpu)
>>>>> rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
>>>>> per_cpu(sd_llc_size, cpu) = size;
>>>>> - per_cpu(sd_llc_id, cpu) = id;
>>>>> rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
>>>>> sd = lowest_flag_domain(cpu, SD_CLUSTER);
>>>>> @@ -2567,10 +2567,18 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
>>>>> /* Set up domains for CPUs specified by the cpu_map: */
>>>>> for_each_cpu(i, cpu_map) {
>>>>> - struct sched_domain_topology_level *tl;
>>>>> + struct sched_domain_topology_level *tl, *tl_llc = NULL;
>>>>> + int lid;
>>>>> sd = NULL;
>>>>> for_each_sd_topology(tl) {
>>>>> + int flags = 0;
>>>>> +
>>>>> + if (tl->sd_flags)
>>>>> + flags = (*tl->sd_flags)();
>>>>> +
>>>>> + if (flags & SD_SHARE_LLC)
>>>>> + tl_llc = tl;
>>>>
>>>> nit. This loop breaks out when sched_domain_span(sd) covers the entire
>>>> cpu_map and it might have not reached the topmost SD_SHARE_LLC domain
>>>> yet. Is that cause for any concern?
>>>>
>>>
>>> Could you please elaborate a little more on this? If it covers the
>>> entire cpu_map shouldn't it stop going up to its parent domain?
>>> Do you mean, sd_llc_1 and its parent sd_llc_2 could cover the same cpu_map,
>>> and we should let tl_llc to assigned to sd_llc_2 (sd_llc_1 be degenerated? )
>>
>> I'm not sure if this is technically possible but assume following
>> topology:
>>
>> [ LLC: 8-15 ]
>> [ SMT: 8,9 ][ SMT: 10,11 ] ... [ SMT: 14,15 ]
>>
>> and the following series of events:
>>
>> o All CPUs in LLC are offline to begin with (maxcpus = 1 like scenario).
>>
>> o CPUs 10-15 are onlined first.
>>
>> o CPU8 is put in a separate root partition and brought online.
>> (XXX: I'm not 100% sure if this is possible in this order)
>>
>> o build_sched_domains() will bail out at SMT domain since the cpumap
>> is covered by tl->mask() and tl_llc = tl_smt.
>>
>> o llc_id calculation uses the tl_smt->mask() which will not contain
>> CPUs 10-15 and CPU8 will get a unique LLC id even though there are
>> other online CPUs in the LLC with a different llc_id (!!!)
>>
>>
>> Instead, if we traversed to tl_mc, we would have seen all the online
>> CPUs in the MC and reused the llc_id from them. Might not be an issue on
>> its own but if this root partition is removed later, CPU8 will continue
>> to have the unique llc_id even after merging into the same MC domain.
>
> There is really no reason to reuse the llc_id as far as cache aware scheduling
> goes in its v3 revision (see my reply to Madadi on this patch).
Even I don't mind having some holes in the llc_id space when CPUs are
offlined, but my major concern would be seeing an inconsistent state
where CPUs in the same MC domain end up with different llc_id values
after a bunch of hotplug activity.
>
> I am thinking that if we just simply rebuild LLC id across sched domain
> rebuilds, that is probably the cleanest solution. There could be some races
> in cpus_share_cache() as llc_id gets reassigned for some CPUs when they
> come online/offline. But we also having similar races in current mainline code.
> Worst it can do is some temporary sub-optimal scheduling task placement.
>
> Thoughts?
If you are suggesting populating the sd_llc_id for all the CPUs on
topology rebuild, I'm not entirely against the idea.
On a separate note, if we add a dependency on SCHED_MC for SCHED_CACHE,
we can simply look at cpu_coregroup_mask() and either allocate a new
llc_id / borrow llc id in sched_cpu_activate() when CPU is onlined or
reassign them in sched_cpu_deactivate() if an entire LLC is offlined.
--
Thanks and Regards,
Prateek
On Wed, 2026-02-18 at 08:58 +0530, K Prateek Nayak wrote:
> Hello Tim,
>
> On 2/18/2026 4:42 AM, Tim Chen wrote:
> > On Tue, 2026-02-17 at 13:39 +0530, K Prateek Nayak wrote:
> > > Hello Chenyu,
> > >
> > >
> >
> > [...snip...]
> >
> >
> > > > > > */
> > > > > > DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
> > > > > > DEFINE_PER_CPU(int, sd_llc_size);
> > > > > > -DEFINE_PER_CPU(int, sd_llc_id);
> > > > > > +DEFINE_PER_CPU(int, sd_llc_id) = -1;
> > > > > > DEFINE_PER_CPU(int, sd_share_id);
> > > > > > DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
> > > > > > DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
> > > > > > @@ -684,7 +685,6 @@ static void update_top_cache_domain(int cpu)
> > > > > > rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
> > > > > > per_cpu(sd_llc_size, cpu) = size;
> > > > > > - per_cpu(sd_llc_id, cpu) = id;
> > > > > > rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
> > > > > > sd = lowest_flag_domain(cpu, SD_CLUSTER);
> > > > > > @@ -2567,10 +2567,18 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
> > > > > > /* Set up domains for CPUs specified by the cpu_map: */
> > > > > > for_each_cpu(i, cpu_map) {
> > > > > > - struct sched_domain_topology_level *tl;
> > > > > > + struct sched_domain_topology_level *tl, *tl_llc = NULL;
> > > > > > + int lid;
> > > > > > sd = NULL;
> > > > > > for_each_sd_topology(tl) {
> > > > > > + int flags = 0;
> > > > > > +
> > > > > > + if (tl->sd_flags)
> > > > > > + flags = (*tl->sd_flags)();
> > > > > > +
> > > > > > + if (flags & SD_SHARE_LLC)
> > > > > > + tl_llc = tl;
> > > > >
> > > > > nit. This loop breaks out when sched_domain_span(sd) covers the entire
> > > > > cpu_map and it might have not reached the topmost SD_SHARE_LLC domain
> > > > > yet. Is that cause for any concern?
> > > > >
> > > >
> > > > Could you please elaborate a little more on this? If it covers the
> > > > entire cpu_map shouldn't it stop going up to its parent domain?
> > > > Do you mean, sd_llc_1 and its parent sd_llc_2 could cover the same cpu_map,
> > > > and we should let tl_llc to assigned to sd_llc_2 (sd_llc_1 be degenerated? )
> > >
> > > I'm not sure if this is technically possible but assume following
> > > topology:
> > >
> > > [ LLC: 8-15 ]
> > > [ SMT: 8,9 ][ SMT: 10,11 ] ... [ SMT: 14,15 ]
> > >
> > > and the following series of events:
> > >
> > > o All CPUs in LLC are offline to begin with (maxcpus = 1 like scenario).
> > >
> > > o CPUs 10-15 are onlined first.
> > >
> > > o CPU8 is put in a separate root partition and brought online.
> > > (XXX: I'm not 100% sure if this is possible in this order)
> > >
> > > o build_sched_domains() will bail out at SMT domain since the cpumap
> > > is covered by tl->mask() and tl_llc = tl_smt.
> > >
> > > o llc_id calculation uses the tl_smt->mask() which will not contain
> > > CPUs 10-15 and CPU8 will get a unique LLC id even though there are
> > > other online CPUs in the LLC with a different llc_id (!!!)
> > >
> > >
> > > Instead, if we traversed to tl_mc, we would have seen all the online
> > > CPUs in the MC and reused the llc_id from them. Might not be an issue on
> > > its own but if this root partition is removed later, CPU8 will continue
> > > to have the unique llc_id even after merging into the same MC domain.
> >
> > There is really no reason to reuse the llc_id as far as cache aware scheduling
> > goes in its v3 revision (see my reply to Madadi on this patch).
>
> Even I don't mind having some holes in the llc_id space when CPUs are
> offlined but my major concern would be seeing an inconsistent state
> where CPUs in same MC domains end up with different llc_id when after
> a bunch of hotplug activity.
>
> >
> > I am thinking that if we just simply rebuild LLC id across sched domain
> > rebuilds, that is probably the cleanest solution. There could be some races
> > in cpus_share_cache() as llc_id gets reassigned for some CPUs when they
> > come online/offline. But we also having similar races in current mainline code.
> > Worst it can do is some temporary sub-optimal scheduling task placement.
> >
> > Thoughts?
>
> If you are suggesting populating the sd_llc_id for all the CPUs on
> topology rebuild, I'm not entirely against the idea.
>
> On a separate note, if we add a dependency on SCHED_MC for SCHED_CACHE,
> we can simply look at cpu_coregroup_mask() and either allocate a new
> llc_id / borrow llc id in sched_cpu_activate() when CPU is onlined or
> reassign them in sched_cpu_deactivate() if an entire LLC is offlined.
I also think that cpu_coregroup_mask() is a better choice than
tl->mask for getting the mask of CPUs in LLC.
Okay, we'll consider an implementation along the lines of your suggestion of
__sched_domains_alloc_llc_id() to reuse llc id when all CPUs
in LLC deactivate. That will minimize holes in LLC ids while
avoiding races in cpus_share_cache().
Thanks.
Tim
On 2/18/2026 11:28 AM, K Prateek Nayak wrote:
> Hello Tim,
>
> On 2/18/2026 4:42 AM, Tim Chen wrote:
>> On Tue, 2026-02-17 at 13:39 +0530, K Prateek Nayak wrote:
>>> Hello Chenyu,
>>>
>>>
>>
>> [...snip...]
>>
>>
>>>>>> */
>>>>>> DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
>>>>>> DEFINE_PER_CPU(int, sd_llc_size);
>>>>>> -DEFINE_PER_CPU(int, sd_llc_id);
>>>>>> +DEFINE_PER_CPU(int, sd_llc_id) = -1;
>>>>>> DEFINE_PER_CPU(int, sd_share_id);
>>>>>> DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
>>>>>> DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
>>>>>> @@ -684,7 +685,6 @@ static void update_top_cache_domain(int cpu)
>>>>>> rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
>>>>>> per_cpu(sd_llc_size, cpu) = size;
>>>>>> - per_cpu(sd_llc_id, cpu) = id;
>>>>>> rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
>>>>>> sd = lowest_flag_domain(cpu, SD_CLUSTER);
>>>>>> @@ -2567,10 +2567,18 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
>>>>>> /* Set up domains for CPUs specified by the cpu_map: */
>>>>>> for_each_cpu(i, cpu_map) {
>>>>>> - struct sched_domain_topology_level *tl;
>>>>>> + struct sched_domain_topology_level *tl, *tl_llc = NULL;
>>>>>> + int lid;
>>>>>> sd = NULL;
>>>>>> for_each_sd_topology(tl) {
>>>>>> + int flags = 0;
>>>>>> +
>>>>>> + if (tl->sd_flags)
>>>>>> + flags = (*tl->sd_flags)();
>>>>>> +
>>>>>> + if (flags & SD_SHARE_LLC)
>>>>>> + tl_llc = tl;
>>>>>
>>>>> nit. This loop breaks out when sched_domain_span(sd) covers the entire
>>>>> cpu_map and it might have not reached the topmost SD_SHARE_LLC domain
>>>>> yet. Is that cause for any concern?
>>>>>
>>>>
>>>> Could you please elaborate a little more on this? If it covers the
>>>> entire cpu_map shouldn't it stop going up to its parent domain?
>>>> Do you mean, sd_llc_1 and its parent sd_llc_2 could cover the same cpu_map,
>>>> and we should let tl_llc to assigned to sd_llc_2 (sd_llc_1 be degenerated? )
>>>
>>> I'm not sure if this is technically possible but assume following
>>> topology:
>>>
>>> [ LLC: 8-15 ]
>>> [ SMT: 8,9 ][ SMT: 10,11 ] ... [ SMT: 14,15 ]
>>>
>>> and the following series of events:
>>>
>>> o All CPUs in LLC are offline to begin with (maxcpus = 1 like scenario).
>>>
>>> o CPUs 10-15 are onlined first.
>>>
>>> o CPU8 is put in a separate root partition and brought online.
>>> (XXX: I'm not 100% sure if this is possible in this order)
>>>
>>> o build_sched_domains() will bail out at SMT domain since the cpumap
>>> is covered by tl->mask() and tl_llc = tl_smt.
>>>
>>> o llc_id calculation uses the tl_smt->mask() which will not contain
>>> CPUs 10-15 and CPU8 will get a unique LLC id even though there are
>>> other online CPUs in the LLC with a different llc_id (!!!)
>>>
>>>
>>> Instead, if we traversed to tl_mc, we would have seen all the online
>>> CPUs in the MC and reused the llc_id from them. Might not be an issue on
>>> its own but if this root partition is removed later, CPU8 will continue
>>> to have the unique llc_id even after merging into the same MC domain.
>>
>> There is really no reason to reuse the llc_id as far as cache aware scheduling
>> goes in its v3 revision (see my reply to Madadi on this patch).
>
> Even I don't mind having some holes in the llc_id space when CPUs are
> offlined but my major concern would be seeing an inconsistent state
> where CPUs in same MC domains end up with different llc_id when after
> a bunch of hotplug activity.
>
>>
>> I am thinking that if we just simply rebuild LLC id across sched domain
>> rebuilds, that is probably the cleanest solution.
Tim, do you mean reset all CPUs' LLC id to -1 whenever there is hotplug
event in partition_sched_domains_locked(), and rebuild them from scratch
in build_sched_domains(), so we already refresh the LLC id for every
CPU(I discussed with Vineeth here:
https://lore.kernel.org/all/54e60704-b0f3-44df-9b83-070806b5a00c@intel.com/)
>> There could be some races
>> in cpus_share_cache() as llc_id gets reassigned for some CPUs when they
>> come online/offline. But we also having similar races in current mainline code.
>> Worst it can do is some temporary sub-optimal scheduling task placement.
>>
>> Thoughts?
>
> If you are suggesting populating the sd_llc_id for all the CPUs on
> topology rebuild, I'm not entirely against the idea.
>
> On a separate note, if we add a dependency on SCHED_MC for SCHED_CACHE,
> we can simply look at cpu_coregroup_mask() and either allocate a new
> llc_id / borrow llc id in sched_cpu_activate() when CPU is onlined or
> reassign them in sched_cpu_deactivate() if an entire LLC is offlined.
>
Prateek, may I know if you are thinking of updating every CPU's LLC id
during its hotplug and not update all percpu LLC id in
build_sched_domains()?
thanks,
Chenyu
On Wed, 2026-02-18 at 23:22 +0800, Chen, Yu C wrote:
> On 2/18/2026 11:28 AM, K Prateek Nayak wrote:
> > Hello Tim,
> >
> > On 2/18/2026 4:42 AM, Tim Chen wrote:
> > > On Tue, 2026-02-17 at 13:39 +0530, K Prateek Nayak wrote:
> > > > Hello Chenyu,
> > > >
> > > >
> > >
> > > [...snip...]
> > >
> > >
> > > > > > > */
> > > > > > > DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
> > > > > > > DEFINE_PER_CPU(int, sd_llc_size);
> > > > > > > -DEFINE_PER_CPU(int, sd_llc_id);
> > > > > > > +DEFINE_PER_CPU(int, sd_llc_id) = -1;
> > > > > > > DEFINE_PER_CPU(int, sd_share_id);
> > > > > > > DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
> > > > > > > DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
> > > > > > > @@ -684,7 +685,6 @@ static void update_top_cache_domain(int cpu)
> > > > > > > rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
> > > > > > > per_cpu(sd_llc_size, cpu) = size;
> > > > > > > - per_cpu(sd_llc_id, cpu) = id;
> > > > > > > rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
> > > > > > > sd = lowest_flag_domain(cpu, SD_CLUSTER);
> > > > > > > @@ -2567,10 +2567,18 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
> > > > > > > /* Set up domains for CPUs specified by the cpu_map: */
> > > > > > > for_each_cpu(i, cpu_map) {
> > > > > > > - struct sched_domain_topology_level *tl;
> > > > > > > + struct sched_domain_topology_level *tl, *tl_llc = NULL;
> > > > > > > + int lid;
> > > > > > > sd = NULL;
> > > > > > > for_each_sd_topology(tl) {
> > > > > > > + int flags = 0;
> > > > > > > +
> > > > > > > + if (tl->sd_flags)
> > > > > > > + flags = (*tl->sd_flags)();
> > > > > > > +
> > > > > > > + if (flags & SD_SHARE_LLC)
> > > > > > > + tl_llc = tl;
> > > > > >
> > > > > > nit. This loop breaks out when sched_domain_span(sd) covers the entire
> > > > > > cpu_map and it might have not reached the topmost SD_SHARE_LLC domain
> > > > > > yet. Is that cause for any concern?
> > > > > >
> > > > >
> > > > > Could you please elaborate a little more on this? If it covers the
> > > > > entire cpu_map shouldn't it stop going up to its parent domain?
> > > > > Do you mean, sd_llc_1 and its parent sd_llc_2 could cover the same cpu_map,
> > > > > and we should let tl_llc to assigned to sd_llc_2 (sd_llc_1 be degenerated? )
> > > >
> > > > I'm not sure if this is technically possible but assume following
> > > > topology:
> > > >
> > > > [ LLC: 8-15 ]
> > > > [ SMT: 8,9 ][ SMT: 10,11 ] ... [ SMT: 14,15 ]
> > > >
> > > > and the following series of events:
> > > >
> > > > o All CPUs in LLC are offline to begin with (maxcpus = 1 like scenario).
> > > >
> > > > o CPUs 10-15 are onlined first.
> > > >
> > > > o CPU8 is put in a separate root partition and brought online.
> > > > (XXX: I'm not 100% sure if this is possible in this order)
> > > >
> > > > o build_sched_domains() will bail out at SMT domain since the cpumap
> > > > is covered by tl->mask() and tl_llc = tl_smt.
> > > >
> > > > o llc_id calculation uses the tl_smt->mask() which will not contain
> > > > CPUs 10-15 and CPU8 will get a unique LLC id even though there are
> > > > other online CPUs in the LLC with a different llc_id (!!!)
> > > >
> > > >
> > > > Instead, if we traversed to tl_mc, we would have seen all the online
> > > > CPUs in the MC and reused the llc_id from them. Might not be an issue on
> > > > its own but if this root partition is removed later, CPU8 will continue
> > > > to have the unique llc_id even after merging into the same MC domain.
> > >
> > > There is really no reason to reuse the llc_id as far as cache aware scheduling
> > > goes in its v3 revision (see my reply to Madadi on this patch).
> >
> > Even I don't mind having some holes in the llc_id space when CPUs are
> > offlined but my major concern would be seeing an inconsistent state
> > where CPUs in same MC domains end up with different llc_id when after
> > a bunch of hotplug activity.
> >
> > >
> > > I am thinking that if we just simply rebuild LLC id across sched domain
> > > rebuilds, that is probably the cleanest solution.
>
> Tim, do you mean reset all CPUs' LLC id to -1 whenever there is hotplug
> event in partition_sched_domains_locked(), and rebuild them from scratch
> in build_sched_domains(), so we already refresh the LLC id for every
> CPU(I discussed with Vineeth here:
> https://lore.kernel.org/all/54e60704-b0f3-44df-9b83-070806b5a00c@intel.com/)
Yes, that's what I was thinking. However, there could be some races in
cpus_share_cache() with this approach.
Tim
>
>
> > > There could be some races
> > > in cpus_share_cache() as llc_id gets reassigned for some CPUs when they
> > > come online/offline. But we also having similar races in current mainline code.
> > > Worst it can do is some temporary sub-optimal scheduling task placement.
> > >
> > > Thoughts?
> >
> > If you are suggesting populating the sd_llc_id for all the CPUs on
> > topology rebuild, I'm not entirely against the idea.
> >
> > On a separate note, if we add a dependency on SCHED_MC for SCHED_CACHE,
> > we can simply look at cpu_coregroup_mask() and either allocate a new
> > llc_id / borrow llc id in sched_cpu_activate() when CPU is onlined or
> > reassign them in sched_cpu_deactivate() if an entire LLC is offlined.
> >
>
> Prateek, may I know if you are thinking of updating every CPU's LLC id
> during its hotplug and not update all percpu LLC id in
> build_sched_domains()?
>
> thanks,
> Chenyu
Hello Chenyu, On 2/18/2026 8:52 PM, Chen, Yu C wrote: >> On a separate note, if we add a dependency on SCHED_MC for SCHED_CACHE, >> we can simply look at cpu_coregroup_mask() and either allocate a new >> llc_id / borrow llc id in sched_cpu_activate() when CPU is onlined or >> reassign them in sched_cpu_deactivate() if an entire LLC is offlined. >> > > Prateek, may I know if you are thinking of updating every CPU's LLC id > during its hotplug and not update all percpu LLC id in build_sched_domains()? I was still thinking of build_sched_domains() (or somewhere in the online and offline path) where we can first simply look at cpu_coregroup_mask() and decide if we need to traverse all CPUs and shuffle the IDs. -- Thanks and Regards, Prateek
On Wed, 2026-02-18 at 23:16 +0530, K Prateek Nayak wrote:
> Hello Chenyu,
>
> On 2/18/2026 8:52 PM, Chen, Yu C wrote:
> > > On a separate note, if we add a dependency on SCHED_MC for SCHED_CACHE,
> > > we can simply look at cpu_coregroup_mask() and either allocate a new
> > > llc_id / borrow llc id in sched_cpu_activate() when CPU is onlined or
> > > reassign them in sched_cpu_deactivate() if an entire LLC is offlined.
> > >
> >
> > Prateek, may I know if you are thinking of updating every CPU's LLC id
> > during its hotplug and not update all percpu LLC id in build_sched_domains()?
>
> I was still thinking of build_sched_domains() (or somewhere in the
> online and offline path) where we can first simply look at
> cpu_coregroup_mask() and decide if we need to traverse all CPUs and
> shuffle the IDs.
Prateek,
How about modifying the patch like the following, stealing
a lot of your code. Also added change to shrink max LLCs when
the LLC with max id lost its last CPU.
Thanks.
Tim
---
diff --git a/init/Kconfig b/init/Kconfig
index 9848de949afa..4ddf54ab9cf7 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -987,6 +987,7 @@ config SCHED_CACHE
bool "Cache aware load balance"
default y
depends on SMP
+ depends on SCHED_MC
help
When enabled, the scheduler will attempt to aggregate tasks from
the same process onto a single Last Level Cache (LLC) domain when
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 48626c81ba8e..75ba4e0bfcd3 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8474,6 +8474,8 @@ int sched_cpu_deactivate(unsigned int cpu)
*/
synchronize_rcu();
+ sched_domains_free_llc_id(cpu);
+
sched_set_rq_offline(rq, cpu);
scx_rq_deactivate(rq);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 6cbc56e9adfc..04f42526e6f0 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3862,6 +3862,7 @@ static inline bool sched_cache_enabled(void)
extern void sched_cache_active_set_unlocked(void);
#endif
extern void init_sched_mm(struct task_struct *p);
+void sched_domains_free_llc_id(int cpu);
extern u64 avg_vruntime(struct cfs_rq *cfs_rq);
extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se);
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 580fb2fbc900..5e59340ad9a9 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -18,6 +18,7 @@ void sched_domains_mutex_unlock(void)
}
/* Protected by sched_domains_mutex: */
+static cpumask_var_t sched_domains_llc_id_allocmask;
static cpumask_var_t sched_domains_tmpmask;
static cpumask_var_t sched_domains_tmpmask2;
static int tl_max_llcs;
@@ -2590,6 +2591,57 @@ static bool topology_span_sane(const struct cpumask *cpu_map)
return true;
}
+static int __sched_domains_alloc_llc_id(void)
+{
+ int lid;
+
+ lockdep_assert_held(&sched_domains_mutex);
+
+ lid = cpumask_first_zero(sched_domains_llc_id_allocmask);
+ if (lid >= tl_max_llcs)
+ tl_max_llcs = lid + 1;
+
+ /*
+ * llc_id space should never grow larger than the
+ * possible number of CPUs in the system.
+ */
+ if (!unlikely(WARN_ON_ONCE(lid >= nr_cpumask_bits)))
+ cpumask_set_cpu(lid, sched_domains_llc_id_allocmask);
+ return lid;
+}
+
+static void __sched_domains_free_llc_id(int cpu)
+{
+ int i, lid;
+
+ lockdep_assert_held(&sched_domains_mutex);
+
+ lid = per_cpu(sd_llc_id, cpu);
+ if (lid == -1)
+ return;
+
+ per_cpu(sd_llc_id, cpu) = -1;
+
+ for_each_online_cpu(i) {
+ /* An online CPU owns the llc_id. */
+ if (per_cpu(sd_llc_id, i) == lid)
+ return;
+ }
+
+ cpumask_clear_cpu(lid, sched_domains_llc_id_allocmask);
+
+ /* shrink max LLC size to save memory */
+ if (lid == tl_max_llcs - 1)
+ lid = tl_max_llcs--;
+}
+
+void sched_domains_free_llc_id(int cpu)
+{
+ sched_domains_mutex_lock();
+ __sched_domains_free_llc_id(cpu);
+ sched_domains_mutex_unlock();
+}
+
/*
* Build sched domains for a given set of CPUs and attach the sched domains
* to the individual CPUs
@@ -2615,18 +2667,11 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
/* Set up domains for CPUs specified by the cpu_map: */
for_each_cpu(i, cpu_map) {
- struct sched_domain_topology_level *tl, *tl_llc = NULL;
+ struct sched_domain_topology_level *tl;
int lid;
sd = NULL;
for_each_sd_topology(tl) {
- int flags = 0;
-
- if (tl->sd_flags)
- flags = (*tl->sd_flags)();
-
- if (flags & SD_SHARE_LLC)
- tl_llc = tl;
sd = build_sched_domain(tl, cpu_map, attr, sd, i);
@@ -2642,18 +2687,19 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
if (lid == -1) {
int j;
+ j = cpumask_first(cpu_coregroup_mask(i));
/*
* Assign the llc_id to the CPUs that do not
* have an LLC.
*/
- if (!tl_llc) {
- per_cpu(sd_llc_id, i) = tl_max_llcs++;
+ if (j >= nr_cpu_ids) {
+ per_cpu(sd_llc_id, i) = __sched_domains_alloc_llc_id();
continue;
}
/* try to reuse the llc_id of its siblings */
- for_each_cpu(j, tl_llc->mask(tl_llc, i)) {
+ for (; j < nr_cpu_ids; j = cpumask_next(j, cpu_coregroup_mask(i))) {
if (i == j)
continue;
@@ -2668,7 +2714,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
/* a new LLC is detected */
if (lid == -1)
- per_cpu(sd_llc_id, i) = tl_max_llcs++;
+ per_cpu(sd_llc_id, i) = __sched_domains_alloc_llc_id();
}
}
@@ -2869,6 +2915,7 @@ int __init sched_init_domains(const struct cpumask *cpu_map)
{
int err;
+ zalloc_cpumask_var(&sched_domains_llc_id_allocmask, GFP_KERNEL);
zalloc_cpumask_var(&sched_domains_tmpmask, GFP_KERNEL);
zalloc_cpumask_var(&sched_domains_tmpmask2, GFP_KERNEL);
zalloc_cpumask_var(&fallback_doms, GFP_KERNEL);
On 2/19/2026 7:21 AM, Tim Chen wrote:
> On Wed, 2026-02-18 at 23:16 +0530, K Prateek Nayak wrote:
>> Hello Chenyu,
>>
>> On 2/18/2026 8:52 PM, Chen, Yu C wrote:
[ ... ]
>
> Prateek,
>
> How about modifying the patch like the following, stealing
> a lot of your code. Also added change to shrink max LLCs when
> the LLC with max id lost its last CPU.
>
> Thanks.
>
[ ... ]
> +
> +static void __sched_domains_free_llc_id(int cpu)
> +{
> + int i, lid;
> +
> + lockdep_assert_held(&sched_domains_mutex);
> +
> + lid = per_cpu(sd_llc_id, cpu);
> + if (lid == -1)
> + return;
> +
> + per_cpu(sd_llc_id, cpu) = -1;
> +
> + for_each_online_cpu(i) {
One minor question: should we only iterate through
cpu_coregroup_mask(cpu) to check if any sibling CPU
within this LLC owns the llc_id? If there are no online
CPUs within this LLC, I assume we should release this
llc_id.
thanks,
Chenyu
> + /* An online CPU owns the llc_id. */
> + if (per_cpu(sd_llc_id, i) == lid)
> + return;
> + }
> +
> + cpumask_clear_cpu(lid, sched_domains_llc_id_allocmask);
Hello Chenyu,
On 2/19/2026 4:55 PM, Chen, Yu C wrote:
>> +static void __sched_domains_free_llc_id(int cpu)
>> +{
>> + int i, lid;
>> +
>> + lockdep_assert_held(&sched_domains_mutex);
>> +
>> + lid = per_cpu(sd_llc_id, cpu);
>> + if (lid == -1)
>> + return;
>> +
>> + per_cpu(sd_llc_id, cpu) = -1;
>> +
>> + for_each_online_cpu(i) {
>
> One minor question: should we only iterate through
> cpu_coregroup_mask(cpu) to check if any sibling CPU
> within this LLC owns the llc_id? If there are no online
> CPUs within this LLC, I assume we should release this
> llc_id.
That should work too! I'm assuming the arch/ side
unlink happens before this in which case we can simply
check cpumask_empty(cpu_coregroup_mask(cpu)).
>
> thanks,
> Chenyu
>
>> + /* An online CPU owns the llc_id. */
>> + if (per_cpu(sd_llc_id, i) == lid)
>> + return;
>> + }
>> +
>> + cpumask_clear_cpu(lid, sched_domains_llc_id_allocmask);
>
--
Thanks and Regards,
Prateek
Hello Tim,
Thank you for the patch.
On 2/19/2026 4:51 AM, Tim Chen wrote:
> diff --git a/init/Kconfig b/init/Kconfig
> index 9848de949afa..4ddf54ab9cf7 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -987,6 +987,7 @@ config SCHED_CACHE
> bool "Cache aware load balance"
> default y
> depends on SMP
> + depends on SCHED_MC
> help
> When enabled, the scheduler will attempt to aggregate tasks from
> the same process onto a single Last Level Cache (LLC) domain when
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 48626c81ba8e..75ba4e0bfcd3 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -8474,6 +8474,8 @@ int sched_cpu_deactivate(unsigned int cpu)
> */
> synchronize_rcu();
>
> + sched_domains_free_llc_id(cpu);
> +
> sched_set_rq_offline(rq, cpu);
>
> scx_rq_deactivate(rq);
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 6cbc56e9adfc..04f42526e6f0 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -3862,6 +3862,7 @@ static inline bool sched_cache_enabled(void)
> extern void sched_cache_active_set_unlocked(void);
> #endif
> extern void init_sched_mm(struct task_struct *p);
> +void sched_domains_free_llc_id(int cpu);
>
> extern u64 avg_vruntime(struct cfs_rq *cfs_rq);
> extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se);
> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> index 580fb2fbc900..5e59340ad9a9 100644
> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c
> @@ -18,6 +18,7 @@ void sched_domains_mutex_unlock(void)
> }
>
> /* Protected by sched_domains_mutex: */
> +static cpumask_var_t sched_domains_llc_id_allocmask;
> static cpumask_var_t sched_domains_tmpmask;
> static cpumask_var_t sched_domains_tmpmask2;
> static int tl_max_llcs;
> @@ -2590,6 +2591,57 @@ static bool topology_span_sane(const struct cpumask *cpu_map)
> return true;
> }
>
> +static int __sched_domains_alloc_llc_id(void)
> +{
> + int lid;
> +
> + lockdep_assert_held(&sched_domains_mutex);
> +
> + lid = cpumask_first_zero(sched_domains_llc_id_allocmask);
> + if (lid >= tl_max_llcs)
> + tl_max_llcs = lid + 1;
> +
> + /*
> + * llc_id space should never grow larger than the
> + * possible number of CPUs in the system.
> + */
> + if (!unlikely(WARN_ON_ONCE(lid >= nr_cpumask_bits)))
> + cpumask_set_cpu(lid, sched_domains_llc_id_allocmask);
> + return lid;
> +}
> +
> +static void __sched_domains_free_llc_id(int cpu)
> +{
> + int i, lid;
> +
> + lockdep_assert_held(&sched_domains_mutex);
> +
> + lid = per_cpu(sd_llc_id, cpu);
> + if (lid == -1)
> + return;
> +
> + per_cpu(sd_llc_id, cpu) = -1;
> +
> + for_each_online_cpu(i) {
> + /* An online CPU owns the llc_id. */
> + if (per_cpu(sd_llc_id, i) == lid)
> + return;
> + }
We should perhaps warn and skip clearing lid from cpumask if lid was
found to be larger than "nr_cpumask_bits". Shouldn't happen but just
as a precaution.
> +
> + cpumask_clear_cpu(lid, sched_domains_llc_id_allocmask);
> +
> + /* shrink max LLC size to save memory */
> + if (lid == tl_max_llcs - 1)
> + lid = tl_max_llcs--;
No need to assign the local "lid" variable here; Simple decrement
should do.
> +}
> +
> +void sched_domains_free_llc_id(int cpu)
> +{
> + sched_domains_mutex_lock();
> + __sched_domains_free_llc_id(cpu);
> + sched_domains_mutex_unlock();
> +}
> +
> /*
> * Build sched domains for a given set of CPUs and attach the sched domains
> * to the individual CPUs
> @@ -2615,18 +2667,11 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
>
> /* Set up domains for CPUs specified by the cpu_map: */
> for_each_cpu(i, cpu_map) {
> - struct sched_domain_topology_level *tl, *tl_llc = NULL;
> + struct sched_domain_topology_level *tl;
> int lid;
>
> sd = NULL;
> for_each_sd_topology(tl) {
> - int flags = 0;
> -
> - if (tl->sd_flags)
> - flags = (*tl->sd_flags)();
> -
> - if (flags & SD_SHARE_LLC)
> - tl_llc = tl;
>
> sd = build_sched_domain(tl, cpu_map, attr, sd, i);
>
> @@ -2642,18 +2687,19 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
> if (lid == -1) {
> int j;
>
> + j = cpumask_first(cpu_coregroup_mask(i));
> /*
> * Assign the llc_id to the CPUs that do not
> * have an LLC.
> */
> - if (!tl_llc) {
> - per_cpu(sd_llc_id, i) = tl_max_llcs++;
> + if (j >= nr_cpu_ids) {
> + per_cpu(sd_llc_id, i) = __sched_domains_alloc_llc_id();
>
> continue;
> }
I don't think we need to special case this out since:
for_each_cpu(j, cpu_coregroup_mask(i)) {
...
}
would bail out if no CPU is set (also CPU "i" would definitely be
set on it since it must be online) and the "if" after the loop will
see "lid" as "-1" and DTRT.
>
> /* try to reuse the llc_id of its siblings */
> - for_each_cpu(j, tl_llc->mask(tl_llc, i)) {
> + for (; j < nr_cpu_ids; j = cpumask_next(j, cpu_coregroup_mask(i))) {
> if (i == j)
> continue;
>
> @@ -2668,7 +2714,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
>
> /* a new LLC is detected */
> if (lid == -1)
> - per_cpu(sd_llc_id, i) = tl_max_llcs++;
> + per_cpu(sd_llc_id, i) = __sched_domains_alloc_llc_id();
> }
> }
>
> @@ -2869,6 +2915,7 @@ int __init sched_init_domains(const struct cpumask *cpu_map)
> {
> int err;
>
> + zalloc_cpumask_var(&sched_domains_llc_id_allocmask, GFP_KERNEL);
> zalloc_cpumask_var(&sched_domains_tmpmask, GFP_KERNEL);
> zalloc_cpumask_var(&sched_domains_tmpmask2, GFP_KERNEL);
> zalloc_cpumask_var(&fallback_doms, GFP_KERNEL);
--
Thanks and Regards,
Prateek
On Thu, 2026-02-19 at 11:42 +0530, K Prateek Nayak wrote:
> Hello Tim,
>
> Thank you for the patch.
>
> On 2/19/2026 4:51 AM, Tim Chen wrote:
> > diff --git a/init/Kconfig b/init/Kconfig
> > index 9848de949afa..4ddf54ab9cf7 100644
> > --- a/init/Kconfig
> > +++ b/init/Kconfig
> > @@ -987,6 +987,7 @@ config SCHED_CACHE
> > bool "Cache aware load balance"
> > default y
> > depends on SMP
> > + depends on SCHED_MC
> > help
> > When enabled, the scheduler will attempt to aggregate tasks from
> > the same process onto a single Last Level Cache (LLC) domain when
> > diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> > index 48626c81ba8e..75ba4e0bfcd3 100644
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -8474,6 +8474,8 @@ int sched_cpu_deactivate(unsigned int cpu)
> > */
> > synchronize_rcu();
> >
> > + sched_domains_free_llc_id(cpu);
> > +
> > sched_set_rq_offline(rq, cpu);
> >
> > scx_rq_deactivate(rq);
> > diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> > index 6cbc56e9adfc..04f42526e6f0 100644
> > --- a/kernel/sched/sched.h
> > +++ b/kernel/sched/sched.h
> > @@ -3862,6 +3862,7 @@ static inline bool sched_cache_enabled(void)
> > extern void sched_cache_active_set_unlocked(void);
> > #endif
> > extern void init_sched_mm(struct task_struct *p);
> > +void sched_domains_free_llc_id(int cpu);
> >
> > extern u64 avg_vruntime(struct cfs_rq *cfs_rq);
> > extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se);
> > diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> > index 580fb2fbc900..5e59340ad9a9 100644
> > --- a/kernel/sched/topology.c
> > +++ b/kernel/sched/topology.c
> > @@ -18,6 +18,7 @@ void sched_domains_mutex_unlock(void)
> > }
> >
> > /* Protected by sched_domains_mutex: */
> > +static cpumask_var_t sched_domains_llc_id_allocmask;
> > static cpumask_var_t sched_domains_tmpmask;
> > static cpumask_var_t sched_domains_tmpmask2;
> > static int tl_max_llcs;
> > @@ -2590,6 +2591,57 @@ static bool topology_span_sane(const struct cpumask *cpu_map)
> > return true;
> > }
> >
> > +static int __sched_domains_alloc_llc_id(void)
> > +{
> > + int lid;
> > +
> > + lockdep_assert_held(&sched_domains_mutex);
> > +
> > + lid = cpumask_first_zero(sched_domains_llc_id_allocmask);
> > + if (lid >= tl_max_llcs)
> > + tl_max_llcs = lid + 1;
> > +
> > + /*
> > + * llc_id space should never grow larger than the
> > + * possible number of CPUs in the system.
> > + */
> > + if (!unlikely(WARN_ON_ONCE(lid >= nr_cpumask_bits)))
> > + cpumask_set_cpu(lid, sched_domains_llc_id_allocmask);
> > + return lid;
> > +}
> > +
> > +static void __sched_domains_free_llc_id(int cpu)
> > +{
> > + int i, lid;
> > +
> > + lockdep_assert_held(&sched_domains_mutex);
> > +
> > + lid = per_cpu(sd_llc_id, cpu);
> > + if (lid == -1)
> > + return;
> > +
> > + per_cpu(sd_llc_id, cpu) = -1;
> > +
> > + for_each_online_cpu(i) {
> > + /* An online CPU owns the llc_id. */
> > + if (per_cpu(sd_llc_id, i) == lid)
> > + return;
> > + }
>
> We should perhaps warn and skip clearing lid from cpumask if lid was
> found to be larger than "nr_cpumask_bits". Shouldn't happen but just
> as a precaution.
Will do
>
> > +
> > + cpumask_clear_cpu(lid, sched_domains_llc_id_allocmask);
> > +
> > + /* shrink max LLC size to save memory */
> > + if (lid == tl_max_llcs - 1)
> > + lid = tl_max_llcs--;
>
> No need to assign the local "lid" variable here; Simple decrement
> should do.
Good point
>
> > +}
> > +
> > +void sched_domains_free_llc_id(int cpu)
> > +{
> > + sched_domains_mutex_lock();
> > + __sched_domains_free_llc_id(cpu);
> > + sched_domains_mutex_unlock();
> > +}
> > +
> > /*
> > * Build sched domains for a given set of CPUs and attach the sched domains
> > * to the individual CPUs
> > @@ -2615,18 +2667,11 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
> >
> > /* Set up domains for CPUs specified by the cpu_map: */
> > for_each_cpu(i, cpu_map) {
> > - struct sched_domain_topology_level *tl, *tl_llc = NULL;
> > + struct sched_domain_topology_level *tl;
> > int lid;
> >
> > sd = NULL;
> > for_each_sd_topology(tl) {
> > - int flags = 0;
> > -
> > - if (tl->sd_flags)
> > - flags = (*tl->sd_flags)();
> > -
> > - if (flags & SD_SHARE_LLC)
> > - tl_llc = tl;
> >
> > sd = build_sched_domain(tl, cpu_map, attr, sd, i);
> >
> > @@ -2642,18 +2687,19 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
> > if (lid == -1) {
> > int j;
> >
> > + j = cpumask_first(cpu_coregroup_mask(i));
> > /*
> > * Assign the llc_id to the CPUs that do not
> > * have an LLC.
> > */
> > - if (!tl_llc) {
> > - per_cpu(sd_llc_id, i) = tl_max_llcs++;
> > + if (j >= nr_cpu_ids) {
> > + per_cpu(sd_llc_id, i) = __sched_domains_alloc_llc_id();
> >
> > continue;
> > }
>
> I don't think we need to special case this out since:
>
> for_each_cpu(j, cpu_coregroup_mask(i)) {
> ...
> }
>
> would bail out if no CPU is set (also CPU "i" would definitely be
> set on it since it must be online) and the "if" after the loop will
> see "lid" as "-1" and DTRT.
That's right. Will take out the non-needed code.
Also found out that cpu_coregroup_mask() is not defined for config
without CONFIG_SMP. So will put the llc id assignment code under
CONFIG_SMP.
Thanks for the code reviews and suggestions.
Tim
>
> >
> > /* try to reuse the llc_id of its siblings */
> > - for_each_cpu(j, tl_llc->mask(tl_llc, i)) {
> > + for (; j < nr_cpu_ids; j = cpumask_next(j, cpu_coregroup_mask(i))) {
> > if (i == j)
> > continue;
> >
> > @@ -2668,7 +2714,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
> >
> > /* a new LLC is detected */
> > if (lid == -1)
> > - per_cpu(sd_llc_id, i) = tl_max_llcs++;
> > + per_cpu(sd_llc_id, i) = __sched_domains_alloc_llc_id();
> > }
> > }
> >
> > @@ -2869,6 +2915,7 @@ int __init sched_init_domains(const struct cpumask *cpu_map)
> > {
> > int err;
> >
> > + zalloc_cpumask_var(&sched_domains_llc_id_allocmask, GFP_KERNEL);
> > zalloc_cpumask_var(&sched_domains_tmpmask, GFP_KERNEL);
> > zalloc_cpumask_var(&sched_domains_tmpmask2, GFP_KERNEL);
> > zalloc_cpumask_var(&fallback_doms, GFP_KERNEL);
On Thu, Feb 19, 2026 at 11:42:58AM +0530, K Prateek Nayak wrote:
> I don't think we need to special case this out since:
>
> for_each_cpu(j, cpu_coregroup_mask(i)) {
> ...
> }
>
> would bail out if no CPU is set (also CPU "i" would definitely be
> set on it since it must be online) and the "if" after the loop will
> see "lid" as "-1" and DTRT.
So tying lid to coregroup_mask, rather than sched_domains might make
sense. It avoids that partitions nonsense.
On 2/17/2026 4:09 PM, K Prateek Nayak wrote:
> Hello Chenyu,
>
> On 2/17/2026 11:37 AM, Chen, Yu C wrote:
>> Hi Prateek,
>>
>> On 2/16/2026 3:44 PM, K Prateek Nayak wrote:
>>> Hello Tim, Chenyu,
>>>
>>> On 2/11/2026 3:48 AM, Tim Chen wrote:
>>>> From: Chen Yu <yu.c.chen@intel.com>
[...]
>>>> @@ -2567,10 +2567,18 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
>>>> /* Set up domains for CPUs specified by the cpu_map: */
>>>> for_each_cpu(i, cpu_map) {
>>>> - struct sched_domain_topology_level *tl;
>>>> + struct sched_domain_topology_level *tl, *tl_llc = NULL;
>>>> + int lid;
>>>> sd = NULL;
>>>> for_each_sd_topology(tl) {
>>>> + int flags = 0;
>>>> +
>>>> + if (tl->sd_flags)
>>>> + flags = (*tl->sd_flags)();
>>>> +
>>>> + if (flags & SD_SHARE_LLC)
>>>> + tl_llc = tl;
>>>
>>> nit. This loop breaks out when sched_domain_span(sd) covers the entire
>>> cpu_map and it might have not reached the topmost SD_SHARE_LLC domain
>>> yet. Is that cause for any concern?
>>>
>>
>> Could you please elaborate a little more on this? If it covers the
>> entire cpu_map shouldn't it stop going up to its parent domain?
>> Do you mean, sd_llc_1 and its parent sd_llc_2 could cover the same cpu_map,
>> and we should let tl_llc to assigned to sd_llc_2 (sd_llc_1 be degenerated? )
>
> I'm not sure if this is technically possible but assume following
> topology:
>
> [ LLC: 8-15 ]
> [ SMT: 8,9 ][ SMT: 10,11 ] ... [ SMT: 14,15 ]
>
> and the following series of events:
>
> o All CPUs in LLC are offline to begin with (maxcpus = 1 like scenario).
>
> o CPUs 10-15 are onlined first.
>
> o CPU8 is put in a separate root partition and brought online.
> (XXX: I'm not 100% sure if this is possible in this order)
>
It can happen, I had a try on VM, and there would be different
llc_id within 1 LLC even after CPU8 has been taken out of the separate
partition.
> o build_sched_domains() will bail out at SMT domain since the cpumap
> is covered by tl->mask() and tl_llc = tl_smt.
>
> o llc_id calculation uses the tl_smt->mask() which will not contain
> CPUs 10-15 and CPU8 will get a unique LLC id even though there are
> other online CPUs in the LLC with a different llc_id (!!!)
>
>
Fair enough, thanks for the explanation in detail.
thanks,
Chenyu
On 11/02/26 03:48, Tim Chen wrote:
> From: Chen Yu <yu.c.chen@intel.com>
>
> Introduce an index mapping between CPUs and their LLCs. This provides
> a continuous per LLC index needed for cache-aware load balancing in
> later patches.
>
> The existing per_cpu llc_id usually points to the first CPU of the
> LLC domain, which is sparse and unsuitable as an array index. Using
> llc_id directly would waste memory.
>
> With the new mapping, CPUs in the same LLC share a continuous id:
>
> per_cpu(llc_id, CPU=0...15) = 0
> per_cpu(llc_id, CPU=16...31) = 1
> per_cpu(llc_id, CPU=32...47) = 2
> ...
>
> Once a CPU has been assigned an llc_id, this ID persists even when
> the CPU is taken offline and brought back online, which can facilitate
> the management of the ID.
tl_max_llcs is never reset across multiple invocations of build_sched_domains().
While this preserves LLC IDs across normal CPU hotplug events, I'm wondering about
scenarios where hardware topology changes, such as physically removing/replacing
CPU sockets.
Example scenario:
Boot with 3 LLCs: IDs {0,1,2}, tl_max_llcs=3
Physical hardware change removes LLC 1
New hardware added at a different position gets ID=3
After multiple such events: System has 4 LLCs but IDs {0,2,5,7}, tl_max_llcs=8
This creates gaps in the ID space. However, I understand this trade-off might be
intentional since physical topology changes are rare, and resetting tl_max_llcs and
all sd_llc_id values would rebuild IDs on every invocation of build_sched_domains().
Would like to know your thoughts on overhead of resetting tl_max_llcs and sd_llc_id
so that IDs are rebuilt on each invocation of build_sched_domains() to always maintain
a dense mapping.
Thanks,
Vineeth
>
> Co-developed-by: Tim Chen <tim.c.chen@linux.intel.com>
> Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
> Co-developed-by: K Prateek Nayak <kprateek.nayak@amd.com>
> Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com>
> Signed-off-by: Chen Yu <yu.c.chen@intel.com>
> ---
>
> Notes:
> v2->v3:
> Allocate the LLC id according to the topology level data directly, rather
> than calculating from the sched domain. This simplifies the code.
> (Peter Zijlstra, K Prateek Nayak)
>
> kernel/sched/topology.c | 47 ++++++++++++++++++++++++++++++++++++++---
> 1 file changed, 44 insertions(+), 3 deletions(-)
>
> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> index cf643a5ddedd..ca46b5cf7f78 100644
> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c
> @@ -20,6 +20,7 @@ void sched_domains_mutex_unlock(void)
> /* Protected by sched_domains_mutex: */
> static cpumask_var_t sched_domains_tmpmask;
> static cpumask_var_t sched_domains_tmpmask2;
> +static int tl_max_llcs;
>
> static int __init sched_debug_setup(char *str)
> {
> @@ -658,7 +659,7 @@ static void destroy_sched_domains(struct sched_domain *sd)
> */
> DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
> DEFINE_PER_CPU(int, sd_llc_size);
> -DEFINE_PER_CPU(int, sd_llc_id);
> +DEFINE_PER_CPU(int, sd_llc_id) = -1;
> DEFINE_PER_CPU(int, sd_share_id);
> DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
> DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
> @@ -684,7 +685,6 @@ static void update_top_cache_domain(int cpu)
>
> rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
> per_cpu(sd_llc_size, cpu) = size;
> - per_cpu(sd_llc_id, cpu) = id;
> rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
>
> sd = lowest_flag_domain(cpu, SD_CLUSTER);
> @@ -2567,10 +2567,18 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
>
> /* Set up domains for CPUs specified by the cpu_map: */
> for_each_cpu(i, cpu_map) {
> - struct sched_domain_topology_level *tl;
> + struct sched_domain_topology_level *tl, *tl_llc = NULL;
> + int lid;
>
> sd = NULL;
> for_each_sd_topology(tl) {
> + int flags = 0;
> +
> + if (tl->sd_flags)
> + flags = (*tl->sd_flags)();
> +
> + if (flags & SD_SHARE_LLC)
> + tl_llc = tl;
>
> sd = build_sched_domain(tl, cpu_map, attr, sd, i);
>
> @@ -2581,6 +2589,39 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
> if (cpumask_equal(cpu_map, sched_domain_span(sd)))
> break;
> }
> +
> + lid = per_cpu(sd_llc_id, i);
> + if (lid == -1) {
> + int j;
> +
> + /*
> + * Assign the llc_id to the CPUs that do not
> + * have an LLC.
> + */
> + if (!tl_llc) {
> + per_cpu(sd_llc_id, i) = tl_max_llcs++;
> +
> + continue;
> + }
> +
> + /* try to reuse the llc_id of its siblings */
> + for_each_cpu(j, tl_llc->mask(tl_llc, i)) {
> + if (i == j)
> + continue;
> +
> + lid = per_cpu(sd_llc_id, j);
> +
> + if (lid != -1) {
> + per_cpu(sd_llc_id, i) = lid;
> +
> + break;
> + }
> + }
> +
> + /* a new LLC is detected */
> + if (lid == -1)
> + per_cpu(sd_llc_id, i) = tl_max_llcs++;
> + }
> }
>
> if (WARN_ON(!topology_span_sane(cpu_map)))
On 2/15/2026 1:53 AM, Madadi Vineeth Reddy wrote:
> On 11/02/26 03:48, Tim Chen wrote:
>> From: Chen Yu <yu.c.chen@intel.com>
>>
>> Introduce an index mapping between CPUs and their LLCs. This provides
>> a continuous per LLC index needed for cache-aware load balancing in
>> later patches.
>>
>> The existing per_cpu llc_id usually points to the first CPU of the
>> LLC domain, which is sparse and unsuitable as an array index. Using
>> llc_id directly would waste memory.
>>
>> With the new mapping, CPUs in the same LLC share a continuous id:
>>
>> per_cpu(llc_id, CPU=0...15) = 0
>> per_cpu(llc_id, CPU=16...31) = 1
>> per_cpu(llc_id, CPU=32...47) = 2
>> ...
>>
>> Once a CPU has been assigned an llc_id, this ID persists even when
>> the CPU is taken offline and brought back online, which can facilitate
>> the management of the ID.
>
> tl_max_llcs is never reset across multiple invocations of build_sched_domains().
> While this preserves LLC IDs across normal CPU hotplug events, I'm wondering about
> scenarios where hardware topology changes, such as physically removing/replacing
> CPU sockets.
>
> Example scenario:
> Boot with 3 LLCs: IDs {0,1,2}, tl_max_llcs=3
> Physical hardware change removes LLC 1
> New hardware added at a different position gets ID=3
> After multiple such events: System has 4 LLCs but IDs {0,2,5,7}, tl_max_llcs=8
>
I agree that keeping tl_max_llcs non-decreasing might waste some space. The
original motivation for introducing a dynamic sd_llc_id was mainly that a
static sd_llc_id[NR_LLC] is not suitable, as we cannot find a proper upper
limit for NR_LLC — an arbitrary value for NR_LLC is unacceptable. That is to
say, tl_max_llcs serves as the historical maximum LLC index that has ever
been detected — like other terms such as CPU id. It is possible that the
number of available LLCs shrinks due to CPU offline after boot-up. A value
of tl_max_llcs=8 indicates that this system once had 8 valid LLCs. On the
other hand, a dense mapping is a side effect of dynamically allocating
sd_llc_id.
> This creates gaps in the ID space. However, I understand this trade-off might be
> intentional since physical topology changes are rare, and resetting tl_max_llcs and
> all sd_llc_id values would rebuild IDs on every invocation of build_sched_domains().
>
> Would like to know your thoughts on overhead of resetting tl_max_llcs and sd_llc_id
> so that IDs are rebuilt on each invocation of build_sched_domains() to always maintain
> a dense mapping.
>
The current implementation is intentionally kept simple for easier review,
and I agree that strictly enforcing a dense mapping for sd_llc_id — by
recalculating the actual maximum LLC count (max_llcs) whenever the CPU
topology changes — could be an optimization direction once the basic version
has been accepted. I assume what you are suggesting is that we could reset
tl_max_llcs/max_llcs/sd_llc_id for CPUs in doms_new[i] within
partition_sched_domains_locked(), and then rebuild these values in
build_sched_domains() accordingly. One risk here is a race condition when
modifying the llc_id of a specific CPU — but off the top of my head,
valid_llc_buf() should help prevent out-of-range access to sd->pf caused by
such races. Thoughts?
thanks,
Chenyu
On 15/02/26 19:55, Chen, Yu C wrote:
> On 2/15/2026 1:53 AM, Madadi Vineeth Reddy wrote:
>> On 11/02/26 03:48, Tim Chen wrote:
>>> From: Chen Yu <yu.c.chen@intel.com>
>>>
>>> Introduce an index mapping between CPUs and their LLCs. This provides
>>> a continuous per LLC index needed for cache-aware load balancing in
>>> later patches.
>>>
>>> The existing per_cpu llc_id usually points to the first CPU of the
>>> LLC domain, which is sparse and unsuitable as an array index. Using
>>> llc_id directly would waste memory.
>>>
>>> With the new mapping, CPUs in the same LLC share a continuous id:
>>>
>>> per_cpu(llc_id, CPU=0...15) = 0
>>> per_cpu(llc_id, CPU=16...31) = 1
>>> per_cpu(llc_id, CPU=32...47) = 2
>>> ...
>>>
>>> Once a CPU has been assigned an llc_id, this ID persists even when
>>> the CPU is taken offline and brought back online, which can facilitate
>>> the management of the ID.
>>
>> tl_max_llcs is never reset across multiple invocations of build_sched_domains().
>> While this preserves LLC IDs across normal CPU hotplug events, I'm wondering about
>> scenarios where hardware topology changes, such as physically removing/replacing
>> CPU sockets.
>>
>> Example scenario:
>> Boot with 3 LLCs: IDs {0,1,2}, tl_max_llcs=3
>> Physical hardware change removes LLC 1
>> New hardware added at a different position gets ID=3
>> After multiple such events: System has 4 LLCs but IDs {0,2,5,7}, tl_max_llcs=8
>>
>
> I agree that keeping tl_max_llcs non-decreasing might waste some space. The
> original motivation for introducing a dynamic sd_llc_id was mainly that a
> static sd_llc_id[NR_LLC] is not suitable, as we cannot find a proper upper
> limit for NR_LLC-an arbitrary value for NR_LLC is unacceptable. That is to
> say, tl_max_llcs serves as the historical maximum LLC index that has ever
> been detected - like other terms such as CPU id. It is possible that the
> number of available LLCs shrinks due to CPU offline after boot-up. A value
> of tl_max_llcs=8 indicates that this system once had 8 valid LLCs. On the
> other hand, dense mapping is a side effect of dynamically allocating sd_llc_id.
>
>> This creates gaps in the ID space. However, I understand this trade-off might be
>> intentional since physical topology changes are rare, and resetting tl_max_llcs and
>> all sd_llc_id values would rebuild IDs on every invocation of build_sched_domains().
>>
>> Would like to know your thoughts on overhead of resetting tl_max_llcs and sd_llc_id
>> so that IDs are rebuilt on each invocation of build_sched_domains() to always maintain
>> a dense mapping.
>>
>
> The current implementation is intentionally kept simple for easier review, and
> I agree that strictly enforcing a dense mapping for sd_llc_id - by recalculating
> the actual maximum LLC count (max_llcs) whenever the CPU topology changes - could
> be an optimization direction once the basic version has been accepted. I assume what
> you are suggesting is that we could reset tl_max_llcs/max_llcs/sd_llc_id for CPUs
> in doms_new[i] within partition_sched_domains_locked() - and then rebuild these
> values in build_sched_domains() accordingly. One risk here is a race condition when
> modifying the llc_id of a specific CPU - but off the top of my head, valid_llc_buf()
> should help prevent out-of-range access to sd->pf caused by such races.
> Thoughts?
Yes, resetting and rebuilding would maintain dense mapping. Given the added complexity
of race conditions vs. minimal benefit (gaps only occur with physical topology changes),
I think the current approach is better. We can revisit it once this version goes through.
Thanks,
Vineeth
>
> thanks,
> Chenyu
>
On Tue, 2026-02-17 at 15:35 +0530, Madadi Vineeth Reddy wrote:
> On 15/02/26 19:55, Chen, Yu C wrote:
> > On 2/15/2026 1:53 AM, Madadi Vineeth Reddy wrote:
> > > On 11/02/26 03:48, Tim Chen wrote:
> > > > From: Chen Yu <yu.c.chen@intel.com>
> > > >
> > > > Introduce an index mapping between CPUs and their LLCs. This provides
> > > > a continuous per LLC index needed for cache-aware load balancing in
> > > > later patches.
> > > >
> > > > The existing per_cpu llc_id usually points to the first CPU of the
> > > > LLC domain, which is sparse and unsuitable as an array index. Using
> > > > llc_id directly would waste memory.
> > > >
> > > > With the new mapping, CPUs in the same LLC share a continuous id:
> > > >
> > > > per_cpu(llc_id, CPU=0...15) = 0
> > > > per_cpu(llc_id, CPU=16...31) = 1
> > > > per_cpu(llc_id, CPU=32...47) = 2
> > > > ...
> > > >
> > > > Once a CPU has been assigned an llc_id, this ID persists even when
> > > > the CPU is taken offline and brought back online, which can facilitate
> > > > the management of the ID.
> > >
> > > tl_max_llcs is never reset across multiple invocations of build_sched_domains().
> > > While this preserves LLC IDs across normal CPU hotplug events, I'm wondering about
> > > scenarios where hardware topology changes, such as physically removing/replacing
> > > CPU sockets.
> > >
> > > Example scenario:
> > > Boot with 3 LLCs: IDs {0,1,2}, tl_max_llcs=3
> > > Physical hardware change removes LLC 1
> > > New hardware added at a different position gets ID=3
> > > After multiple such events: System has 4 LLCs but IDs {0,2,5,7}, tl_max_llcs=8
> > >
> >
> > I agree that keeping tl_max_llcs non-decreasing might waste some space. The
> > original motivation for introducing a dynamic sd_llc_id was mainly that a
> > static sd_llc_id[NR_LLC] is not suitable, as we cannot find a proper upper
> > limit for NR_LLC-an arbitrary value for NR_LLC is unacceptable. That is to
> > say, tl_max_llcs serves as the historical maximum LLC index that has ever
> > been detected - like other terms such as CPU id. It is possible that the
> > number of available LLCs shrinks due to CPU offline after boot-up. A value
> > of tl_max_llcs=8 indicates that this system once had 8 valid LLCs. On the
> > other hand, dense mapping is a side effect of dynamically allocating sd_llc_id.
> >
> > > This creates gaps in the ID space. However, I understand this trade-off might be
> > > intentional since physical topology changes are rare, and resetting tl_max_llcs and
> > > all sd_llc_id values would rebuild IDs on every invocation of build_sched_domains().
> > >
> > > Would like to know your thoughts on overhead of resetting tl_max_llcs and sd_llc_id
> > > so that IDs are rebuilt on each invocation of build_sched_domains() to always maintain
> > > a dense mapping.
> > >
> >
> > The current implementation is intentionally kept simple for easier review, and
> > I agree that strictly enforcing a dense mapping for sd_llc_id - by recalculating
> > the actual maximum LLC count (max_llcs) whenever the CPU topology changes - could
> > be an optimization direction once the basic version has been accepted. I assume what
> > you are suggesting is that we could reset tl_max_llcs/max_llcs/sd_llc_id for CPUs
> > in doms_new[i] within partition_sched_domains_locked() - and then rebuild these
> > values in build_sched_domains() accordingly. One risk here is a race condition when
> > modifying the llc_id of a specific CPU - but off the top of my head, valid_llc_buf()
> > should help prevent out-of-range access to sd->pf caused by such races.
> > Thoughts?
>
> Yes, resetting and rebuilding would maintain dense mapping. Given the added complexity
> of race conditions vs. minimal benefit (gaps only occur with physical topology changes),
> I think the current approach is better. We can revisit it once this version goes through.
>
The current implementation keeps the LLC id unchanged across sched domain
rebuilds. The idea was to allow pf[id] to be kept across rebuilds and to keep
pointing to the same LLC.
That said, now that we clear pf[id] across sched domain rebuilds, this
constraint can be relaxed, and it should be okay to change the LLC id from
the perspective of cache-aware scheduling.
However, there could be some transient races with cpus_share_cache() while
the LLC id gets changed, which the current implementation avoids.
Tim
© 2016 - 2026 Red Hat, Inc.