Introduce an index mapping between CPUs and their LLCs. This provides
a continuous per-LLC index needed for cache-aware load balancing in
later patches.

The existing per-CPU sd_llc_id usually holds the id of the first CPU
in the LLC domain, so the values are sparse and unsuitable as an array
index. Using sd_llc_id directly to size per-LLC arrays would waste
memory.
With the new mapping, CPUs in the same LLC share a continuous id:
per_cpu(sd_llc_id, CPU=0...15) = 0
per_cpu(sd_llc_id, CPU=16...31) = 1
per_cpu(sd_llc_id, CPU=32...47) = 2
...
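
As a minimal illustration of why a dense id matters (hypothetical
names, not part of this patch), per-LLC statistics in the later
patches can then be sized by the number of LLCs rather than by CPU id:

        /* Illustration only: MAX_LLCS and nr_pref_stat are hypothetical. */
        static unsigned int nr_pref_stat[MAX_LLCS];     /* one slot per LLC */

        static void account_llc_pref(int cpu)
        {
                int llc = per_cpu(sd_llc_id, cpu);      /* dense index after this patch */

                if (llc >= 0)
                        nr_pref_stat[llc]++;
        }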
Co-developed-by: Chen Yu <yu.c.chen@intel.com>
Signed-off-by: Chen Yu <yu.c.chen@intel.com>
Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
---
Notes:
v1->v2:
Convert the static LLC id to be allocated sequentially as LLCs are
discovered, and replace the old sd_llc_id. (Peter Zijlstra)
kernel/sched/fair.c | 9 ++++++-
kernel/sched/sched.h | 1 +
kernel/sched/topology.c | 60 +++++++++++++++++++++++++++++++++++++++--
3 files changed, 67 insertions(+), 3 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 710ed9943d27..0a3918269906 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1210,10 +1210,17 @@ __read_mostly unsigned int llc_imb_pct = 20;
static int llc_id(int cpu)
{
+ int llc;
+
if (cpu < 0)
return -1;
- return per_cpu(sd_llc_id, cpu);
+ llc = per_cpu(sd_llc_id, cpu);
+ /* avoid race with cpu hotplug */
+ if (unlikely(llc >= max_llcs))
+ return -1;
+
+ return llc;
}
void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *_pcpu_sched)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index bf72c5bab506..728737641847 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2075,6 +2075,7 @@ DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
extern struct static_key_false sched_asym_cpucapacity;
extern struct static_key_false sched_cluster_active;
+extern int max_llcs;
static __always_inline bool sched_asym_cpucap_active(void)
{
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 444bdfdab731..f25d950ab015 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -17,6 +17,8 @@ void sched_domains_mutex_unlock(void)
mutex_unlock(&sched_domains_mutex);
}
+int max_llcs;
+
/* Protected by sched_domains_mutex: */
static cpumask_var_t sched_domains_tmpmask;
static cpumask_var_t sched_domains_tmpmask2;
@@ -668,6 +670,55 @@ DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
DEFINE_STATIC_KEY_FALSE(sched_cluster_active);
+/*
+ * Assign continuous llc id for the CPU, and return
+ * the assigned llc id.
+ */
+static int update_llc_id(struct sched_domain *sd,
+ int cpu)
+{
+ int id = per_cpu(sd_llc_id, cpu), i;
+
+ if (id >= 0)
+ return id;
+
+ if (sd) {
+ /* Look for any assigned id and reuse it.*/
+ for_each_cpu(i, sched_domain_span(sd)) {
+ id = per_cpu(sd_llc_id, i);
+
+ if (id >= 0) {
+ per_cpu(sd_llc_id, cpu) = id;
+ return id;
+ }
+ }
+ }
+
+ /*
+ * When 1. there is no id assigned to this LLC domain,
+ * or 2. the sd is NULL, we reach here.
+ * Consider the following scenario,
+ * CPU0~CPU95 are in the node0, CPU96~CPU191 are
+ * in the node1. During bootup, maxcpus=96 is
+ * appended.
+ * case 1: When running cpu_attach_domain(CPU24)
+ * during boot up, CPU24 is the first CPU in its
+ * non-NULL LLC domain. However,
+ * its corresponding llc id has not been assigned yet.
+ *
+ * case 2: After boot up, the CPU100 is brought up
+ * via sysfs manually. As a result, CPU100 has only a
+ * Numa domain attached, because CPU100 is the only CPU
+ * of a sched domain, all its bottom domains are degenerated.
+ * The LLC domain pointer sd is NULL for CPU100.
+ *
+ * For both cases, we want to increase the number of LLCs.
+ */
+ per_cpu(sd_llc_id, cpu) = max_llcs++;
+
+ return per_cpu(sd_llc_id, cpu);
+}
+
static void update_top_cache_domain(int cpu)
{
struct sched_domain_shared *sds = NULL;
@@ -677,14 +728,13 @@ static void update_top_cache_domain(int cpu)
sd = highest_flag_domain(cpu, SD_SHARE_LLC);
if (sd) {
- id = cpumask_first(sched_domain_span(sd));
size = cpumask_weight(sched_domain_span(sd));
sds = sd->shared;
}
rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
per_cpu(sd_llc_size, cpu) = size;
- per_cpu(sd_llc_id, cpu) = id;
+ id = update_llc_id(sd, cpu);
rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
sd = lowest_flag_domain(cpu, SD_CLUSTER);
@@ -2488,6 +2538,12 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
bool has_asym = false;
bool has_cluster = false;
+ /* first scan of LLCs */
+ if (!max_llcs) {
+ for_each_possible_cpu(i)
+ per_cpu(sd_llc_id, i) = -1;
+ }
+
if (WARN_ON(cpumask_empty(cpu_map)))
goto error;
--
2.32.0
Hello Tim, Chenyu,
On 12/4/2025 4:37 AM, Tim Chen wrote:
> +/*
> + * Assign continuous llc id for the CPU, and return
> + * the assigned llc id.
> + */
> +static int update_llc_id(struct sched_domain *sd,
> + int cpu)
> +{
> + int id = per_cpu(sd_llc_id, cpu), i;
> +
> + if (id >= 0)
> + return id;
> +
> + if (sd) {
> + /* Look for any assigned id and reuse it.*/
> + for_each_cpu(i, sched_domain_span(sd)) {
> + id = per_cpu(sd_llc_id, i);
> +
> + if (id >= 0) {
> + per_cpu(sd_llc_id, cpu) = id;
> + return id;
> + }
> + }
> + }
I don't really like tying this down to the sched_domain span since
partition and other weirdness can cause the max_llc count to go
unnecessarily high. The tl->mask() (from sched_domain_topology_level)
should give the mask considering all online CPUs and not bothering
about cpusets.
How about something like:
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5b17d8e3cb55..c19b1c4e6472 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8270,6 +8270,18 @@ static void cpuset_cpu_active(void)
static void cpuset_cpu_inactive(unsigned int cpu)
{
if (!cpuhp_tasks_frozen) {
+ /*
+ * This is necessary since offline CPUs are
+ * taken out of the tl->mask() and a newly
+ * onlined CPU in same LLC will not realize
+ * whether it should reuse the LLC ID owned
+ * by an offline CPU without knowing the
+ * LLC association.
+ *
+ * Safe to release the reference if this is
+ * the last CPU in the LLC going offline.
+ */
+ sched_domain_free_llc_id(cpu);
cpuset_update_active_cpus();
} else {
num_cpus_frozen++;
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 41caa22e0680..1378a1cfad18 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -631,6 +631,7 @@ void update_sched_domain_debugfs(void)
i++;
}
+ debugfs_create_u32("llc_id", 0444, d_cpu, (u32 *)per_cpu_ptr(&sd_llc_id, cpu));
__cpumask_clear_cpu(cpu, sd_sysctl_cpus);
}
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 3ceaa9dc9a9e..69fad88b57d8 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2142,6 +2142,7 @@ extern int group_balance_cpu(struct sched_group *sg);
extern void update_sched_domain_debugfs(void);
extern void dirty_sched_domain_sysctl(int cpu);
+void sched_domain_free_llc_id(int cpu);
extern int sched_update_scaling(void);
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index cf643a5ddedd..d6e134767f30 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -20,6 +20,46 @@ void sched_domains_mutex_unlock(void)
/* Protected by sched_domains_mutex: */
static cpumask_var_t sched_domains_tmpmask;
static cpumask_var_t sched_domains_tmpmask2;
+static cpumask_var_t sched_llc_id_alloc_mask;
+DEFINE_PER_CPU(int, sd_llc_id) = -1;
+static int max_llcs = 0;
+
+static inline int sched_domain_alloc_llc_id(void)
+{
+ int llc_id;
+
+ lockdep_assert_held(&sched_domains_mutex);
+
+ llc_id = cpumask_first_zero(sched_llc_id_alloc_mask);
+ BUG_ON((unsigned int)llc_id >= nr_cpumask_bits);
+ cpumask_set_cpu(llc_id, sched_llc_id_alloc_mask);
+ ++max_llcs;
+
+ return llc_id;
+}
+
+void sched_domain_free_llc_id(int cpu)
+{
+ int i, llc_id = per_cpu(sd_llc_id, cpu);
+ bool found = false;
+
+ lockdep_assert_cpus_held(); /* For cpu_active_mask. */
+ guard(mutex)(&sched_domains_mutex);
+
+ per_cpu(sd_llc_id, cpu) = -1;
+ for_each_cpu(i, cpu_active_mask) {
+ if (per_cpu(sd_llc_id, i) == llc_id) {
+ found = true;
+ break;
+ }
+ }
+
+ /* Allow future hotplugs to claim this ID */
+ if (!found) {
+ cpumask_clear_cpu(llc_id, sched_llc_id_alloc_mask);
+ --max_llcs;
+ }
+}
static int __init sched_debug_setup(char *str)
{
@@ -658,7 +698,6 @@ static void destroy_sched_domains(struct sched_domain *sd)
*/
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
DEFINE_PER_CPU(int, sd_llc_size);
-DEFINE_PER_CPU(int, sd_llc_id);
DEFINE_PER_CPU(int, sd_share_id);
DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
@@ -684,7 +723,6 @@ static void update_top_cache_domain(int cpu)
rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
per_cpu(sd_llc_size, cpu) = size;
- per_cpu(sd_llc_id, cpu) = id;
rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
sd = lowest_flag_domain(cpu, SD_CLUSTER);
@@ -2567,10 +2605,35 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
/* Set up domains for CPUs specified by the cpu_map: */
for_each_cpu(i, cpu_map) {
- struct sched_domain_topology_level *tl;
+ struct sched_domain_topology_level *tl, *tl_llc = NULL;
+ bool done = false;
sd = NULL;
for_each_sd_topology(tl) {
+ int flags = 0;
+
+ if (tl->sd_flags)
+ flags = (*tl->sd_flags)();
+
+ if (flags & SD_SHARE_LLC) {
+ tl_llc = tl;
+
+ /*
+ * Entire cpu_map has been covered. We are
+ * traversing only to find the highest
+ * SD_SHARE_LLC level.
+ */
+ if (done)
+ continue;
+ }
+
+ /*
+ * Since SD_SHARE_LLC is SDF_SHARED_CHILD, we can
+ * safely break out if the entire cpu_map has been
+ * covered by a child domain.
+ */
+ if (done)
+ break;
sd = build_sched_domain(tl, cpu_map, attr, sd, i);
@@ -2579,7 +2642,41 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
if (tl == sched_domain_topology)
*per_cpu_ptr(d.sd, i) = sd;
if (cpumask_equal(cpu_map, sched_domain_span(sd)))
- break;
+ done = true;
+ }
+
+ /* First time visiting this CPU. Assign the llc_id. */
+ if (per_cpu(sd_llc_id, i) == -1) {
+ int j, llc_id = -1;
+
+ /*
+ * In case there are no SD_SHARE_LLC domains,
+ * each CPU gets its own llc_id. Find the first
+ * free bit on the mask and use it.
+ */
+ if (!tl_llc) {
+ per_cpu(sd_llc_id, i) = sched_domain_alloc_llc_id();
+ continue;
+ }
+
+ /*
+ * Visit all the CPUs of the LLC irrespective of the
+ * partition constraints and find if any of them have
+ * a valid llc_id.
+ */
+ for_each_cpu(j, tl_llc->mask(tl, i)) {
+ llc_id = per_cpu(sd_llc_id, j);
+
+ /* Found a valid llc_id for CPU's LLC. */
+ if (llc_id != -1)
+ break;
+ }
+
+ /* Valid llc_id not found. Allocate a new one. */
+ if (llc_id == -1)
+ llc_id = sched_domain_alloc_llc_id();
+
+ per_cpu(sd_llc_id, i) = llc_id;
}
}
@@ -2759,6 +2856,7 @@ int __init sched_init_domains(const struct cpumask *cpu_map)
zalloc_cpumask_var(&sched_domains_tmpmask, GFP_KERNEL);
zalloc_cpumask_var(&sched_domains_tmpmask2, GFP_KERNEL);
+ zalloc_cpumask_var(&sched_llc_id_alloc_mask, GFP_KERNEL);
zalloc_cpumask_var(&fallback_doms, GFP_KERNEL);
arch_update_cpu_topology();
---
AFAICT, "sd_llc_id" isn't compared across different partitions so having
the CPUs that are actually associated with same physical LLC but across
different partitions sharing the same "sd_llc_id" shouldn't be a problem.
Thoughts?
--
Thanks and Regards,
Prateek
Hello Prateek,
On 12/23/2025 1:31 PM, K Prateek Nayak wrote:
> Hello Tim, Chenyu,
>
> On 12/4/2025 4:37 AM, Tim Chen wrote:
>> +/*
>> + * Assign continuous llc id for the CPU, and return
>> + * the assigned llc id.
>> + */
>> +static int update_llc_id(struct sched_domain *sd,
>> + int cpu)
>> +{
>> + int id = per_cpu(sd_llc_id, cpu), i;
>> +
>> + if (id >= 0)
>> + return id;
>> +
>> + if (sd) {
>> + /* Look for any assigned id and reuse it.*/
>> + for_each_cpu(i, sched_domain_span(sd)) {
>> + id = per_cpu(sd_llc_id, i);
>> +
>> + if (id >= 0) {
>> + per_cpu(sd_llc_id, cpu) = id;
>> + return id;
>> + }
>> + }
>> + }
>
> I don't really like tying this down to the sched_domain span since
> partition and other weirdness can cause the max_llc count to go
> unnecessarily high. The tl->mask() (from sched_domain_topology_level)
> should give the mask considering all online CPUs and not bothering
> about cpusets.
OK, using the topology_level's mask (tl's mask) should allow us to
skip the cpuset partition. I just wanted to check if your concern
is about the excessive number of sd_llc_ids caused by the cpuset?
I was under the impression that without this patch, llc_ids are
unique across different partitions.
For example, on a vanilla kernel without cache_aware, suppose
one LLC has CPU0,1,2,3. Before partitioning, all CPUs have the
same llc_id 0. Then create a new partition:
mkdir -p /sys/fs/cgroup/cgroup0
echo "3" > /sys/fs/cgroup/cgroup0/cpuset.cpus
echo root > /sys/fs/cgroup/cgroup0/cpuset.cpus.partition
Now CPU0,1,2 share llc_id 0, and CPU3 has a dedicated llc_id 3.
Do you suggest letting CPU3 reuse llc_id 0, so as to save
llc_id space?
>
> How about something like:
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 5b17d8e3cb55..c19b1c4e6472 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -8270,6 +8270,18 @@ static void cpuset_cpu_active(void)
> static void cpuset_cpu_inactive(unsigned int cpu)
> {
> if (!cpuhp_tasks_frozen) {
> + /*
> + * This is necessary since offline CPUs are
> + * taken out of the tl->mask() and a newly
> + * onlined CPU in same LLC will not realize
> + * whether it should reuse the LLC ID owned
> + * by an offline CPU without knowing the
> + * LLC association.
> + *
> + * Safe to release the reference if this is
> + * the last CPU in the LLC going offline.
> + */
> + sched_domain_free_llc_id(cpu);
I'm OK with replacing the domain-based cpumask with the topology_level
mask; I'm just wondering whether re-using the llc_id would increase
the risk of a race condition - it is possible for a CPU to have
different llc_ids before and after an offline/online cycle. Can we
assign/reserve a "static" llc_id for each CPU, whether it is online
or offline? In this way, we don't need to worry about data
synchronization when using llc_id(). For example, I can think of
adjusting the data in the percpu nr_pref_llc[max_llcs] on every CPU
whenever a CPU goes offline/online, as sketched below.
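Something like the following rough sketch (hypothetical hotplug
callback; rq->nr_pref_llc[] is the per-rq counter introduced later
in this series, untested):

        static int sched_cache_cpu_offline(unsigned int cpu)
        {
                struct rq *rq = cpu_rq(cpu);
                int i;

                /*
                 * per_cpu(sd_llc_id, cpu) is left untouched, so llc_id()
                 * readers never observe an id change.  Only the preference
                 * counters of the departing runqueue are reset here; its
                 * tasks have already been migrated away.
                 */
                for (i = 0; i < max_llcs; i++)
                        rq->nr_pref_llc[i] = 0;

                return 0;
        }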
> cpuset_update_active_cpus();
> } else {
> num_cpus_frozen++;
> diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
> index 41caa22e0680..1378a1cfad18 100644
> --- a/kernel/sched/debug.c
> +++ b/kernel/sched/debug.c
> @@ -631,6 +631,7 @@ void update_sched_domain_debugfs(void)
> i++;
> }
>
> + debugfs_create_u32("llc_id", 0444, d_cpu, (u32 *)per_cpu_ptr(&sd_llc_id, cpu));
> __cpumask_clear_cpu(cpu, sd_sysctl_cpus);
> }
> }
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 3ceaa9dc9a9e..69fad88b57d8 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -2142,6 +2142,7 @@ extern int group_balance_cpu(struct sched_group *sg);
>
> extern void update_sched_domain_debugfs(void);
> extern void dirty_sched_domain_sysctl(int cpu);
> +void sched_domain_free_llc_id(int cpu);
>
> extern int sched_update_scaling(void);
>
> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> index cf643a5ddedd..d6e134767f30 100644
> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c
> @@ -20,6 +20,46 @@ void sched_domains_mutex_unlock(void)
> /* Protected by sched_domains_mutex: */
> static cpumask_var_t sched_domains_tmpmask;
> static cpumask_var_t sched_domains_tmpmask2;
> +static cpumask_var_t sched_llc_id_alloc_mask;
> +DEFINE_PER_CPU(int, sd_llc_id) = -1;
> +static int max_llcs = 0;
> +
> +static inline int sched_domain_alloc_llc_id(void)
> +{
> + int llc_id;
> +
> + lockdep_assert_held(&sched_domains_mutex);
> +
> + llc_id = cpumask_first_zero(sched_llc_id_alloc_mask);
> + BUG_ON((unsigned int)llc_id >= nr_cpumask_bits);
> + cpumask_set_cpu(llc_id, sched_llc_id_alloc_mask);
> + ++max_llcs;
> +
> + return llc_id;
> +}
> +
> +void sched_domain_free_llc_id(int cpu)
> +{
> + int i, llc_id = per_cpu(sd_llc_id, cpu);
> + bool found = false;
> +
> + lockdep_assert_cpus_held(); /* For cpu_active_mask. */
> + guard(mutex)(&sched_domains_mutex);
> +
> + per_cpu(sd_llc_id, cpu) = -1;
> + for_each_cpu(i, cpu_active_mask) {
> + if (per_cpu(sd_llc_id, i) == llc_id) {
> + found = true;
> + break;
> + }
> + }
> +
> + /* Allow future hotplugs to claim this ID */
> + if (!found) {
> + cpumask_clear_cpu(llc_id, sched_llc_id_alloc_mask);
> + --max_llcs;
Maybe only allow max_llcs to increase when a new LLC is detected.
That is, max_llcs represents the total number of LLCs that have ever
been detected, even if some of the corresponding CPUs have since been
taken offline via runtime hotplug. In this way, the data
synchronization might be simpler - trading a little additional memory
for code simplicity?
> + }
> +}
>
> static int __init sched_debug_setup(char *str)
> {
> @@ -658,7 +698,6 @@ static void destroy_sched_domains(struct sched_domain *sd)
> */
> DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
> DEFINE_PER_CPU(int, sd_llc_size);
> -DEFINE_PER_CPU(int, sd_llc_id);
> DEFINE_PER_CPU(int, sd_share_id);
> DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
> DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
> @@ -684,7 +723,6 @@ static void update_top_cache_domain(int cpu)
>
> rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
> per_cpu(sd_llc_size, cpu) = size;
> - per_cpu(sd_llc_id, cpu) = id;
> rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
>
> sd = lowest_flag_domain(cpu, SD_CLUSTER);
> @@ -2567,10 +2605,35 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
>
> /* Set up domains for CPUs specified by the cpu_map: */
> for_each_cpu(i, cpu_map) {
> - struct sched_domain_topology_level *tl;
> + struct sched_domain_topology_level *tl, *tl_llc = NULL;
> + bool done = false;
>
> sd = NULL;
> for_each_sd_topology(tl) {
> + int flags = 0;
> +
> + if (tl->sd_flags)
> + flags = (*tl->sd_flags)();
> +
> + if (flags & SD_SHARE_LLC) {
> + tl_llc = tl;
> +
> + /*
> + * Entire cpu_map has been covered. We are
> + * traversing only to find the highest
> + * SD_SHARE_LLC level.
> + */
> + if (done)
> + continue;
> + }
> +
> + /*
> + * Since SD_SHARE_LLC is SDF_SHARED_CHILD, we can
> + * safely break out if the entire cpu_map has been
> + * covered by a child domain.
> + */
> + if (done)
> + break;
>
> sd = build_sched_domain(tl, cpu_map, attr, sd, i);
>
> @@ -2579,7 +2642,41 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
> if (tl == sched_domain_topology)
> *per_cpu_ptr(d.sd, i) = sd;
> if (cpumask_equal(cpu_map, sched_domain_span(sd)))
> - break;
> + done = true;
> + }
> +
> + /* First time visiting this CPU. Assign the llc_id. */
> + if (per_cpu(sd_llc_id, i) == -1) {
> + int j, llc_id = -1;
> +
> + /*
> + * In case there are no SD_SHARE_LLC domains,
> + * each CPU gets its own llc_id. Find the first
> + * free bit on the mask and use it.
> + */
> + if (!tl_llc) {
> + per_cpu(sd_llc_id, i) = sched_domain_alloc_llc_id();
> + continue;
> + }
> +
> + /*
> + * Visit all the CPUs of the LLC irrespective of the
> + * partition constraints and find if any of them have
> + * a valid llc_id.
> + */
> + for_each_cpu(j, tl_llc->mask(tl, i)) {
This is doable; we can use the tl->mask() rather than the domain's
span to share the llc_id among partitions.
> + llc_id = per_cpu(sd_llc_id, j);
> +
> + /* Found a valid llc_id for CPU's LLC. */
> + if (llc_id != -1)
> + break;
> + }
> +
> + /* Valid llc_id not found. Allocate a new one. */
> + if (llc_id == -1)
> + llc_id = sched_domain_alloc_llc_id();
> +
> + per_cpu(sd_llc_id, i) = llc_id;
> }
> }
>
> @@ -2759,6 +2856,7 @@ int __init sched_init_domains(const struct cpumask *cpu_map)
>
> zalloc_cpumask_var(&sched_domains_tmpmask, GFP_KERNEL);
> zalloc_cpumask_var(&sched_domains_tmpmask2, GFP_KERNEL);
> + zalloc_cpumask_var(&sched_llc_id_alloc_mask, GFP_KERNEL);
> zalloc_cpumask_var(&fallback_doms, GFP_KERNEL);
>
> arch_update_cpu_topology();
> ---
>
> AFAICT, "sd_llc_id" isn't compared across different partitions so having
> the CPUs that are actually associated with same physical LLC but across
> different partitions sharing the same "sd_llc_id" shouldn't be a problem.
>
> Thoughts?
>
This means cpus_share_resources(int this_cpu, int that_cpu)
should be invoked when this_cpu and that_cpu belong to the same
partition.
In this way, we do not alter the context of cpus_share_resources(). We can
conduct an audit of the places where cpus_share_resources() is used.
Happy holidays,
Chenyu
Hello Chenyu,
On 12/24/2025 12:38 PM, Chen, Yu C wrote:
> Hello Prateek,
>
> On 12/23/2025 1:31 PM, K Prateek Nayak wrote:
>> Hello Tim, Chenyu,
>>
>> On 12/4/2025 4:37 AM, Tim Chen wrote:
>>> +/*
>>> + * Assign continuous llc id for the CPU, and return
>>> + * the assigned llc id.
>>> + */
>>> +static int update_llc_id(struct sched_domain *sd,
>>> + int cpu)
>>> +{
>>> + int id = per_cpu(sd_llc_id, cpu), i;
>>> +
>>> + if (id >= 0)
>>> + return id;
>>> +
>>> + if (sd) {
>>> + /* Look for any assigned id and reuse it.*/
>>> + for_each_cpu(i, sched_domain_span(sd)) {
>>> + id = per_cpu(sd_llc_id, i);
>>> +
>>> + if (id >= 0) {
>>> + per_cpu(sd_llc_id, cpu) = id;
>>> + return id;
>>> + }
>>> + }
>>> + }
>>
>> I don't really like tying this down to the sched_domain span since
>> partition and other weirdness can cause the max_llc count to go
>> unnecessarily high. The tl->mask() (from sched_domain_topology_level)
>> should give the mask considering all online CPUs and not bothering
>> about cpusets.
>
> OK, using the topology_level's mask (tl's mask) should allow us to
> skip the cpuset partition. I just wanted to check if your concern
> is about the excessive number of sd_llc_ids caused by the cpuset?
Yes. Basically all cases where sched_domain_span() isn't covering
the entire llc_span - even true for isolated partitions.
>
> I was under the impression that without this patch, llc_ids are
> unique across different partitions.
>
> For example, on vanilla kernel without cache_aware,
> suppose 1 LLC has CPU0,1,2,3. Before partition, all
> CPUs have the same llc_id 0. Then create a new partition,
> mkdir -p /sys/fs/cgroup/cgroup0
> echo "3" > /sys/fs/cgroup/cgroup0/cpuset.cpus
> echo root > /sys/fs/cgroup/cgroup0/cpuset.cpus.partition
> CPU0,1,2 share llc_id 0, and CPU3 has a dedicated llc_id 3.
> Do you suggest to let CPU3 reuse llc_id 0, so as to save
> more llc_id space?
Yes. And I think it is logical. Load balancing doesn't happen
across partitions so sd_llc_id reflecting the ID of the
physical LLC shouldn't be a problem.
>
>>
>> How about something like:
>>
>> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
>> index 5b17d8e3cb55..c19b1c4e6472 100644
>> --- a/kernel/sched/core.c
>> +++ b/kernel/sched/core.c
>> @@ -8270,6 +8270,18 @@ static void cpuset_cpu_active(void)
>> static void cpuset_cpu_inactive(unsigned int cpu)
>> {
>> if (!cpuhp_tasks_frozen) {
>> + /*
>> + * This is necessary since offline CPUs are
>> + * taken out of the tl->mask() and a newly
>> + * onlined CPU in same LLC will not realize
>> + * whether it should reuse the LLC ID owned
>> + * by an offline CPU without knowing the
>> + * LLC association.
>> + *
>> + * Safe to release the reference if this is
>> + * the last CPU in the LLC going offline.
>> + */
>> + sched_domain_free_llc_id(cpu);
>
> I'm OK with replacing the domain based cpumask by the topology_level
> mask, just wondering whether re-using the llc_id would increase
> the risk of race condition - it is possible that, a CPU has different
> llc_ids before/after online/offline. Can we assign/reserve a "static"
> llc_id for each CPU, whether it is online or offline? In this way,
> we don't need to worry about the data synchronization when using
> llc_id(). For example, I can think of adjusting the data in
> percpu nr_pref_llc[max_llcs] on every CPU whenever a CPU gets
> offline/online.
So I was thinking of expanding the rq->nr_pref_llc[] if
max_llcs increases but leaving it as is if the number of LLCs
decreases. That way we don't have to worry about
dereferencing past the array boundary.
We can also have a wrapper like:
struct nr_llc_stats {
        int nr_llcs;
        struct rcu_head rcu;
        int *nr_pref_llc;
};
And re-allocate and attach it in rq_attach_root() during sd
rebuild. That way, RCU read-side can always grab a reference to
it, enqueue / dequeue don't need to care since it cannot change
under rq_lock, and partition can use call_rcu() to free the old
ones up.
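A minimal sketch of that direction (hypothetical rq->llc_stats
pointer and helper names, flexible array instead of the bare pointer
above, untested):

        struct nr_llc_stats {
                int             nr_llcs;
                struct rcu_head rcu;
                int             nr_pref_llc[];  /* nr_llcs entries */
        };

        /* Read side: bound and array always come from the same snapshot. */
        static int rq_nr_pref_llc(struct rq *rq, int llc)
        {
                struct nr_llc_stats *stats;
                int nr = 0;

                rcu_read_lock();
                stats = rcu_dereference(rq->llc_stats);
                if (stats && llc >= 0 && llc < stats->nr_llcs)
                        nr = stats->nr_pref_llc[llc];
                rcu_read_unlock();

                return nr;
        }

        /* Rebuild side, under sched_domains_mutex: grow and publish. */
        static int rq_resize_llc_stats(struct rq *rq, int new_nr_llcs)
        {
                struct nr_llc_stats *old, *new;

                new = kzalloc(struct_size(new, nr_pref_llc, new_nr_llcs), GFP_KERNEL);
                if (!new)
                        return -ENOMEM;
                new->nr_llcs = new_nr_llcs;

                old = rcu_dereference_protected(rq->llc_stats,
                                                lockdep_is_held(&sched_domains_mutex));
                if (old)
                        memcpy(new->nr_pref_llc, old->nr_pref_llc,
                               min(old->nr_llcs, new_nr_llcs) * sizeof(int));

                rcu_assign_pointer(rq->llc_stats, new);
                if (old)
                        kfree_rcu(old, rcu);
                return 0;
        }

The point is only that the bound and the buffer always travel
together; how the counters themselves are updated under the rq lock
is a separate question.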
>
>> cpuset_update_active_cpus();
>> } else {
>> num_cpus_frozen++;
>> diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
>> index 41caa22e0680..1378a1cfad18 100644
>> --- a/kernel/sched/debug.c
>> +++ b/kernel/sched/debug.c
>> @@ -631,6 +631,7 @@ void update_sched_domain_debugfs(void)
>> i++;
>> }
>> + debugfs_create_u32("llc_id", 0444, d_cpu, (u32 *)per_cpu_ptr(&sd_llc_id, cpu));
>> __cpumask_clear_cpu(cpu, sd_sysctl_cpus);
>> }
>> }
>> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
>> index 3ceaa9dc9a9e..69fad88b57d8 100644
>> --- a/kernel/sched/sched.h
>> +++ b/kernel/sched/sched.h
>> @@ -2142,6 +2142,7 @@ extern int group_balance_cpu(struct sched_group *sg);
>> extern void update_sched_domain_debugfs(void);
>> extern void dirty_sched_domain_sysctl(int cpu);
>> +void sched_domain_free_llc_id(int cpu);
>> extern int sched_update_scaling(void);
>> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
>> index cf643a5ddedd..d6e134767f30 100644
>> --- a/kernel/sched/topology.c
>> +++ b/kernel/sched/topology.c
>> @@ -20,6 +20,46 @@ void sched_domains_mutex_unlock(void)
>> /* Protected by sched_domains_mutex: */
>> static cpumask_var_t sched_domains_tmpmask;
>> static cpumask_var_t sched_domains_tmpmask2;
>> +static cpumask_var_t sched_llc_id_alloc_mask;
>> +DEFINE_PER_CPU(int, sd_llc_id) = -1;
>> +static int max_llcs = 0;
>> +
>> +static inline int sched_domain_alloc_llc_id(void)
>> +{
>> + int llc_id;
>> +
>> + lockdep_assert_held(&sched_domains_mutex);
>> +
>> + llc_id = cpumask_first_zero(sched_llc_id_alloc_mask);
>> + BUG_ON((unsigned int)llc_id >= nr_cpumask_bits);
>> + cpumask_set_cpu(llc_id, sched_llc_id_alloc_mask);
>> + ++max_llcs;
>> +
>> + return llc_id;
>> +}
>> +
>> +void sched_domain_free_llc_id(int cpu)
>> +{
>> + int i, llc_id = per_cpu(sd_llc_id, cpu);
>> + bool found = false;
>> +
>> + lockdep_assert_cpus_held(); /* For cpu_active_mask. */
>> + guard(mutex)(&sched_domains_mutex);
>> +
>> + per_cpu(sd_llc_id, cpu) = -1;
>> + for_each_cpu(i, cpu_active_mask) {
>> + if (per_cpu(sd_llc_id, i) == llc_id) {
>> + found = true;
>> + break;
>> + }
>> + }
>> +
>> + /* Allow future hotplugs to claim this ID */
>> + if (!found) {
>> + cpumask_clear_cpu(llc_id, sched_llc_id_alloc_mask);
>> + --max_llcs;
>
> Maybe only allow increasing the value of max_llcs when a new LLC
> is detected. That says, max_llcs represents the total number of LLCs
> that have ever been detected, even if some of the corresponding
> CPUs have been taken offline via runtime hotplug. In this way, the
> data synchronization might be simpler, maybe trade additional memory
> space for code simplicity?
Ack. That "struct nr_llc_stats" might be over-engineering.
I don't mind working on it later after this goes in.
>
>> + }
>> +}
>> static int __init sched_debug_setup(char *str)
>> {
>> @@ -658,7 +698,6 @@ static void destroy_sched_domains(struct sched_domain *sd)
>> */
>> DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
>> DEFINE_PER_CPU(int, sd_llc_size);
>> -DEFINE_PER_CPU(int, sd_llc_id);
>> DEFINE_PER_CPU(int, sd_share_id);
>> DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
>> DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
>> @@ -684,7 +723,6 @@ static void update_top_cache_domain(int cpu)
>> rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
>> per_cpu(sd_llc_size, cpu) = size;
>> - per_cpu(sd_llc_id, cpu) = id;
>> rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
>> sd = lowest_flag_domain(cpu, SD_CLUSTER);
>> @@ -2567,10 +2605,35 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
>> /* Set up domains for CPUs specified by the cpu_map: */
>> for_each_cpu(i, cpu_map) {
>> - struct sched_domain_topology_level *tl;
>> + struct sched_domain_topology_level *tl, *tl_llc = NULL;
>> + bool done = false;
>> sd = NULL;
>> for_each_sd_topology(tl) {
>> + int flags = 0;
>> +
>> + if (tl->sd_flags)
>> + flags = (*tl->sd_flags)();
>> +
>> + if (flags & SD_SHARE_LLC) {
>> + tl_llc = tl;
>> +
>> + /*
>> + * Entire cpu_map has been covered. We are
>> + * traversing only to find the highest
>> + * SD_SHARE_LLC level.
>> + */
>> + if (done)
>> + continue;
>> + }
>> +
>> + /*
>> + * Since SD_SHARE_LLC is SDF_SHARED_CHILD, we can
>> + * safely break out if the entire cpu_map has been
>> + * covered by a child domain.
>> + */
>> + if (done)
>> + break;
>> sd = build_sched_domain(tl, cpu_map, attr, sd, i);
>> @@ -2579,7 +2642,41 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
>> if (tl == sched_domain_topology)
>> *per_cpu_ptr(d.sd, i) = sd;
>> if (cpumask_equal(cpu_map, sched_domain_span(sd)))
>> - break;
>> + done = true;
>> + }
>> +
>> + /* First time visiting this CPU. Assign the llc_id. */
>> + if (per_cpu(sd_llc_id, i) == -1) {
>> + int j, llc_id = -1;
>> +
>> + /*
>> + * In case there are no SD_SHARE_LLC domains,
>> + * each CPU gets its own llc_id. Find the first
>> + * free bit on the mask and use it.
>> + */
>> + if (!tl_llc) {
>> + per_cpu(sd_llc_id, i) = sched_domain_alloc_llc_id();
>> + continue;
>> + }
>> +
>> + /*
>> + * Visit all the CPUs of the LLC irrespective of the
>> + * partition constraints and find if any of them have
>> + * a valid llc_id.
>> + */
>> + for_each_cpu(j, tl_llc->mask(tl, i)) {
>
> This is doable, we can use tl rather than domain's mask to
> share llc_id among partitions.
>
>> + llc_id = per_cpu(sd_llc_id, j);
>> +
>> + /* Found a valid llc_id for CPU's LLC. */
>> + if (llc_id != -1)
>> + break;
>> + }
>> +
>> + /* Valid llc_id not found. Allocate a new one. */
>> + if (llc_id == -1)
>> + llc_id = sched_domain_alloc_llc_id();
>> +
>> + per_cpu(sd_llc_id, i) = llc_id;
>> }
>> }
>> @@ -2759,6 +2856,7 @@ int __init sched_init_domains(const struct cpumask *cpu_map)
>> zalloc_cpumask_var(&sched_domains_tmpmask, GFP_KERNEL);
>> zalloc_cpumask_var(&sched_domains_tmpmask2, GFP_KERNEL);
>> + zalloc_cpumask_var(&sched_llc_id_alloc_mask, GFP_KERNEL);
>> zalloc_cpumask_var(&fallback_doms, GFP_KERNEL);
>> arch_update_cpu_topology();
>> ---
>>
>> AFAICT, "sd_llc_id" isn't compared across different partitions so having
>> the CPUs that are actually associated with same physical LLC but across
>> different partitions sharing the same "sd_llc_id" shouldn't be a problem.
>>
>> Thoughts?
>>
>
> This means cpus_share_resources(int this_cpu, int that_cpu)
> should be invoked when this_cpu and that_cpu belong to the same partition.
> In this way, we do not alter the context of cpus_share_resources(). We can
> conduct an audit of the places where cpus_share_resources() is used.
The only case I can think of is a task that wakes up after partitioning
and its wake cpu from a different partition is mistaken to
share the LLC with the current CPU - but the task cannot actually
run on that old CPU and it'll have to take the
select_fallback_rq() path if prev_cpu was selected during
wake_affine().
I don't think it will be a common enough occurrence to cause an
issue, and even without that, wake_affine() could still pick the
prev_cpu if the current CPU is busy, or via wake_affine_weight().
--
Happy holidays!
Thanks and Regards,
Prateek
On 12/24/2025 4:19 PM, K Prateek Nayak wrote:
> Hello Chenyu,
>
> On 12/24/2025 12:38 PM, Chen, Yu C wrote:
>> Hello Prateek,
>>
>> On 12/23/2025 1:31 PM, K Prateek Nayak wrote:
>>> Hello Tim, Chenyu,
>>>
>>> On 12/4/2025 4:37 AM, Tim Chen wrote:
[snip]
>> I'm OK with replacing the domain based cpumask by the topology_level
>> mask, just wondering whether re-using the llc_id would increase
>> the risk of race condition - it is possible that, a CPU has different
>> llc_ids before/after online/offline. Can we assign/reserve a "static"
>> llc_id for each CPU, whether it is online or offline? In this way,
>> we don't need to worry about the data synchronization when using
>> llc_id(). For example, I can think of adjusting the data in
>> percpu nr_pref_llc[max_llcs] on every CPU whenever a CPU gets
>> offline/online.
>
> So I was thinking of of expanding the rq->nr_pref_llc[] if the
> max_llc increases but leave it as is if the number of LLCs
> decreases. That way we don't have to worry about the
> dereferencing past the array boundary.
>
Sure, we can do it this way.
> We can also have a wrapper like:
>
> struct nr_llc_stats {
> int nr_llcs;
> struct rcu_head rcu;
> int *nr_pref_llc;
> }
>
> And re-allocate and attach it in rq_attach_root() during sd
> rebuild. That way, RCU read-side can always grab a reference to
> it, enqueue / dequeue don't need to care since it cannot change
> under rq_lock, and partition can use call_rcu() to free the old
> ones up.
>
OK, we can go in this direction (and Peter also suggested something
like this in the domain).
>>
>>> cpuset_update_active_cpus();
>>> } else {
[snip]
>>> AFAICT, "sd_llc_id" isn't compared across different partitions so having
>>> the CPUs that are actually associated with same physical LLC but across
>>> different partitions sharing the same "sd_llc_id" shouldn't be a problem.
>>>
>>> Thoughts?
>>>
>>
>> This means cpus_share_resources(int this_cpu, int that_cpu)
Actually I was about to say cpus_share_cache().
>> should be invoked when this_cpu and that_cpu belong to the same partition.
>> In this way, we do not alter the context of cpus_share_resources(). We can
>> conduct an audit of the places where cpus_share_resources() is used.
>
> Only case I can think of is a task wakes up after partitioning
> and it's wake cpu from a different partition is mistaken to
> share the LLC as the current CPU - but the task cannot actually
> run on that old CPU and it'll have to take the
> select_fallback_rq() path if prev_cpu was selected during
> wake_affine().
>
OK, makes sense.
Actually, prev_cpu might not be chosen by select_task_rq_fair()->
select_idle_sibling(), because the select_idle_sibling() fast path is
only expected to be taken when prev_cpu and the current cpu are in the
same domain in select_task_rq_fair() (see the paraphrased loop below):
        cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))
        sd = NULL; /* wake affine */
If the current cpu and prev_cpu are in different partitions, they are
not in the same domains.
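For reference, the relevant loop in select_task_rq_fair() looks
roughly like this (paraphrased; see the current tree for the exact
code):

        for_each_domain(cpu, tmp) {
                /*
                 * If both 'cpu' and 'prev_cpu' are part of this domain,
                 * cpu is a valid SD_WAKE_AFFINE target.
                 */
                if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
                    cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
                        if (cpu != prev_cpu)
                                new_cpu = wake_affine(tmp, p, cpu, prev_cpu, sync);

                        sd = NULL; /* Prefer wake_affine over balance flags */
                        break;
                }

                if (tmp->flags & sd_flag)
                        sd = tmp;
                else if (!want_affine)
                        break;
        }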
> I don't think it will be such a common occurence to cause an
> issue and even without that wake_affine() could still the
> prev_cpu if current CPU is busy or via wake_affine_weight().
>
I realized that sched_cache has added cpus_share_cache() calls in
several places, most of which should be related to load balancing,
so sharing llc_id among partitions should not be a problem there.
I'll double check.
thanks,
Chenyu
Hello Chenyu,

On 12/24/2025 3:16 PM, Chen, Yu C wrote:

>> Only case I can think of is a task wakes up after partitioning
>> and it's wake cpu from a different partition is mistaken to
>> share the LLC as the current CPU - but the task cannot actually
>> run on that old CPU and it'll have to take the
>> select_fallback_rq() path if prev_cpu was selected during
>> wake_affine().
>>
>
> OK, make sense.
> Actually, prev_cpu might not be chosen by select_task_rq_fair()->
> select_idle_sibling(), because fast path select_idle_sibling()
> is expected to be triggered when prev_cpu and the current cpu are in the
> same domain in select_task_rq_fair():
> cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))
> sd = NULL; //wake affine

Then again, there are cases where want_affine is false, and "new_cpu"
is initialized to the prev_cpu and we continue down to
select_idle_sibling() since "tmp->flags & sd_flag" is always false -
WF_TTWU matches with SD_BALANCE_WAKE but no domain sets it anymore
afaict.

Again, going through the fallback selection path should be rare (once
after partition on wakeup) and shouldn't cause any problems for most
real-world scenarios.

> curr cpu and prev_cpu are in different partitions, they
> are not in the same domains.
>
>> I don't think it will be such a common occurence to cause an
>> issue and even without that wake_affine() could still the
>> prev_cpu if current CPU is busy or via wake_affine_weight().
>>
>
> I realized that sched_cache has added cpus_share_cache() in
> several places, most of which should be related to load
> balancing, which should not be a problem if llc_id is shared
> among partitions. I'll double check.

Thank you!

--
Happy Holidays!
Thanks and Regards,
Prateek
On Wed, Dec 03, 2025 at 03:07:23PM -0800, Tim Chen wrote:
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 710ed9943d27..0a3918269906 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -1210,10 +1210,17 @@ __read_mostly unsigned int llc_imb_pct = 20;
>
> static int llc_id(int cpu)
> {
> + int llc;
> +
> if (cpu < 0)
> return -1;
>
> + llc = per_cpu(sd_llc_id, cpu);
> + /* avoid race with cpu hotplug */
> + if (unlikely(llc >= max_llcs))
> + return -1;
> +
> + return llc;
> }
>
> void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *_pcpu_sched)
> @@ -668,6 +670,55 @@ DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
> DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
> DEFINE_STATIC_KEY_FALSE(sched_cluster_active);
>
> +/*
> + * Assign continuous llc id for the CPU, and return
> + * the assigned llc id.
> + */
> +static int update_llc_id(struct sched_domain *sd,
> + int cpu)
> +{
> + int id = per_cpu(sd_llc_id, cpu), i;
> +
> + if (id >= 0)
> + return id;
> +
> + if (sd) {
> + /* Look for any assigned id and reuse it.*/
> + for_each_cpu(i, sched_domain_span(sd)) {
> + id = per_cpu(sd_llc_id, i);
> +
> + if (id >= 0) {
> + per_cpu(sd_llc_id, cpu) = id;
> + return id;
> + }
> + }
> + }
> +
> + /*
> + * When 1. there is no id assigned to this LLC domain,
> + * or 2. the sd is NULL, we reach here.
> + * Consider the following scenario,
> + * CPU0~CPU95 are in the node0, CPU96~CPU191 are
> + * in the node1. During bootup, maxcpus=96 is
> + * appended.
> + * case 1: When running cpu_attach_domain(CPU24)
> + * during boot up, CPU24 is the first CPU in its
> + * non-NULL LLC domain. However,
> + * its corresponding llc id has not been assigned yet.
> + *
> + * case 2: After boot up, the CPU100 is brought up
> + * via sysfs manually. As a result, CPU100 has only a
> + * Numa domain attached, because CPU100 is the only CPU
> + * of a sched domain, all its bottom domains are degenerated.
> + * The LLC domain pointer sd is NULL for CPU100.
> + *
> + * For both cases, we want to increase the number of LLCs.
> + */
> + per_cpu(sd_llc_id, cpu) = max_llcs++;
> +
> + return per_cpu(sd_llc_id, cpu);
> +}
I'm not sure I follow. So partition_sched_domains() first calls
detach_destroy_domains() on the old set, and then build_sched_domains()
on the new set.
And detach_destroy_domain() will do:
cpu_attach_domain(NULL,..);
That is, it will explicitly attach the NULL sched_domain to a CPU. At
which point I feel update_llc_id() should be returning -1, no?
Then later, build_sched_domains() will set a !NULL sched_domain, at
which point update_llc_id() can set a real value.
This should then also get rid of that weird max_llcs check in llc_id(),
right?
On Tue, 2025-12-09 at 12:58 +0100, Peter Zijlstra wrote:
> On Wed, Dec 03, 2025 at 03:07:23PM -0800, Tim Chen wrote:
>
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index 710ed9943d27..0a3918269906 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -1210,10 +1210,17 @@ __read_mostly unsigned int llc_imb_pct = 20;
> >
> > static int llc_id(int cpu)
> > {
> > + int llc;
> > +
> > if (cpu < 0)
> > return -1;
> >
> > + llc = per_cpu(sd_llc_id, cpu);
> > + /* avoid race with cpu hotplug */
> > + if (unlikely(llc >= max_llcs))
> > + return -1;
> > +
> > + return llc;
> > }
> >
> > void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *_pcpu_sched)
>
> > @@ -668,6 +670,55 @@ DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
> > DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
> > DEFINE_STATIC_KEY_FALSE(sched_cluster_active);
> >
> > +/*
> > + * Assign continuous llc id for the CPU, and return
> > + * the assigned llc id.
> > + */
> > +static int update_llc_id(struct sched_domain *sd,
> > + int cpu)
> > +{
> > + int id = per_cpu(sd_llc_id, cpu), i;
> > +
> > + if (id >= 0)
> > + return id;
> > +
> > + if (sd) {
> > + /* Look for any assigned id and reuse it.*/
> > + for_each_cpu(i, sched_domain_span(sd)) {
> > + id = per_cpu(sd_llc_id, i);
> > +
> > + if (id >= 0) {
> > + per_cpu(sd_llc_id, cpu) = id;
> > + return id;
> > + }
> > + }
> > + }
> > +
> > + /*
> > + * When 1. there is no id assigned to this LLC domain,
> > + * or 2. the sd is NULL, we reach here.
> > + * Consider the following scenario,
> > + * CPU0~CPU95 are in the node0, CPU96~CPU191 are
> > + * in the node1. During bootup, maxcpus=96 is
> > + * appended.
> > + * case 1: When running cpu_attach_domain(CPU24)
> > + * during boot up, CPU24 is the first CPU in its
> > + * non-NULL LLC domain. However,
> > + * its corresponding llc id has not been assigned yet.
> > + *
> > + * case 2: After boot up, the CPU100 is brought up
> > + * via sysfs manually. As a result, CPU100 has only a
> > + * Numa domain attached, because CPU100 is the only CPU
> > + * of a sched domain, all its bottom domains are degenerated.
> > + * The LLC domain pointer sd is NULL for CPU100.
> > + *
> > + * For both cases, we want to increase the number of LLCs.
> > + */
> > + per_cpu(sd_llc_id, cpu) = max_llcs++;
> > +
> > + return per_cpu(sd_llc_id, cpu);
> > +}
>
> I'm not sure I follow. So partition_sched_domains() first calls
> detach_destroy_domains() on the old set, and then build_sched_domains()
> on the new set.
>
> Do detach_destroy_domain() will do:
>
> cpu_attach_domain(NULL,..);
>
> That is, it will explicitly attach the NULL sched_domain to a CPU. At
> which point I feel update_llc_id() should be returning -1, no?
>
> Then later, build_sched_domains() will set a !NULL sched_domain, at
> which point update_llc_id() can set a real value.
>
> This should then also get rid of that weird max_llcs check in llc_id(),
> right?
Thanks for pointing this out. Yes, we should take care of the
attachment of NULL sd. Will update the code accordingly.
Tim
On 12/16/2025 4:49 AM, Tim Chen wrote:
> On Tue, 2025-12-09 at 12:58 +0100, Peter Zijlstra wrote:
>> On Wed, Dec 03, 2025 at 03:07:23PM -0800, Tim Chen wrote:
>>
>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>> index 710ed9943d27..0a3918269906 100644
>>> --- a/kernel/sched/fair.c
>>> +++ b/kernel/sched/fair.c
>>> @@ -1210,10 +1210,17 @@ __read_mostly unsigned int llc_imb_pct = 20;
>>>
>>> static int llc_id(int cpu)
>>> {
>>> + int llc;
>>> +
>>> if (cpu < 0)
>>> return -1;
>>>
>>> + llc = per_cpu(sd_llc_id, cpu);
>>> + /* avoid race with cpu hotplug */
>>> + if (unlikely(llc >= max_llcs))
>>> + return -1;
>>> +
>>> + return llc;
>>> }
>>>
>>> void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *_pcpu_sched)
>>
>>> @@ -668,6 +670,55 @@ DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
>>> DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
>>> DEFINE_STATIC_KEY_FALSE(sched_cluster_active);
>>>
>>> +/*
>>> + * Assign continuous llc id for the CPU, and return
>>> + * the assigned llc id.
>>> + */
>>> +static int update_llc_id(struct sched_domain *sd,
>>> + int cpu)
>>> +{
>>> + int id = per_cpu(sd_llc_id, cpu), i;
>>> +
>>> + if (id >= 0)
>>> + return id;
>>> +
>>> + if (sd) {
>>> + /* Look for any assigned id and reuse it.*/
>>> + for_each_cpu(i, sched_domain_span(sd)) {
>>> + id = per_cpu(sd_llc_id, i);
>>> +
>>> + if (id >= 0) {
>>> + per_cpu(sd_llc_id, cpu) = id;
>>> + return id;
>>> + }
>>> + }
>>> + }
>>> +
>>> + /*
>>> + * When 1. there is no id assigned to this LLC domain,
>>> + * or 2. the sd is NULL, we reach here.
>>> + * Consider the following scenario,
>>> + * CPU0~CPU95 are in the node0, CPU96~CPU191 are
>>> + * in the node1. During bootup, maxcpus=96 is
>>> + * appended.
>>> + * case 1: When running cpu_attach_domain(CPU24)
>>> + * during boot up, CPU24 is the first CPU in its
>>> + * non-NULL LLC domain. However,
>>> + * its corresponding llc id has not been assigned yet.
>>> + *
>>> + * case 2: After boot up, the CPU100 is brought up
>>> + * via sysfs manually. As a result, CPU100 has only a
>>> + * Numa domain attached, because CPU100 is the only CPU
>>> + * of a sched domain, all its bottom domains are degenerated.
>>> + * The LLC domain pointer sd is NULL for CPU100.
>>> + *
>>> + * For both cases, we want to increase the number of LLCs.
>>> + */
>>> + per_cpu(sd_llc_id, cpu) = max_llcs++;
>>> +
>>> + return per_cpu(sd_llc_id, cpu);
>>> +}
>>
>> I'm not sure I follow. So partition_sched_domains() first calls
>> detach_destroy_domains() on the old set, and then build_sched_domains()
>> on the new set.
>>
>> Do detach_destroy_domain() will do:
>>
>> cpu_attach_domain(NULL,..);
>>
>> That is, it will explicitly attach the NULL sched_domain to a CPU. At
>> which point I feel update_llc_id() should be returning -1, no?
>>
>> Then later, build_sched_domains() will set a !NULL sched_domain, at
>> which point update_llc_id() can set a real value.
>>
>> This should then also get rid of that weird max_llcs check in llc_id(),
>> right?
The check for max_llcs was intended to prevent out-of-bounds access
to rq->nr_pref_llc[] at multiple points in the code.
Since dst_llc = llc_id(env->dst_cpu), and the LLC ID for the CPU is
updated in update_llc_id() before we reallocate the nr_pref_llc
buffer, dst_llc may end up exceeding the bounds of the original
nr_pref_llc buffer.
For this reason, we added the check if (llc >= max_llcs) in llc_id()
before attempting to access rq->nr_pref_llc[dst_llc].
However, I agree that the max_llcs check is not properly integrated
into the current patch: it should instead be placed in the 7th patch,
as that would better illustrate the rationale for the check:
sched/cache: Introduce per runqueue task LLC preference counter
In the 7th patch, we actually increment new_max_llcs rather than
max_llcs, meaning max_llcs always represents the "old" number of LLCs.
As a result, there is a race window between extending the rq->nr_pref_llc
buffer and updating max_llcs.
@@ -714,7 +827,7 @@ static int update_llc_id(struct sched_domain *sd,
*
* For both cases, we want to increase the number of LLCs.
*/
- per_cpu(sd_llc_id, cpu) = max_llcs++;
+ per_cpu(sd_llc_id, cpu) = new_max_llcs++;
return per_cpu(sd_llc_id, cpu);
}
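
For reference, the access this guard protects looks roughly like the
following (simplified from the 7th patch as described above; the
exact code may differ):

        int dst_llc = llc_id(env->dst_cpu);

        /*
         * llc_id() returns -1 while the id is still beyond the old
         * max_llcs, i.e. before nr_pref_llc[] has been resized.
         */
        if (dst_llc >= 0)
                env->dst_rq->nr_pref_llc[dst_llc]++;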
> Thanks for pointing this out. Yes, we should take care of the
> attachment of NULL sd. Will update the code accordingly.
>
My understanding is that, if the sd is NULL, it is either because we
are invoked by detach_destroy_domain() for the old set, or because of
case 2 mentioned in the comments above:
Say CPU0-CPU95 are online during bootup and the boot command line has
maxcpus=96.
Later, after bootup, the user brings up CPU100; the LLC domain for
CPU100 is NULL in this case (due to sd degeneration), and a new LLC
should be detected.
That is to say, when we reach update_llc_id(), there can be 2 reasons
for a NULL sd. For the detach_destroy_domain() case, update_llc_id()
should return a valid id without increasing max_llcs, because of
        if (id >= 0)
                return id;
And for the latter case, max_llcs should be increased.
Let me double check on this.
thanks,
Chenyu
> Tim
On Tue, 2025-12-16 at 13:31 +0800, Chen, Yu C wrote:
> On 12/16/2025 4:49 AM, Tim Chen wrote:
> > On Tue, 2025-12-09 at 12:58 +0100, Peter Zijlstra wrote:
> > > On Wed, Dec 03, 2025 at 03:07:23PM -0800, Tim Chen wrote:
> > >
> > > > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > > > index 710ed9943d27..0a3918269906 100644
> > > > --- a/kernel/sched/fair.c
> > > > +++ b/kernel/sched/fair.c
> > > > @@ -1210,10 +1210,17 @@ __read_mostly unsigned int llc_imb_pct = 20;
> > > >
> > > > static int llc_id(int cpu)
> > > > {
> > > > + int llc;
> > > > +
> > > > if (cpu < 0)
> > > > return -1;
> > > >
> > > > + llc = per_cpu(sd_llc_id, cpu);
> > > > + /* avoid race with cpu hotplug */
> > > > + if (unlikely(llc >= max_llcs))
> > > > + return -1;
> > > > +
> > > > + return llc;
> > > > }
> > > >
> > > > void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *_pcpu_sched)
> > >
> > > > @@ -668,6 +670,55 @@ DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
> > > > DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
> > > > DEFINE_STATIC_KEY_FALSE(sched_cluster_active);
> > > >
> > > > +/*
> > > > + * Assign continuous llc id for the CPU, and return
> > > > + * the assigned llc id.
> > > > + */
> > > > +static int update_llc_id(struct sched_domain *sd,
> > > > + int cpu)
> > > > +{
> > > > + int id = per_cpu(sd_llc_id, cpu), i;
> > > > +
> > > > + if (id >= 0)
> > > > + return id;
> > > > +
> > > > + if (sd) {
> > > > + /* Look for any assigned id and reuse it.*/
> > > > + for_each_cpu(i, sched_domain_span(sd)) {
> > > > + id = per_cpu(sd_llc_id, i);
> > > > +
> > > > + if (id >= 0) {
> > > > + per_cpu(sd_llc_id, cpu) = id;
> > > > + return id;
> > > > + }
> > > > + }
> > > > + }
> > > > +
> > > > + /*
> > > > + * When 1. there is no id assigned to this LLC domain,
> > > > + * or 2. the sd is NULL, we reach here.
> > > > + * Consider the following scenario,
> > > > + * CPU0~CPU95 are in the node0, CPU96~CPU191 are
> > > > + * in the node1. During bootup, maxcpus=96 is
> > > > + * appended.
> > > > + * case 1: When running cpu_attach_domain(CPU24)
> > > > + * during boot up, CPU24 is the first CPU in its
> > > > + * non-NULL LLC domain. However,
> > > > + * its corresponding llc id has not been assigned yet.
> > > > + *
> > > > + * case 2: After boot up, the CPU100 is brought up
> > > > + * via sysfs manually. As a result, CPU100 has only a
> > > > + * Numa domain attached, because CPU100 is the only CPU
> > > > + * of a sched domain, all its bottom domains are degenerated.
> > > > + * The LLC domain pointer sd is NULL for CPU100.
> > > > + *
> > > > + * For both cases, we want to increase the number of LLCs.
> > > > + */
> > > > + per_cpu(sd_llc_id, cpu) = max_llcs++;
> > > > +
> > > > + return per_cpu(sd_llc_id, cpu);
> > > > +}
> > >
> > > I'm not sure I follow. So partition_sched_domains() first calls
> > > detach_destroy_domains() on the old set, and then build_sched_domains()
> > > on the new set.
> > >
> > > Do detach_destroy_domain() will do:
> > >
> > > cpu_attach_domain(NULL,..);
> > >
> > > That is, it will explicitly attach the NULL sched_domain to a CPU. At
> > > which point I feel update_llc_id() should be returning -1, no?
> > >
> > > Then later, build_sched_domains() will set a !NULL sched_domain, at
> > > which point update_llc_id() can set a real value.
> > >
> > > This should then also get rid of that weird max_llcs check in llc_id(),
> > > right?
>
> The check for max_llcs was intended to prevent out-of-bounds access
> to rq->nr_pref_llc[] at multiple points in the code.
> Since dst_llc = llc_id(env->dst_cpu); — and while the LLC ID for the
> CPU is updated in update_llc_id(), this update occurs before we reallocate
> the nr_pref_llc buffer — dst_llc may end up exceeding the bounds of the
> original nr_pref_llc buffer.
>
> For this reason, we added a check if (dst_llc > max_llc) in llc_id()
> when attempting to access rq->nr_pref_llc[dst_llc].
>
> However, I agree that the max_llc check seems to not properly integrated
> into the current patch: it should instead be placed in the 7th patch, as
> this would better illustrate the rationale for the max_llc check here:
> sched/cache: Introduce per runqueue task LLC preference counter
>
> In the 7th patch, we actually increment new_max_llcs rather than
> max_llcs — meaning max_llcs always represents the "old" number of LLCs.
> As a result, there is a race window between extending the rq->nr_pref_llc
> buffer and updating max_llcs.
>
>
> @@ -714,7 +827,7 @@ static int update_llc_id(struct sched_domain *sd,
> *
> * For both cases, we want to increase the number of LLCs.
> */
> - per_cpu(sd_llc_id, cpu) = max_llcs++;
> + per_cpu(sd_llc_id, cpu) = new_max_llcs++;
>
> return per_cpu(sd_llc_id, cpu);
> }
>
>
> > Thanks for pointing this out. Yes, we should take care of the
> > attachment of NULL sd. Will update the code accordingly.
> >
>
> My understanding is that, if the sd is NULL, it is either because invoked
> by detach_destroy_domain() for the old set, or by case 2 mentioned in
> above comments:
> Say, CPU0-CPU95 are online during bootup, the boot command line is
> maxcpus=96.
> Later after bootup, the user wants to bring up CPU100, the LLC domain for
> CPU100 is NULL in this case(due to sd generation), and a new LLC should be
> detected.
>
> That is to say, when we reach update_llc_id(), there could be 2 reasons
> for NULL sd. For the detach_destroy_domain() case, update_llc_id()
> should return a valid id without increasing the max_llcs, because of
> if (id >= 0)
> return id;
> And for the latter, the max_llcs should be increased.
> Let me double check on this.
The issue is we could offline all CPUs in an LLC and online them later.
In the current code, we will assign their ids all to -1. So on attaching
the CPUs again, we'll be assigning a new LLC. I think the proper thing
to do is to not reassign the llc id of the offlined cpu (the case where
sd == NULL) and keep the original llc id assigned. Then we should be
okay and not increase max_llcs.
Tim
>
> thanks,
> Chenyu
>
>
> > Tim
On 12/17/2025 3:53 AM, Tim Chen wrote:
> On Tue, 2025-12-16 at 13:31 +0800, Chen, Yu C wrote:
>> On 12/16/2025 4:49 AM, Tim Chen wrote:
>>> On Tue, 2025-12-09 at 12:58 +0100, Peter Zijlstra wrote:
>>>> On Wed, Dec 03, 2025 at 03:07:23PM -0800, Tim Chen wrote:
>>>>
>>>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>>>> index 710ed9943d27..0a3918269906 100644
>>>>> --- a/kernel/sched/fair.c
>>>>> +++ b/kernel/sched/fair.c
>>>>> @@ -1210,10 +1210,17 @@ __read_mostly unsigned int llc_imb_pct = 20;
>>>>>
>>>>> static int llc_id(int cpu)
>>>>> {
>>>>> + int llc;
>>>>> +
>>>>> if (cpu < 0)
>>>>> return -1;
>>>>>
>>>>> + llc = per_cpu(sd_llc_id, cpu);
>>>>> + /* avoid race with cpu hotplug */
>>>>> + if (unlikely(llc >= max_llcs))
>>>>> + return -1;
>>>>> +
>>>>> + return llc;
>>>>> }
>>>>>
>>>>> void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *_pcpu_sched)
>>>>
>>>>> @@ -668,6 +670,55 @@ DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
>>>>> DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
>>>>> DEFINE_STATIC_KEY_FALSE(sched_cluster_active);
>>>>>
>>>>> +/*
>>>>> + * Assign continuous llc id for the CPU, and return
>>>>> + * the assigned llc id.
>>>>> + */
>>>>> +static int update_llc_id(struct sched_domain *sd,
>>>>> + int cpu)
>>>>> +{
>>>>> + int id = per_cpu(sd_llc_id, cpu), i;
>>>>> +
>>>>> + if (id >= 0)
>>>>> + return id;
>>>>> +
>>>>> + if (sd) {
>>>>> + /* Look for any assigned id and reuse it.*/
>>>>> + for_each_cpu(i, sched_domain_span(sd)) {
>>>>> + id = per_cpu(sd_llc_id, i);
>>>>> +
>>>>> + if (id >= 0) {
>>>>> + per_cpu(sd_llc_id, cpu) = id;
>>>>> + return id;
>>>>> + }
>>>>> + }
>>>>> + }
>>>>> +
>>>>> + /*
>>>>> + * When 1. there is no id assigned to this LLC domain,
>>>>> + * or 2. the sd is NULL, we reach here.
>>>>> + * Consider the following scenario,
>>>>> + * CPU0~CPU95 are in the node0, CPU96~CPU191 are
>>>>> + * in the node1. During bootup, maxcpus=96 is
>>>>> + * appended.
>>>>> + * case 1: When running cpu_attach_domain(CPU24)
>>>>> + * during boot up, CPU24 is the first CPU in its
>>>>> + * non-NULL LLC domain. However,
>>>>> + * its corresponding llc id has not been assigned yet.
>>>>> + *
>>>>> + * case 2: After boot up, the CPU100 is brought up
>>>>> + * via sysfs manually. As a result, CPU100 has only a
>>>>> + * Numa domain attached, because CPU100 is the only CPU
>>>>> + * of a sched domain, all its bottom domains are degenerated.
>>>>> + * The LLC domain pointer sd is NULL for CPU100.
>>>>> + *
>>>>> + * For both cases, we want to increase the number of LLCs.
>>>>> + */
>>>>> + per_cpu(sd_llc_id, cpu) = max_llcs++;
>>>>> +
>>>>> + return per_cpu(sd_llc_id, cpu);
>>>>> +}
>>>>
>>>> I'm not sure I follow. So partition_sched_domains() first calls
>>>> detach_destroy_domains() on the old set, and then build_sched_domains()
>>>> on the new set.
>>>>
>>>> Do detach_destroy_domain() will do:
>>>>
>>>> cpu_attach_domain(NULL,..);
>>>>
>>>> That is, it will explicitly attach the NULL sched_domain to a CPU. At
>>>> which point I feel update_llc_id() should be returning -1, no?
>>>>
>>>> Then later, build_sched_domains() will set a !NULL sched_domain, at
>>>> which point update_llc_id() can set a real value.
>>>>
>>>> This should then also get rid of that weird max_llcs check in llc_id(),
>>>> right?
>>
>> The check for max_llcs was intended to prevent out-of-bounds access
>> to rq->nr_pref_llc[] at multiple points in the code.
>> Since dst_llc = llc_id(env->dst_cpu); — and while the LLC ID for the
>> CPU is updated in update_llc_id(), this update occurs before we reallocate
>> the nr_pref_llc buffer — dst_llc may end up exceeding the bounds of the
>> original nr_pref_llc buffer.
>>
>> For this reason, we added a check if (dst_llc > max_llc) in llc_id()
>> when attempting to access rq->nr_pref_llc[dst_llc].
>>
>> However, I agree that the max_llc check seems to not properly integrated
>> into the current patch: it should instead be placed in the 7th patch, as
>> this would better illustrate the rationale for the max_llc check here:
>> sched/cache: Introduce per runqueue task LLC preference counter
>>
>> In the 7th patch, we actually increment new_max_llcs rather than
>> max_llcs — meaning max_llcs always represents the "old" number of LLCs.
>> As a result, there is a race window between extending the rq->nr_pref_llc
>> buffer and updating max_llcs.
>>
>>
>> @@ -714,7 +827,7 @@ static int update_llc_id(struct sched_domain *sd,
>> *
>> * For both cases, we want to increase the number of LLCs.
>> */
>> - per_cpu(sd_llc_id, cpu) = max_llcs++;
>> + per_cpu(sd_llc_id, cpu) = new_max_llcs++;
>>
>> return per_cpu(sd_llc_id, cpu);
>> }
>>
>>
>>> Thanks for pointing this out. Yes, we should take care of the
>>> attachment of NULL sd. Will update the code accordingly.
>>>
>>
>> My understanding is that, if the sd is NULL, it is either because invoked
>> by detach_destroy_domain() for the old set, or by case 2 mentioned in
>> above comments:
>> Say, CPU0-CPU95 are online during bootup, the boot command line is
>> maxcpus=96.
>> Later after bootup, the user wants to bring up CPU100, the LLC domain for
>> CPU100 is NULL in this case(due to sd generation), and a new LLC should be
>> detected.
>>
>> That is to say, when we reach update_llc_id(), there could be 2 reasons
>> for NULL sd. For the detach_destroy_domain() case, update_llc_id()
>> should return a valid id without increasing the max_llcs, because of
>> if (id >= 0)
>> return id;
>> And for the latter, the max_llcs should be increased.
>> Let me double check on this.
>
> The issue is we could offline all CPUs in a LLC and online them later.
> In the current code, we will assign their ids all to -1.
I suppose we don't reset the ids in the current implementation; only
the first scan of LLCs will reset/initialize the ids to -1 in
build_sched_domains()?
        if (!max_llcs) { //max_llcs is initialized to 0 during bootup
                for_each_possible_cpu(i)
                        per_cpu(sd_llc_id, i) = -1;
        }
> So on attach
> of CPUs again, we'll be assigning a new LLC. I think the proper thing
> to do is not to assign llc id of the offlined cpu (the case where sd == NULL)
> and keep the original llc id assigned. Then we should be okay and not
> increase max_llcs.
>
This is the current implementation: we don't assign new ids to
CPUs that already have an id (no matter whether they are offline or
online).
thanks,
Chenyu