From: Chen Ridong <chenridong@huawei.com>
Following the introduction of cpuset1_generate_sched_domains() for v1
in the previous patch, v1-specific logic can now be removed from the
generic generate_sched_domains(). This patch cleans up the v1-only
code and ensures uf_node is only visible when CONFIG_CPUSETS_V1=y.
Signed-off-by: Chen Ridong <chenridong@huawei.com>
---
kernel/cgroup/cpuset-internal.h | 10 +--
kernel/cgroup/cpuset-v1.c | 2 +-
kernel/cgroup/cpuset.c | 144 +++++---------------------------
3 files changed, 27 insertions(+), 129 deletions(-)
diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h
index bd767f8cb0ed..ef7b7c5afd4c 100644
--- a/kernel/cgroup/cpuset-internal.h
+++ b/kernel/cgroup/cpuset-internal.h
@@ -175,14 +175,14 @@ struct cpuset {
/* Handle for cpuset.cpus.partition */
struct cgroup_file partition_file;
- /* Used to merge intersecting subsets for generate_sched_domains */
- struct uf_node node;
-
#ifdef CONFIG_CPUSETS_V1
struct fmeter fmeter; /* memory_pressure filter */
/* for custom sched domain */
int relax_domain_level;
+
+ /* Used to merge intersecting subsets for generate_sched_domains */
+ struct uf_node node;
#endif
};
@@ -315,8 +315,6 @@ void cpuset1_hotplug_update_tasks(struct cpuset *cs,
int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial);
void cpuset1_init(struct cpuset *cs);
void cpuset1_online_css(struct cgroup_subsys_state *css);
-void update_domain_attr_tree(struct sched_domain_attr *dattr,
- struct cpuset *root_cs);
int cpuset1_generate_sched_domains(cpumask_var_t **domains,
struct sched_domain_attr **attributes);
@@ -331,8 +329,6 @@ static inline int cpuset1_validate_change(struct cpuset *cur,
struct cpuset *trial) { return 0; }
static inline void cpuset1_init(struct cpuset *cs) {}
static inline void cpuset1_online_css(struct cgroup_subsys_state *css) {}
-static inline void update_domain_attr_tree(struct sched_domain_attr *dattr,
- struct cpuset *root_cs) {}
static inline int cpuset1_generate_sched_domains(cpumask_var_t **domains,
struct sched_domain_attr **attributes) { return 0; };
diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c
index 5c0bded46a7c..0226350e704f 100644
--- a/kernel/cgroup/cpuset-v1.c
+++ b/kernel/cgroup/cpuset-v1.c
@@ -560,7 +560,7 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
dattr->relax_domain_level = c->relax_domain_level;
}
-void update_domain_attr_tree(struct sched_domain_attr *dattr,
+static void update_domain_attr_tree(struct sched_domain_attr *dattr,
struct cpuset *root_cs)
{
struct cpuset *cp;
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 6bb0b201c34b..3e3468d928f3 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -789,18 +789,13 @@ static int generate_sched_domains(cpumask_var_t **domains,
{
struct cpuset *cp; /* top-down scan of cpusets */
struct cpuset **csa; /* array of all cpuset ptrs */
- int csn; /* how many cpuset ptrs in csa so far */
int i, j; /* indices for partition finding loops */
cpumask_var_t *doms; /* resulting partition; i.e. sched domains */
struct sched_domain_attr *dattr; /* attributes for custom domains */
int ndoms = 0; /* number of sched domains in result */
- int nslot; /* next empty doms[] struct cpumask slot */
struct cgroup_subsys_state *pos_css;
- bool root_load_balance = is_sched_load_balance(&top_cpuset);
- bool cgrpv2 = cpuset_v2();
- int nslot_update;
- if (!cgrpv2)
+ if (!cpuset_v2())
return cpuset1_generate_sched_domains(domains, attributes);
doms = NULL;
@@ -808,70 +803,25 @@ static int generate_sched_domains(cpumask_var_t **domains,
csa = NULL;
/* Special case for the 99% of systems with one, full, sched domain */
- if (root_load_balance && cpumask_empty(subpartitions_cpus)) {
-single_root_domain:
+ if (cpumask_empty(subpartitions_cpus)) {
ndoms = 1;
- doms = alloc_sched_domains(ndoms);
- if (!doms)
- goto done;
-
- dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
- if (dattr) {
- *dattr = SD_ATTR_INIT;
- update_domain_attr_tree(dattr, &top_cpuset);
- }
- cpumask_and(doms[0], top_cpuset.effective_cpus,
- housekeeping_cpumask(HK_TYPE_DOMAIN));
-
- goto done;
+ goto generate_doms;
}
csa = kmalloc_array(nr_cpusets(), sizeof(cp), GFP_KERNEL);
if (!csa)
goto done;
- csn = 0;
+ /* Find how many partitions and cache them to csa[] */
rcu_read_lock();
- if (root_load_balance)
- csa[csn++] = &top_cpuset;
cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
- if (cp == &top_cpuset)
- continue;
-
- if (cgrpv2)
- goto v2;
-
- /*
- * v1:
- * Continue traversing beyond @cp iff @cp has some CPUs and
- * isn't load balancing. The former is obvious. The
- * latter: All child cpusets contain a subset of the
- * parent's cpus, so just skip them, and then we call
- * update_domain_attr_tree() to calc relax_domain_level of
- * the corresponding sched domain.
- */
- if (!cpumask_empty(cp->cpus_allowed) &&
- !(is_sched_load_balance(cp) &&
- cpumask_intersects(cp->cpus_allowed,
- housekeeping_cpumask(HK_TYPE_DOMAIN))))
- continue;
-
- if (is_sched_load_balance(cp) &&
- !cpumask_empty(cp->effective_cpus))
- csa[csn++] = cp;
-
- /* skip @cp's subtree */
- pos_css = css_rightmost_descendant(pos_css);
- continue;
-
-v2:
/*
* Only valid partition roots that are not isolated and with
- * non-empty effective_cpus will be saved into csn[].
+ * non-empty effective_cpus will be saved into csa[].
*/
if ((cp->partition_root_state == PRS_ROOT) &&
!cpumask_empty(cp->effective_cpus))
- csa[csn++] = cp;
+ csa[ndoms++] = cp;
/*
* Skip @cp's subtree if not a partition root and has no
@@ -882,40 +832,18 @@ static int generate_sched_domains(cpumask_var_t **domains,
}
rcu_read_unlock();
- /*
- * If there are only isolated partitions underneath the cgroup root,
- * we can optimize out unneeded sched domains scanning.
- */
- if (root_load_balance && (csn == 1))
- goto single_root_domain;
-
- for (i = 0; i < csn; i++)
- uf_node_init(&csa[i]->node);
-
- /* Merge overlapping cpusets */
- for (i = 0; i < csn; i++) {
- for (j = i + 1; j < csn; j++) {
- if (cpusets_overlap(csa[i], csa[j])) {
+ for (i = 0; i < ndoms; i++) {
+ for (j = i + 1; j < ndoms; j++) {
+ if (cpusets_overlap(csa[i], csa[j]))
/*
* Cgroup v2 shouldn't pass down overlapping
* partition root cpusets.
*/
- WARN_ON_ONCE(cgrpv2);
- uf_union(&csa[i]->node, &csa[j]->node);
- }
+ WARN_ON_ONCE(1);
}
}
- /* Count the total number of domains */
- for (i = 0; i < csn; i++) {
- if (uf_find(&csa[i]->node) == &csa[i]->node)
- ndoms++;
- }
-
- /*
- * Now we know how many domains to create.
- * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
- */
+generate_doms:
doms = alloc_sched_domains(ndoms);
if (!doms)
goto done;
@@ -932,45 +860,19 @@ static int generate_sched_domains(cpumask_var_t **domains,
* to SD_ATTR_INIT. Also non-isolating partition root CPUs are a
* subset of HK_TYPE_DOMAIN housekeeping CPUs.
*/
- if (cgrpv2) {
- for (i = 0; i < ndoms; i++) {
- /*
- * The top cpuset may contain some boot time isolated
- * CPUs that need to be excluded from the sched domain.
- */
- if (csa[i] == &top_cpuset)
- cpumask_and(doms[i], csa[i]->effective_cpus,
- housekeeping_cpumask(HK_TYPE_DOMAIN));
- else
- cpumask_copy(doms[i], csa[i]->effective_cpus);
- if (dattr)
- dattr[i] = SD_ATTR_INIT;
- }
- goto done;
- }
-
- for (nslot = 0, i = 0; i < csn; i++) {
- nslot_update = 0;
- for (j = i; j < csn; j++) {
- if (uf_find(&csa[j]->node) == &csa[i]->node) {
- struct cpumask *dp = doms[nslot];
-
- if (i == j) {
- nslot_update = 1;
- cpumask_clear(dp);
- if (dattr)
- *(dattr + nslot) = SD_ATTR_INIT;
- }
- cpumask_or(dp, dp, csa[j]->effective_cpus);
- cpumask_and(dp, dp, housekeeping_cpumask(HK_TYPE_DOMAIN));
- if (dattr)
- update_domain_attr_tree(dattr + nslot, csa[j]);
- }
- }
- if (nslot_update)
- nslot++;
+ for (i = 0; i < ndoms; i++) {
+ /*
+ * The top cpuset may contain some boot time isolated
+ * CPUs that need to be excluded from the sched domain.
+ */
+ if (!csa || csa[i] == &top_cpuset)
+ cpumask_and(doms[i], top_cpuset.effective_cpus,
+ housekeeping_cpumask(HK_TYPE_DOMAIN));
+ else
+ cpumask_copy(doms[i], csa[i]->effective_cpus);
+ if (dattr)
+ dattr[i] = SD_ATTR_INIT;
}
- BUG_ON(nslot != ndoms);
done:
kfree(csa);
--
2.34.1
On 12/17/25 3:49 AM, Chen Ridong wrote:
> From: Chen Ridong <chenridong@huawei.com>
>
> Following the introduction of cpuset1_generate_sched_domains() for v1
> in the previous patch, v1-specific logic can now be removed from the
> generic generate_sched_domains(). This patch cleans up the v1-only
> code and ensures uf_node is only visible when CONFIG_CPUSETS_V1=y.
>
> Signed-off-by: Chen Ridong <chenridong@huawei.com>
> ---
> kernel/cgroup/cpuset-internal.h | 10 +--
> kernel/cgroup/cpuset-v1.c | 2 +-
> kernel/cgroup/cpuset.c | 144 +++++---------------------------
> 3 files changed, 27 insertions(+), 129 deletions(-)
>
> diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h
> index bd767f8cb0ed..ef7b7c5afd4c 100644
> --- a/kernel/cgroup/cpuset-internal.h
> +++ b/kernel/cgroup/cpuset-internal.h
> @@ -175,14 +175,14 @@ struct cpuset {
> /* Handle for cpuset.cpus.partition */
> struct cgroup_file partition_file;
>
> - /* Used to merge intersecting subsets for generate_sched_domains */
> - struct uf_node node;
> -
> #ifdef CONFIG_CPUSETS_V1
> struct fmeter fmeter; /* memory_pressure filter */
>
> /* for custom sched domain */
> int relax_domain_level;
> +
> + /* Used to merge intersecting subsets for generate_sched_domains */
> + struct uf_node node;
> #endif
> };
>
> @@ -315,8 +315,6 @@ void cpuset1_hotplug_update_tasks(struct cpuset *cs,
> int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial);
> void cpuset1_init(struct cpuset *cs);
> void cpuset1_online_css(struct cgroup_subsys_state *css);
> -void update_domain_attr_tree(struct sched_domain_attr *dattr,
> - struct cpuset *root_cs);
> int cpuset1_generate_sched_domains(cpumask_var_t **domains,
> struct sched_domain_attr **attributes);
>
> @@ -331,8 +329,6 @@ static inline int cpuset1_validate_change(struct cpuset *cur,
> struct cpuset *trial) { return 0; }
> static inline void cpuset1_init(struct cpuset *cs) {}
> static inline void cpuset1_online_css(struct cgroup_subsys_state *css) {}
> -static inline void update_domain_attr_tree(struct sched_domain_attr *dattr,
> - struct cpuset *root_cs) {}
> static inline int cpuset1_generate_sched_domains(cpumask_var_t **domains,
> struct sched_domain_attr **attributes) { return 0; };
>
> diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c
> index 5c0bded46a7c..0226350e704f 100644
> --- a/kernel/cgroup/cpuset-v1.c
> +++ b/kernel/cgroup/cpuset-v1.c
> @@ -560,7 +560,7 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
> dattr->relax_domain_level = c->relax_domain_level;
> }
>
> -void update_domain_attr_tree(struct sched_domain_attr *dattr,
> +static void update_domain_attr_tree(struct sched_domain_attr *dattr,
> struct cpuset *root_cs)
> {
> struct cpuset *cp;
> diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
> index 6bb0b201c34b..3e3468d928f3 100644
> --- a/kernel/cgroup/cpuset.c
> +++ b/kernel/cgroup/cpuset.c
> @@ -789,18 +789,13 @@ static int generate_sched_domains(cpumask_var_t **domains,
> {
> struct cpuset *cp; /* top-down scan of cpusets */
> struct cpuset **csa; /* array of all cpuset ptrs */
> - int csn; /* how many cpuset ptrs in csa so far */
> int i, j; /* indices for partition finding loops */
> cpumask_var_t *doms; /* resulting partition; i.e. sched domains */
> struct sched_domain_attr *dattr; /* attributes for custom domains */
> int ndoms = 0; /* number of sched domains in result */
> - int nslot; /* next empty doms[] struct cpumask slot */
> struct cgroup_subsys_state *pos_css;
> - bool root_load_balance = is_sched_load_balance(&top_cpuset);
> - bool cgrpv2 = cpuset_v2();
> - int nslot_update;
>
> - if (!cgrpv2)
> + if (!cpuset_v2())
> return cpuset1_generate_sched_domains(domains, attributes);
>
> doms = NULL;
> @@ -808,70 +803,25 @@ static int generate_sched_domains(cpumask_var_t **domains,
> csa = NULL;
>
> /* Special case for the 99% of systems with one, full, sched domain */
> - if (root_load_balance && cpumask_empty(subpartitions_cpus)) {
> -single_root_domain:
> + if (cpumask_empty(subpartitions_cpus)) {
> ndoms = 1;
> - doms = alloc_sched_domains(ndoms);
> - if (!doms)
> - goto done;
> -
> - dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
> - if (dattr) {
> - *dattr = SD_ATTR_INIT;
> - update_domain_attr_tree(dattr, &top_cpuset);
> - }
> - cpumask_and(doms[0], top_cpuset.effective_cpus,
> - housekeeping_cpumask(HK_TYPE_DOMAIN));
> -
> - goto done;
> + goto generate_doms;
That is not correct. The code under the generate_doms label will need to
access csa[0] which is not allocated yet and may cause panic. You either
need to keep the current code or move it after the csa allocation and
assign top_cpuset to csa[0].
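Roughly, the second alternative would be something like the following
untested sketch (reusing the existing generate_doms label):
```
	csa = kmalloc_array(nr_cpusets(), sizeof(cp), GFP_KERNEL);
	if (!csa)
		goto done;

	/* Special case: one single, full sched domain */
	if (cpumask_empty(subpartitions_cpus)) {
		csa[0] = &top_cpuset;
		ndoms = 1;
		goto generate_doms;
	}
```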
> }
>
> csa = kmalloc_array(nr_cpusets(), sizeof(cp), GFP_KERNEL);
> if (!csa)
> goto done;
> - csn = 0;
>
> + /* Find how many partitions and cache them to csa[] */
> rcu_read_lock();
> - if (root_load_balance)
> - csa[csn++] = &top_cpuset;
> cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
The cpuset_for_each_descendant_pre() macro will visit the root
(top_cpuset) first and so it should be OK to remove the above 2 lines of
code.
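For reference, cpuset_for_each_descendant_pre() is a thin wrapper around
css_for_each_descendant_pre(), whose pre-order walk starts at the root
css itself, roughly (paraphrasing the definition in cpuset-internal.h):
```
#define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs)	\
	css_for_each_descendant_pre((pos_css), &(root_cs)->css)	\
		if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))
```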
Cheers,
Longman
On 2025/12/18 3:05, Waiman Long wrote:
> On 12/17/25 3:49 AM, Chen Ridong wrote:
>> From: Chen Ridong <chenridong@huawei.com>
>>
>> Following the introduction of cpuset1_generate_sched_domains() for v1
>> in the previous patch, v1-specific logic can now be removed from the
>> generic generate_sched_domains(). This patch cleans up the v1-only
>> code and ensures uf_node is only visible when CONFIG_CPUSETS_V1=y.
>>
>> Signed-off-by: Chen Ridong <chenridong@huawei.com>
>> ---
>> kernel/cgroup/cpuset-internal.h | 10 +--
>> kernel/cgroup/cpuset-v1.c | 2 +-
>> kernel/cgroup/cpuset.c | 144 +++++---------------------------
>> 3 files changed, 27 insertions(+), 129 deletions(-)
>>
>> diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h
>> index bd767f8cb0ed..ef7b7c5afd4c 100644
>> --- a/kernel/cgroup/cpuset-internal.h
>> +++ b/kernel/cgroup/cpuset-internal.h
>> @@ -175,14 +175,14 @@ struct cpuset {
>> /* Handle for cpuset.cpus.partition */
>> struct cgroup_file partition_file;
>> - /* Used to merge intersecting subsets for generate_sched_domains */
>> - struct uf_node node;
>> -
>> #ifdef CONFIG_CPUSETS_V1
>> struct fmeter fmeter; /* memory_pressure filter */
>> /* for custom sched domain */
>> int relax_domain_level;
>> +
>> + /* Used to merge intersecting subsets for generate_sched_domains */
>> + struct uf_node node;
>> #endif
>> };
>> @@ -315,8 +315,6 @@ void cpuset1_hotplug_update_tasks(struct cpuset *cs,
>> int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial);
>> void cpuset1_init(struct cpuset *cs);
>> void cpuset1_online_css(struct cgroup_subsys_state *css);
>> -void update_domain_attr_tree(struct sched_domain_attr *dattr,
>> - struct cpuset *root_cs);
>> int cpuset1_generate_sched_domains(cpumask_var_t **domains,
>> struct sched_domain_attr **attributes);
>> @@ -331,8 +329,6 @@ static inline int cpuset1_validate_change(struct cpuset *cur,
>> struct cpuset *trial) { return 0; }
>> static inline void cpuset1_init(struct cpuset *cs) {}
>> static inline void cpuset1_online_css(struct cgroup_subsys_state *css) {}
>> -static inline void update_domain_attr_tree(struct sched_domain_attr *dattr,
>> - struct cpuset *root_cs) {}
>> static inline int cpuset1_generate_sched_domains(cpumask_var_t **domains,
>> struct sched_domain_attr **attributes) { return 0; };
>> diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c
>> index 5c0bded46a7c..0226350e704f 100644
>> --- a/kernel/cgroup/cpuset-v1.c
>> +++ b/kernel/cgroup/cpuset-v1.c
>> @@ -560,7 +560,7 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
>> dattr->relax_domain_level = c->relax_domain_level;
>> }
>> -void update_domain_attr_tree(struct sched_domain_attr *dattr,
>> +static void update_domain_attr_tree(struct sched_domain_attr *dattr,
>> struct cpuset *root_cs)
>> {
>> struct cpuset *cp;
>> diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
>> index 6bb0b201c34b..3e3468d928f3 100644
>> --- a/kernel/cgroup/cpuset.c
>> +++ b/kernel/cgroup/cpuset.c
>> @@ -789,18 +789,13 @@ static int generate_sched_domains(cpumask_var_t **domains,
>> {
>> struct cpuset *cp; /* top-down scan of cpusets */
>> struct cpuset **csa; /* array of all cpuset ptrs */
>> - int csn; /* how many cpuset ptrs in csa so far */
>> int i, j; /* indices for partition finding loops */
>> cpumask_var_t *doms; /* resulting partition; i.e. sched domains */
>> struct sched_domain_attr *dattr; /* attributes for custom domains */
>> int ndoms = 0; /* number of sched domains in result */
>> - int nslot; /* next empty doms[] struct cpumask slot */
>> struct cgroup_subsys_state *pos_css;
>> - bool root_load_balance = is_sched_load_balance(&top_cpuset);
>> - bool cgrpv2 = cpuset_v2();
>> - int nslot_update;
>> - if (!cgrpv2)
>> + if (!cpuset_v2())
>> return cpuset1_generate_sched_domains(domains, attributes);
>> doms = NULL;
>> @@ -808,70 +803,25 @@ static int generate_sched_domains(cpumask_var_t **domains,
>> csa = NULL;
>> /* Special case for the 99% of systems with one, full, sched domain */
>> - if (root_load_balance && cpumask_empty(subpartitions_cpus)) {
>> -single_root_domain:
>> + if (cpumask_empty(subpartitions_cpus)) {
>> ndoms = 1;
>> - doms = alloc_sched_domains(ndoms);
>> - if (!doms)
>> - goto done;
>> -
>> - dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
>> - if (dattr) {
>> - *dattr = SD_ATTR_INIT;
>> - update_domain_attr_tree(dattr, &top_cpuset);
>> - }
>> - cpumask_and(doms[0], top_cpuset.effective_cpus,
>> - housekeeping_cpumask(HK_TYPE_DOMAIN));
>> -
>> - goto done;
>> + goto generate_doms;
>
> That is not correct. The code under the generate_doms label will need to access csa[0] which is not
> allocated yet and may cause panic. You either need to keep the current code or move it after the csa
> allocation and assign top_cpuset to csa[0].
>
>> }
>> csa = kmalloc_array(nr_cpusets(), sizeof(cp), GFP_KERNEL);
>> if (!csa)
>> goto done;
>> - csn = 0;
>> + /* Find how many partitions and cache them to csa[] */
>> rcu_read_lock();
>> - if (root_load_balance)
>> - csa[csn++] = &top_cpuset;
>> cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
>
> The cpuset_for_each_descendant_pre() macro will visit the root (top_cpuset) first and so it should
> be OK to remove the above 2 lines of code.
>
Yes, it is OK for v2, but we have to keep it in v1. If we remove it in v1, the loop will skip the
whole tree.
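That is because, when the v1 loop reaches a load-balancing root, it records the root and then jumps
past the entire subtree, as in the v1-only code removed above:
```
	if (is_sched_load_balance(cp) &&
	    !cpumask_empty(cp->effective_cpus))
		csa[csn++] = cp;

	/* skip @cp's subtree */
	pos_css = css_rightmost_descendant(pos_css);
	continue;
```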
--
Best regards,
Ridong
On 2025/12/18 3:05, Waiman Long wrote:
> On 12/17/25 3:49 AM, Chen Ridong wrote:
>> From: Chen Ridong <chenridong@huawei.com>
>>
>> Following the introduction of cpuset1_generate_sched_domains() for v1
>> in the previous patch, v1-specific logic can now be removed from the
>> generic generate_sched_domains(). This patch cleans up the v1-only
>> code and ensures uf_node is only visible when CONFIG_CPUSETS_V1=y.
>>
>> Signed-off-by: Chen Ridong <chenridong@huawei.com>
>> ---
>> kernel/cgroup/cpuset-internal.h | 10 +--
>> kernel/cgroup/cpuset-v1.c | 2 +-
>> kernel/cgroup/cpuset.c | 144 +++++---------------------------
>> 3 files changed, 27 insertions(+), 129 deletions(-)
>>
>> diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h
>> index bd767f8cb0ed..ef7b7c5afd4c 100644
>> --- a/kernel/cgroup/cpuset-internal.h
>> +++ b/kernel/cgroup/cpuset-internal.h
>> @@ -175,14 +175,14 @@ struct cpuset {
>> /* Handle for cpuset.cpus.partition */
>> struct cgroup_file partition_file;
>> - /* Used to merge intersecting subsets for generate_sched_domains */
>> - struct uf_node node;
>> -
>> #ifdef CONFIG_CPUSETS_V1
>> struct fmeter fmeter; /* memory_pressure filter */
>> /* for custom sched domain */
>> int relax_domain_level;
>> +
>> + /* Used to merge intersecting subsets for generate_sched_domains */
>> + struct uf_node node;
>> #endif
>> };
>> @@ -315,8 +315,6 @@ void cpuset1_hotplug_update_tasks(struct cpuset *cs,
>> int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial);
>> void cpuset1_init(struct cpuset *cs);
>> void cpuset1_online_css(struct cgroup_subsys_state *css);
>> -void update_domain_attr_tree(struct sched_domain_attr *dattr,
>> - struct cpuset *root_cs);
>> int cpuset1_generate_sched_domains(cpumask_var_t **domains,
>> struct sched_domain_attr **attributes);
>> @@ -331,8 +329,6 @@ static inline int cpuset1_validate_change(struct cpuset *cur,
>> struct cpuset *trial) { return 0; }
>> static inline void cpuset1_init(struct cpuset *cs) {}
>> static inline void cpuset1_online_css(struct cgroup_subsys_state *css) {}
>> -static inline void update_domain_attr_tree(struct sched_domain_attr *dattr,
>> - struct cpuset *root_cs) {}
>> static inline int cpuset1_generate_sched_domains(cpumask_var_t **domains,
>> struct sched_domain_attr **attributes) { return 0; };
>> diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c
>> index 5c0bded46a7c..0226350e704f 100644
>> --- a/kernel/cgroup/cpuset-v1.c
>> +++ b/kernel/cgroup/cpuset-v1.c
>> @@ -560,7 +560,7 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
>> dattr->relax_domain_level = c->relax_domain_level;
>> }
>> -void update_domain_attr_tree(struct sched_domain_attr *dattr,
>> +static void update_domain_attr_tree(struct sched_domain_attr *dattr,
>> struct cpuset *root_cs)
>> {
>> struct cpuset *cp;
>> diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
>> index 6bb0b201c34b..3e3468d928f3 100644
>> --- a/kernel/cgroup/cpuset.c
>> +++ b/kernel/cgroup/cpuset.c
>> @@ -789,18 +789,13 @@ static int generate_sched_domains(cpumask_var_t **domains,
>> {
>> struct cpuset *cp; /* top-down scan of cpusets */
>> struct cpuset **csa; /* array of all cpuset ptrs */
>> - int csn; /* how many cpuset ptrs in csa so far */
>> int i, j; /* indices for partition finding loops */
>> cpumask_var_t *doms; /* resulting partition; i.e. sched domains */
>> struct sched_domain_attr *dattr; /* attributes for custom domains */
>> int ndoms = 0; /* number of sched domains in result */
>> - int nslot; /* next empty doms[] struct cpumask slot */
>> struct cgroup_subsys_state *pos_css;
>> - bool root_load_balance = is_sched_load_balance(&top_cpuset);
>> - bool cgrpv2 = cpuset_v2();
>> - int nslot_update;
>> - if (!cgrpv2)
>> + if (!cpuset_v2())
>> return cpuset1_generate_sched_domains(domains, attributes);
>> doms = NULL;
>> @@ -808,70 +803,25 @@ static int generate_sched_domains(cpumask_var_t **domains,
>> csa = NULL;
>> /* Special case for the 99% of systems with one, full, sched domain */
>> - if (root_load_balance && cpumask_empty(subpartitions_cpus)) {
>> -single_root_domain:
>> + if (cpumask_empty(subpartitions_cpus)) {
>> ndoms = 1;
>> - doms = alloc_sched_domains(ndoms);
>> - if (!doms)
>> - goto done;
>> -
>> - dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
>> - if (dattr) {
>> - *dattr = SD_ATTR_INIT;
>> - update_domain_attr_tree(dattr, &top_cpuset);
>> - }
>> - cpumask_and(doms[0], top_cpuset.effective_cpus,
>> - housekeeping_cpumask(HK_TYPE_DOMAIN));
>> -
>> - goto done;
>> + goto generate_doms;
>
> That is not correct. The code under the generate_doms label will need to access csa[0] which is not
> allocated yet and may cause panic. You either need to keep the current code or move it after the csa
> allocation and assign top_cpuset to csa[0].
>
Thank you, Longman.
Sorry, I should note that I made a small change. I added a !csa check: if csa is not allocated, then
ndoms should equal 1, and we only need the top_cpuset (so csa is indeed not required). I think it's
cleaner to avoid allocating csa when there's no valid partition.
```
+ for (i = 0; i < ndoms; i++) {
+ /*
+ * The top cpuset may contain some boot time isolated
+ * CPUs that need to be excluded from the sched domain.
+ */
+ if (!csa || csa[i] == &top_cpuset)
+ cpumask_and(doms[i], top_cpuset.effective_cpus,
+ housekeeping_cpumask(HK_TYPE_DOMAIN));
+ else
+ cpumask_copy(doms[i], csa[i]->effective_cpus);
+ if (dattr)
+ dattr[i] = SD_ATTR_INIT;
}
```
Tested with single-domain generation; no panic or warning observed.
>> }
>> csa = kmalloc_array(nr_cpusets(), sizeof(cp), GFP_KERNEL);
>> if (!csa)
>> goto done;
>> - csn = 0;
>> + /* Find how many partitions and cache them to csa[] */
>> rcu_read_lock();
>> - if (root_load_balance)
>> - csa[csn++] = &top_cpuset;
>> cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
>
> The cpuset_for_each_descendant_pre() macro will visit the root (top_cpuset) first and so it should
> be OK to remove the above 2 lines of code.
>
> Cheers,
> Longman
>
--
Best regards,
Ridong
On 12/17/25 8:39 PM, Chen Ridong wrote:
>
> On 2025/12/18 3:05, Waiman Long wrote:
>> On 12/17/25 3:49 AM, Chen Ridong wrote:
>>> From: Chen Ridong <chenridong@huawei.com>
>>>
>>> Following the introduction of cpuset1_generate_sched_domains() for v1
>>> in the previous patch, v1-specific logic can now be removed from the
>>> generic generate_sched_domains(). This patch cleans up the v1-only
>>> code and ensures uf_node is only visible when CONFIG_CPUSETS_V1=y.
>>>
>>> Signed-off-by: Chen Ridong <chenridong@huawei.com>
>>> ---
>>> kernel/cgroup/cpuset-internal.h | 10 +--
>>> kernel/cgroup/cpuset-v1.c | 2 +-
>>> kernel/cgroup/cpuset.c | 144 +++++---------------------------
>>> 3 files changed, 27 insertions(+), 129 deletions(-)
>>>
>>> diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h
>>> index bd767f8cb0ed..ef7b7c5afd4c 100644
>>> --- a/kernel/cgroup/cpuset-internal.h
>>> +++ b/kernel/cgroup/cpuset-internal.h
>>> @@ -175,14 +175,14 @@ struct cpuset {
>>> /* Handle for cpuset.cpus.partition */
>>> struct cgroup_file partition_file;
>>> - /* Used to merge intersecting subsets for generate_sched_domains */
>>> - struct uf_node node;
>>> -
>>> #ifdef CONFIG_CPUSETS_V1
>>> struct fmeter fmeter; /* memory_pressure filter */
>>> /* for custom sched domain */
>>> int relax_domain_level;
>>> +
>>> + /* Used to merge intersecting subsets for generate_sched_domains */
>>> + struct uf_node node;
>>> #endif
>>> };
>>> @@ -315,8 +315,6 @@ void cpuset1_hotplug_update_tasks(struct cpuset *cs,
>>> int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial);
>>> void cpuset1_init(struct cpuset *cs);
>>> void cpuset1_online_css(struct cgroup_subsys_state *css);
>>> -void update_domain_attr_tree(struct sched_domain_attr *dattr,
>>> - struct cpuset *root_cs);
>>> int cpuset1_generate_sched_domains(cpumask_var_t **domains,
>>> struct sched_domain_attr **attributes);
>>> @@ -331,8 +329,6 @@ static inline int cpuset1_validate_change(struct cpuset *cur,
>>> struct cpuset *trial) { return 0; }
>>> static inline void cpuset1_init(struct cpuset *cs) {}
>>> static inline void cpuset1_online_css(struct cgroup_subsys_state *css) {}
>>> -static inline void update_domain_attr_tree(struct sched_domain_attr *dattr,
>>> - struct cpuset *root_cs) {}
>>> static inline int cpuset1_generate_sched_domains(cpumask_var_t **domains,
>>> struct sched_domain_attr **attributes) { return 0; };
>>> diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c
>>> index 5c0bded46a7c..0226350e704f 100644
>>> --- a/kernel/cgroup/cpuset-v1.c
>>> +++ b/kernel/cgroup/cpuset-v1.c
>>> @@ -560,7 +560,7 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
>>> dattr->relax_domain_level = c->relax_domain_level;
>>> }
>>> -void update_domain_attr_tree(struct sched_domain_attr *dattr,
>>> +static void update_domain_attr_tree(struct sched_domain_attr *dattr,
>>> struct cpuset *root_cs)
>>> {
>>> struct cpuset *cp;
>>> diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
>>> index 6bb0b201c34b..3e3468d928f3 100644
>>> --- a/kernel/cgroup/cpuset.c
>>> +++ b/kernel/cgroup/cpuset.c
>>> @@ -789,18 +789,13 @@ static int generate_sched_domains(cpumask_var_t **domains,
>>> {
>>> struct cpuset *cp; /* top-down scan of cpusets */
>>> struct cpuset **csa; /* array of all cpuset ptrs */
>>> - int csn; /* how many cpuset ptrs in csa so far */
>>> int i, j; /* indices for partition finding loops */
>>> cpumask_var_t *doms; /* resulting partition; i.e. sched domains */
>>> struct sched_domain_attr *dattr; /* attributes for custom domains */
>>> int ndoms = 0; /* number of sched domains in result */
>>> - int nslot; /* next empty doms[] struct cpumask slot */
>>> struct cgroup_subsys_state *pos_css;
>>> - bool root_load_balance = is_sched_load_balance(&top_cpuset);
>>> - bool cgrpv2 = cpuset_v2();
>>> - int nslot_update;
>>> - if (!cgrpv2)
>>> + if (!cpuset_v2())
>>> return cpuset1_generate_sched_domains(domains, attributes);
>>> doms = NULL;
>>> @@ -808,70 +803,25 @@ static int generate_sched_domains(cpumask_var_t **domains,
>>> csa = NULL;
>>> /* Special case for the 99% of systems with one, full, sched domain */
>>> - if (root_load_balance && cpumask_empty(subpartitions_cpus)) {
>>> -single_root_domain:
>>> + if (cpumask_empty(subpartitions_cpus)) {
>>> ndoms = 1;
>>> - doms = alloc_sched_domains(ndoms);
>>> - if (!doms)
>>> - goto done;
>>> -
>>> - dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
>>> - if (dattr) {
>>> - *dattr = SD_ATTR_INIT;
>>> - update_domain_attr_tree(dattr, &top_cpuset);
>>> - }
>>> - cpumask_and(doms[0], top_cpuset.effective_cpus,
>>> - housekeeping_cpumask(HK_TYPE_DOMAIN));
>>> -
>>> - goto done;
>>> + goto generate_doms;
>> That is not correct. The code under the generate_doms label will need to access csa[0] which is not
>> allocated yet and may cause panic. You either need to keep the current code or move it after the csa
>> allocation and assign top_cpuset to csa[0].
>>
> Thank you, Longman.
>
> Sorry, I should note that I made a small change. I added a !csa check: if csa is not allocated, then
> ndoms should equal 1, and we only need the top_cpuset (so csa is indeed not required). I think it's
> cleaner to avoid allocating csa when there's no valid partition.
>
> ```
> + for (i = 0; i < ndoms; i++) {
> + /*
> + * The top cpuset may contain some boot time isolated
> + * CPUs that need to be excluded from the sched domain.
> + */
> + if (!csa || csa[i] == &top_cpuset)
> + cpumask_and(doms[i], top_cpuset.effective_cpus,
> + housekeeping_cpumask(HK_TYPE_DOMAIN));
> + else
> + cpumask_copy(doms[i], csa[i]->effective_cpus);
> + if (dattr)
> + dattr[i] = SD_ATTR_INIT;
> }
> ```
>
> Tested with single-domain generation; no panic or warning observed.
Yes, the !csa check should be good enough to handle the NULL csa case
here. Maybe add a comment at the goto line saying that the !csa case
will be handled correctly.
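Something like this (sketch):
```
	/* Special case for the 99% of systems with one, full, sched domain */
	if (cpumask_empty(subpartitions_cpus)) {
		ndoms = 1;
		/* With !csa, the domain generation loop below falls back to top_cpuset */
		goto generate_doms;
	}
```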
Cheers,
Longman
On 2025/12/18 11:14, Waiman Long wrote:
> On 12/17/25 8:39 PM, Chen Ridong wrote:
>>
>> On 2025/12/18 3:05, Waiman Long wrote:
>>> On 12/17/25 3:49 AM, Chen Ridong wrote:
>>>> From: Chen Ridong <chenridong@huawei.com>
>>>>
>>>> Following the introduction of cpuset1_generate_sched_domains() for v1
>>>> in the previous patch, v1-specific logic can now be removed from the
>>>> generic generate_sched_domains(). This patch cleans up the v1-only
>>>> code and ensures uf_node is only visible when CONFIG_CPUSETS_V1=y.
>>>>
>>>> Signed-off-by: Chen Ridong <chenridong@huawei.com>
>>>> ---
>>>> kernel/cgroup/cpuset-internal.h | 10 +--
>>>> kernel/cgroup/cpuset-v1.c | 2 +-
>>>> kernel/cgroup/cpuset.c | 144 +++++---------------------------
>>>> 3 files changed, 27 insertions(+), 129 deletions(-)
>>>>
>>>> diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h
>>>> index bd767f8cb0ed..ef7b7c5afd4c 100644
>>>> --- a/kernel/cgroup/cpuset-internal.h
>>>> +++ b/kernel/cgroup/cpuset-internal.h
>>>> @@ -175,14 +175,14 @@ struct cpuset {
>>>> /* Handle for cpuset.cpus.partition */
>>>> struct cgroup_file partition_file;
>>>> - /* Used to merge intersecting subsets for generate_sched_domains */
>>>> - struct uf_node node;
>>>> -
>>>> #ifdef CONFIG_CPUSETS_V1
>>>> struct fmeter fmeter; /* memory_pressure filter */
>>>> /* for custom sched domain */
>>>> int relax_domain_level;
>>>> +
>>>> + /* Used to merge intersecting subsets for generate_sched_domains */
>>>> + struct uf_node node;
>>>> #endif
>>>> };
>>>> @@ -315,8 +315,6 @@ void cpuset1_hotplug_update_tasks(struct cpuset *cs,
>>>> int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial);
>>>> void cpuset1_init(struct cpuset *cs);
>>>> void cpuset1_online_css(struct cgroup_subsys_state *css);
>>>> -void update_domain_attr_tree(struct sched_domain_attr *dattr,
>>>> - struct cpuset *root_cs);
>>>> int cpuset1_generate_sched_domains(cpumask_var_t **domains,
>>>> struct sched_domain_attr **attributes);
>>>> @@ -331,8 +329,6 @@ static inline int cpuset1_validate_change(struct cpuset *cur,
>>>> struct cpuset *trial) { return 0; }
>>>> static inline void cpuset1_init(struct cpuset *cs) {}
>>>> static inline void cpuset1_online_css(struct cgroup_subsys_state *css) {}
>>>> -static inline void update_domain_attr_tree(struct sched_domain_attr *dattr,
>>>> - struct cpuset *root_cs) {}
>>>> static inline int cpuset1_generate_sched_domains(cpumask_var_t **domains,
>>>> struct sched_domain_attr **attributes) { return 0; };
>>>> diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c
>>>> index 5c0bded46a7c..0226350e704f 100644
>>>> --- a/kernel/cgroup/cpuset-v1.c
>>>> +++ b/kernel/cgroup/cpuset-v1.c
>>>> @@ -560,7 +560,7 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
>>>> dattr->relax_domain_level = c->relax_domain_level;
>>>> }
>>>> -void update_domain_attr_tree(struct sched_domain_attr *dattr,
>>>> +static void update_domain_attr_tree(struct sched_domain_attr *dattr,
>>>> struct cpuset *root_cs)
>>>> {
>>>> struct cpuset *cp;
>>>> diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
>>>> index 6bb0b201c34b..3e3468d928f3 100644
>>>> --- a/kernel/cgroup/cpuset.c
>>>> +++ b/kernel/cgroup/cpuset.c
>>>> @@ -789,18 +789,13 @@ static int generate_sched_domains(cpumask_var_t **domains,
>>>> {
>>>> struct cpuset *cp; /* top-down scan of cpusets */
>>>> struct cpuset **csa; /* array of all cpuset ptrs */
>>>> - int csn; /* how many cpuset ptrs in csa so far */
>>>> int i, j; /* indices for partition finding loops */
>>>> cpumask_var_t *doms; /* resulting partition; i.e. sched domains */
>>>> struct sched_domain_attr *dattr; /* attributes for custom domains */
>>>> int ndoms = 0; /* number of sched domains in result */
>>>> - int nslot; /* next empty doms[] struct cpumask slot */
>>>> struct cgroup_subsys_state *pos_css;
>>>> - bool root_load_balance = is_sched_load_balance(&top_cpuset);
>>>> - bool cgrpv2 = cpuset_v2();
>>>> - int nslot_update;
>>>> - if (!cgrpv2)
>>>> + if (!cpuset_v2())
>>>> return cpuset1_generate_sched_domains(domains, attributes);
>>>> doms = NULL;
>>>> @@ -808,70 +803,25 @@ static int generate_sched_domains(cpumask_var_t **domains,
>>>> csa = NULL;
>>>> /* Special case for the 99% of systems with one, full, sched domain */
>>>> - if (root_load_balance && cpumask_empty(subpartitions_cpus)) {
>>>> -single_root_domain:
>>>> + if (cpumask_empty(subpartitions_cpus)) {
>>>> ndoms = 1;
>>>> - doms = alloc_sched_domains(ndoms);
>>>> - if (!doms)
>>>> - goto done;
>>>> -
>>>> - dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
>>>> - if (dattr) {
>>>> - *dattr = SD_ATTR_INIT;
>>>> - update_domain_attr_tree(dattr, &top_cpuset);
>>>> - }
>>>> - cpumask_and(doms[0], top_cpuset.effective_cpus,
>>>> - housekeeping_cpumask(HK_TYPE_DOMAIN));
>>>> -
>>>> - goto done;
>>>> + goto generate_doms;
>>> That is not correct. The code under the generate_doms label will need to access csa[0] which is not
>>> allocated yet and may cause panic. You either need to keep the current code or move it after the csa
>>> allocation and assign top_cpuset to csa[0].
>>>
>> Thank you, Longman.
>>
>> Sorry, I should note that I made a small change. I added a !csa check: if csa is not allocated, then
>> ndoms should equal 1, and we only need the top_cpuset (so csa is indeed not required). I think it's
>> cleaner to avoid allocating csa when there's no valid partition.
>>
>> ```
>> + for (i = 0; i < ndoms; i++) {
>> + /*
>> + * The top cpuset may contain some boot time isolated
>> + * CPUs that need to be excluded from the sched domain.
>> + */
>> + if (!csa || csa[i] == &top_cpuset)
>> + cpumask_and(doms[i], top_cpuset.effective_cpus,
>> + housekeeping_cpumask(HK_TYPE_DOMAIN));
>> + else
>> + cpumask_copy(doms[i], csa[i]->effective_cpus);
>> + if (dattr)
>> + dattr[i] = SD_ATTR_INIT;
>> }
>> ```
>>
>> Tested with single-domain generation; no panic or warning observed.
>
> Yes, the !csa check should be good enough to handle the NULL csa case here. Maybe add a comment
> at the goto line saying that the !csa case will be handled correctly.
>
Good idea, will add.
--
Best regards,
Ridong