[PATCH -next v2] cpuset: Treat cpusets in attaching as populated

Chen Ridong posted 1 patch 2 months, 3 weeks ago
There is a newer version of this series
kernel/cgroup/cpuset.c | 31 +++++++++++++++++++++++--------
1 file changed, 23 insertions(+), 8 deletions(-)
[PATCH -next v2] cpuset: Treat cpusets in attaching as populated
Posted by Chen Ridong 2 months, 3 weeks ago
From: Chen Ridong <chenridong@huawei.com>

Currently, the check for whether a partition is populated does not
account for tasks that are in the process of being attached to a
cpuset. This corner case can leave a task stuck in a partition with
no effective CPUs.

The race condition occurs as follows:

cpu0				cpu1
				//cpuset A  with cpu N
migrate task p to A
cpuset_can_attach
// with effective cpus
// check ok

// cpuset_mutex is not held	// clear cpuset.cpus.exclusive
				// making effective cpus empty
				update_exclusive_cpumask
				// tasks_nocpu_error check ok
				// empty effective cpus, partition valid
cpuset_attach
...
// task p stays in A, with no effective cpus.

To fix this issue, introduce cs_is_populated(), which also treats a
cpuset with an attach in progress as populated. Use this new helper in
validate_change() and partition_is_populated().

Fixes: e2d59900d936 ("cgroup/cpuset: Allow no-task partition to have empty cpuset.cpus.effective")
Signed-off-by: Chen Ridong <chenridong@huawei.com>
---
 kernel/cgroup/cpuset.c | 31 +++++++++++++++++++++++--------
 1 file changed, 23 insertions(+), 8 deletions(-)

diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index daf813386260..bd273b1e09b0 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -356,6 +356,15 @@ static inline bool is_in_v2_mode(void)
 	      (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE);
 }
 
+static inline bool cs_is_populated(struct cpuset *cs)
+{
+	lockdep_assert_held(&cpuset_mutex);
+
+	/* Cpusets in the process of attaching should be considered as populated */
+	return cgroup_is_populated(cs->css.cgroup) ||
+		cs->attach_in_progress;
+}
+
 /**
  * partition_is_populated - check if partition has tasks
  * @cs: partition root to be checked
@@ -373,19 +382,25 @@ static inline bool is_in_v2_mode(void)
 static inline bool partition_is_populated(struct cpuset *cs,
 					  struct cpuset *excluded_child)
 {
-	struct cgroup_subsys_state *css;
-	struct cpuset *child;
+	struct cpuset *cp;
+	struct cgroup_subsys_state *pos_css;
 
-	if (cs->css.cgroup->nr_populated_csets)
+	/*
+	 * We cannot call cs_is_populated(cs) directly, as
+	 * nr_populated_domain_children may include populated
+	 * csets from descendants that are partitions.
+	 */
+	if (cs->css.cgroup->nr_populated_csets ||
+	    cs->attach_in_progress)
 		return true;
 
 	rcu_read_lock();
-	cpuset_for_each_child(child, css, cs) {
-		if (child == excluded_child)
+	cpuset_for_each_descendant_pre(cp, pos_css, cs) {
+		if (cp == cs || cp == excluded_child)
 			continue;
-		if (is_partition_valid(child))
+		if (is_partition_valid(cp))
 			continue;
-		if (cgroup_is_populated(child->css.cgroup)) {
+		if (cs_is_populated(cp)) {
 			rcu_read_unlock();
 			return true;
 		}
@@ -670,7 +685,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
 	 * be changed to have empty cpus_allowed or mems_allowed.
 	 */
 	ret = -ENOSPC;
-	if ((cgroup_is_populated(cur->css.cgroup) || cur->attach_in_progress)) {
+	if (cs_is_populated(cur)) {
 		if (!cpumask_empty(cur->cpus_allowed) &&
 		    cpumask_empty(trial->cpus_allowed))
 			goto out;
-- 
2.34.1
Re: [PATCH -next v2] cpuset: Treat cpusets in attaching as populated
Posted by Waiman Long 2 months, 3 weeks ago
On 11/13/25 8:28 AM, Chen Ridong wrote:
> From: Chen Ridong <chenridong@huawei.com>
>
> Currently, the check for whether a partition is populated does not
> account for tasks in the cpuset of attaching. This is a corner case
> that can leave a task stuck in a partition with no effective CPUs.
>
> The race condition occurs as follows:
>
> cpu0				cpu1
> 				//cpuset A  with cpu N
> migrate task p to A
> cpuset_can_attach
> // with effective cpus
> // check ok
>
> // cpuset_mutex is not held	// clear cpuset.cpus.exclusive
> 				// making effective cpus empty
> 				update_exclusive_cpumask
> 				// tasks_nocpu_error check ok
> 				// empty effective cpus, partition valid
> cpuset_attach
> ...
> // task p stays in A, with non-effective cpus.
>
> To fix this issue, this patch introduces cs_is_populated, which considers
> tasks in the attaching cpuset. This new helper is used in validate_change
> and partition_is_populated.
>
> Fixes: e2d59900d936 ("cgroup/cpuset: Allow no-task partition to have empty cpuset.cpus.effective")
> Signed-off-by: Chen Ridong <chenridong@huawei.com>
> ---
>   kernel/cgroup/cpuset.c | 31 +++++++++++++++++++++++--------
>   1 file changed, 23 insertions(+), 8 deletions(-)
>
> diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
> index daf813386260..bd273b1e09b0 100644
> --- a/kernel/cgroup/cpuset.c
> +++ b/kernel/cgroup/cpuset.c
> @@ -356,6 +356,15 @@ static inline bool is_in_v2_mode(void)
>   	      (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE);
>   }
>   
> +static inline bool cs_is_populated(struct cpuset *cs)
Could you name it as "cpuset_is_populated()" as it is a cpuset specific 
version of cgroup_is_populated()?
> +{
> +	lockdep_assert_held(&cpuset_mutex);
> +
> +	/* Cpusets in the process of attaching should be considered as populated */
> +	return cgroup_is_populated(cs->css.cgroup) ||
> +		cs->attach_in_progress;
> +}
> +
>   /**
>    * partition_is_populated - check if partition has tasks
>    * @cs: partition root to be checked
> @@ -373,19 +382,25 @@ static inline bool is_in_v2_mode(void)
>   static inline bool partition_is_populated(struct cpuset *cs,
>   					  struct cpuset *excluded_child)
>   {
> -	struct cgroup_subsys_state *css;
> -	struct cpuset *child;
> +	struct cpuset *cp;
> +	struct cgroup_subsys_state *pos_css;
>   
> -	if (cs->css.cgroup->nr_populated_csets)
> +	/*
> +	 * We cannot call cs_is_populated(cs) directly, as
> +	 * nr_populated_domain_children may include populated
> +	 * csets from descendants that are partitions.
> +	 */
> +	if (cs->css.cgroup->nr_populated_csets ||
> +	    cs->attach_in_progress)
>   		return true;
>   
>   	rcu_read_lock();
> -	cpuset_for_each_child(child, css, cs) {
> -		if (child == excluded_child)
> +	cpuset_for_each_descendant_pre(cp, pos_css, cs) {
> +		if (cp == cs || cp == excluded_child)
>   			continue;
> -		if (is_partition_valid(child))
> +		if (is_partition_valid(cp))

You should add " pos_css = css_rightmost_descendant(pos_css);" to skip 
the whole subtree.

Cheers,
Longman


>   			continue;
> -		if (cgroup_is_populated(child->css.cgroup)) {
> +		if (cs_is_populated(cp)) {
>   			rcu_read_unlock();
>   			return true;
>   		}
> @@ -670,7 +685,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
>   	 * be changed to have empty cpus_allowed or mems_allowed.
>   	 */
>   	ret = -ENOSPC;
> -	if ((cgroup_is_populated(cur->css.cgroup) || cur->attach_in_progress)) {
> +	if (cs_is_populated(cur)) {
>   		if (!cpumask_empty(cur->cpus_allowed) &&
>   		    cpumask_empty(trial->cpus_allowed))
>   			goto out;
Re: [PATCH -next v2] cpuset: Treat cpusets in attaching as populated
Posted by Chen Ridong 2 months, 3 weeks ago

On 2025/11/13 22:14, Waiman Long wrote:
> On 11/13/25 8:28 AM, Chen Ridong wrote:
>> From: Chen Ridong <chenridong@huawei.com>
>>
>> Currently, the check for whether a partition is populated does not
>> account for tasks in the cpuset of attaching. This is a corner case
>> that can leave a task stuck in a partition with no effective CPUs.
>>
>> The race condition occurs as follows:
>>
>> cpu0                cpu1
>>                 //cpuset A  with cpu N
>> migrate task p to A
>> cpuset_can_attach
>> // with effective cpus
>> // check ok
>>
>> // cpuset_mutex is not held    // clear cpuset.cpus.exclusive
>>                 // making effective cpus empty
>>                 update_exclusive_cpumask
>>                 // tasks_nocpu_error check ok
>>                 // empty effective cpus, partition valid
>> cpuset_attach
>> ...
>> // task p stays in A, with non-effective cpus.
>>
>> To fix this issue, this patch introduces cs_is_populated, which considers
>> tasks in the attaching cpuset. This new helper is used in validate_change
>> and partition_is_populated.
>>
>> Fixes: e2d59900d936 ("cgroup/cpuset: Allow no-task partition to have empty cpuset.cpus.effective")
>> Signed-off-by: Chen Ridong <chenridong@huawei.com>
>> ---
>>   kernel/cgroup/cpuset.c | 31 +++++++++++++++++++++++--------
>>   1 file changed, 23 insertions(+), 8 deletions(-)
>>
>> diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
>> index daf813386260..bd273b1e09b0 100644
>> --- a/kernel/cgroup/cpuset.c
>> +++ b/kernel/cgroup/cpuset.c
>> @@ -356,6 +356,15 @@ static inline bool is_in_v2_mode(void)
>>             (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE);
>>   }
>>   +static inline bool cs_is_populated(struct cpuset *cs)
> Could you name it as "cpuset_is_populated()" as it is a cpuset specific version of
> cgroup_is_populated()?

Sure, will update.

>> +{
>> +    lockdep_assert_held(&cpuset_mutex);
>> +
>> +    /* Cpusets in the process of attaching should be considered as populated */
>> +    return cgroup_is_populated(cs->css.cgroup) ||
>> +        cs->attach_in_progress;
>> +}
>> +
>>   /**
>>    * partition_is_populated - check if partition has tasks
>>    * @cs: partition root to be checked
>> @@ -373,19 +382,25 @@ static inline bool is_in_v2_mode(void)
>>   static inline bool partition_is_populated(struct cpuset *cs,
>>                         struct cpuset *excluded_child)
>>   {
>> -    struct cgroup_subsys_state *css;
>> -    struct cpuset *child;
>> +    struct cpuset *cp;
>> +    struct cgroup_subsys_state *pos_css;
>>   -    if (cs->css.cgroup->nr_populated_csets)
>> +    /*
>> +     * We cannot call cs_is_populated(cs) directly, as
>> +     * nr_populated_domain_children may include populated
>> +     * csets from descendants that are partitions.
>> +     */
>> +    if (cs->css.cgroup->nr_populated_csets ||
>> +        cs->attach_in_progress)
>>           return true;
>>         rcu_read_lock();
>> -    cpuset_for_each_child(child, css, cs) {
>> -        if (child == excluded_child)
>> +    cpuset_for_each_descendant_pre(cp, pos_css, cs) {
>> +        if (cp == cs || cp == excluded_child)
>>               continue;
>> -        if (is_partition_valid(child))
>> +        if (is_partition_valid(cp))
> 
> You should add " pos_css = css_rightmost_descendant(pos_css);" to skip the whole subtree.
> 
> Cheers,
> Longman
> 

Oh... you're right, I should have caught this.

Thank you so much, Longman!

> 
>>               continue;
>> -        if (cgroup_is_populated(child->css.cgroup)) {
>> +        if (cs_is_populated(cp)) {
>>               rcu_read_unlock();
>>               return true;
>>           }
>> @@ -670,7 +685,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
>>        * be changed to have empty cpus_allowed or mems_allowed.
>>        */
>>       ret = -ENOSPC;
>> -    if ((cgroup_is_populated(cur->css.cgroup) || cur->attach_in_progress)) {
>> +    if (cs_is_populated(cur)) {
>>           if (!cpumask_empty(cur->cpus_allowed) &&
>>               cpumask_empty(trial->cpus_allowed))
>>               goto out;
> 

-- 
Best regards,
Ridong