kernel/cgroup/cpuset.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-)
From: Chen Ridong <chenridong@huawei.com>
A test scenario revealed inconsistent results based on operation order:
Scenario 1:
#cd /sys/fs/cgroup/
#mkdir A1
#mkdir B1
#echo 1-2 > B1/cpuset.cpus
#echo 0-1 > A1/cpuset.cpus
#echo root > A1/cpuset.cpus.partition
#cat A1/cpuset.cpus.partition
root invalid (Cpu list in cpuset.cpus not exclusive)
Scenario 2:
#cd /sys/fs/cgroup/
#mkdir A1
#mkdir B1
#echo 1-2 > B1/cpuset.cpus
#echo root > A1/cpuset.cpus.partition
#echo 0-1 > A1/cpuset.cpus
#cat A1/cpuset.cpus.partition
root
The second scenario produces an unexpected result: A1 should be marked
as invalid but is incorrectly recognized as valid. This occurs because
when validate_change is invoked, A1 (in root-invalid state) may
automatically transition to a valid partition, with non-exclusive state
checks against siblings, leading to incorrect validation.
To fix this inconsistency, treat trialcs in root-invalid state as exclusive
during validation and set the corresponding exclusive flags, ensuring
consistent behavior regardless of operation order.
Signed-off-by: Chen Ridong <chenridong@huawei.com>
---
kernel/cgroup/cpuset.c | 19 ++++++++++++++-----
1 file changed, 14 insertions(+), 5 deletions(-)
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index daf813386260..a189f356b5f1 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -2526,6 +2526,18 @@ static void partition_cpus_change(struct cpuset *cs, struct cpuset *trialcs,
}
}
+static int init_trialcs(struct cpuset *cs, struct cpuset *trialcs)
+{
+ trialcs->prs_err = PERR_NONE;
+ /*
+ * If partition_root_state != 0, it may automatically change to a partition,
+ * Therefore, we should treat trialcs as exclusive during validation
+ */
+ if (trialcs->partition_root_state)
+ set_bit(CS_CPU_EXCLUSIVE, &trialcs->flags);
+ return compute_trialcs_excpus(trialcs, cs);
+}
+
/**
* update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
* @cs: the cpuset to consider
@@ -2551,9 +2563,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (alloc_tmpmasks(&tmp))
return -ENOMEM;
- compute_trialcs_excpus(trialcs, cs);
- trialcs->prs_err = PERR_NONE;
-
+ init_trialcs(cs, trialcs);
retval = cpus_allowed_validate_change(cs, trialcs, &tmp);
if (retval < 0)
goto out_free;
@@ -2612,7 +2622,7 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
* Reject the change if there is exclusive CPUs conflict with
* the siblings.
*/
- if (compute_trialcs_excpus(trialcs, cs))
+ if (init_trialcs(cs, trialcs))
return -EINVAL;
/*
@@ -2628,7 +2638,6 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (alloc_tmpmasks(&tmp))
return -ENOMEM;
- trialcs->prs_err = PERR_NONE;
partition_cpus_change(cs, trialcs, &tmp);
spin_lock_irq(&callback_lock);
--
2.34.1
On 11/15/25 4:31 AM, Chen Ridong wrote:
> From: Chen Ridong <chenridong@huawei.com>
>
> A test scenario revealed inconsistent results based on operation order:
> Scenario 1:
> #cd /sys/fs/cgroup/
> #mkdir A1
> #mkdir B1
> #echo 1-2 > B1/cpuset.cpus
> #echo 0-1 > A1/cpuset.cpus
> #echo root > A1/cpuset.cpus.partition
> #cat A1/cpuset.cpus.partition
> root invalid (Cpu list in cpuset.cpus not exclusive)
>
> Scenario 2:
> #cd /sys/fs/cgroup/
> #mkdir A1
> #mkdir B1
> #echo 1-2 > B1/cpuset.cpus
> #echo root > A1/cpuset.cpus.partition
> #echo 0-1 > A1/cpuset.cpus
> #cat A1/cpuset.cpus.partition
> root
>
> The second scenario produces an unexpected result: A1 should be marked
> as invalid but is incorrectly recognized as valid. This occurs because
> when validate_change is invoked, A1 (in root-invalid state) may
> automatically transition to a valid partition, with non-exclusive state
> checks against siblings, leading to incorrect validation.
>
> To fix this inconsistency, treat trialcs in root-invalid state as exclusive
> during validation and set the corresponding exclusive flags, ensuring
> consistent behavior regardless of operation order.
>
> Signed-off-by: Chen Ridong <chenridong@huawei.com>
> ---
> kernel/cgroup/cpuset.c | 19 ++++++++++++++-----
> 1 file changed, 14 insertions(+), 5 deletions(-)
>
> diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
> index daf813386260..a189f356b5f1 100644
> --- a/kernel/cgroup/cpuset.c
> +++ b/kernel/cgroup/cpuset.c
> @@ -2526,6 +2526,18 @@ static void partition_cpus_change(struct cpuset *cs, struct cpuset *trialcs,
> }
> }
>
> +static int init_trialcs(struct cpuset *cs, struct cpuset *trialcs)
> +{
> + trialcs->prs_err = PERR_NONE;
> + /*
> + * If partition_root_state != 0, it may automatically change to a partition,
> + * Therefore, we should treat trialcs as exclusive during validation
> + */
> + if (trialcs->partition_root_state)
> + set_bit(CS_CPU_EXCLUSIVE, &trialcs->flags);
Nit: We usually use the non-atomic version __set_bit() if concurrent
access isn't possible which is true in this case.
> + return compute_trialcs_excpus(trialcs, cs);
> +}
> +
> /**
> * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
> * @cs: the cpuset to consider
> @@ -2551,9 +2563,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
> if (alloc_tmpmasks(&tmp))
> return -ENOMEM;
>
> - compute_trialcs_excpus(trialcs, cs);
> - trialcs->prs_err = PERR_NONE;
> -
> + init_trialcs(cs, trialcs);
> retval = cpus_allowed_validate_change(cs, trialcs, &tmp);
> if (retval < 0)
> goto out_free;
> @@ -2612,7 +2622,7 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
> * Reject the change if there is exclusive CPUs conflict with
> * the siblings.
> */
> - if (compute_trialcs_excpus(trialcs, cs))
> + if (init_trialcs(cs, trialcs))
> return -EINVAL;
>
> /*
> @@ -2628,7 +2638,6 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
> if (alloc_tmpmasks(&tmp))
> return -ENOMEM;
>
> - trialcs->prs_err = PERR_NONE;
> partition_cpus_change(cs, trialcs, &tmp);
>
> spin_lock_irq(&callback_lock);
Acked-by: Waiman Long <longman@redhat.com>
On 2025/11/15 09:31, Chen Ridong wrote:
>A test scenario revealed inconsistent results based on operation order:
>Scenario 1:
> #cd /sys/fs/cgroup/
> #mkdir A1
> #mkdir B1
> #echo 1-2 > B1/cpuset.cpus
> #echo 0-1 > A1/cpuset.cpus
> #echo root > A1/cpuset.cpus.partition
> #cat A1/cpuset.cpus.partition
> root invalid (Cpu list in cpuset.cpus not exclusive)
>
>Scenario 2:
> #cd /sys/fs/cgroup/
> #mkdir A1
> #mkdir B1
> #echo 1-2 > B1/cpuset.cpus
> #echo root > A1/cpuset.cpus.partition
> #echo 0-1 > A1/cpuset.cpus
> #cat A1/cpuset.cpus.partition
> root
>
>The second scenario produces an unexpected result: A1 should be marked
>as invalid but is incorrectly recognized as valid. This occurs because
>when validate_change is invoked, A1 (in root-invalid state) may
>automatically transition to a valid partition, with non-exclusive state
>checks against siblings, leading to incorrect validation.
>
>To fix this inconsistency, treat trialcs in root-invalid state as exclusive
>during validation and set the corresponding exclusive flags, ensuring
>consistent behavior regardless of operation order.
>
>Signed-off-by: Chen Ridong <chenridong@huawei.com>
>---
> kernel/cgroup/cpuset.c | 19 ++++++++++++++-----
> 1 file changed, 14 insertions(+), 5 deletions(-)
>
>diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
>index daf813386260..a189f356b5f1 100644
>--- a/kernel/cgroup/cpuset.c
>+++ b/kernel/cgroup/cpuset.c
>@@ -2526,6 +2526,18 @@ static void partition_cpus_change(struct cpuset *cs, struct cpuset *trialcs,
> }
> }
>
>+static int init_trialcs(struct cpuset *cs, struct cpuset *trialcs)
>+{
>+ trialcs->prs_err = PERR_NONE;
>+ /*
>+ * If partition_root_state != 0, it may automatically change to a partition,
>+ * Therefore, we should treat trialcs as exclusive during validation
>+ */
>+ if (trialcs->partition_root_state)
>+ set_bit(CS_CPU_EXCLUSIVE, &trialcs->flags);
>+ return compute_trialcs_excpus(trialcs, cs);
>+}
>+
> /**
> * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
> * @cs: the cpuset to consider
>@@ -2551,9 +2563,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
> if (alloc_tmpmasks(&tmp))
> return -ENOMEM;
>
>- compute_trialcs_excpus(trialcs, cs);
>- trialcs->prs_err = PERR_NONE;
>-
>+ init_trialcs(cs, trialcs);
> retval = cpus_allowed_validate_change(cs, trialcs, &tmp);
> if (retval < 0)
> goto out_free;
>@@ -2612,7 +2622,7 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
> * Reject the change if there is exclusive CPUs conflict with
> * the siblings.
> */
>- if (compute_trialcs_excpus(trialcs, cs))
>+ if (init_trialcs(cs, trialcs))
> return -EINVAL;
>
> /*
>@@ -2628,7 +2638,6 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
> if (alloc_tmpmasks(&tmp))
> return -ENOMEM;
>
>- trialcs->prs_err = PERR_NONE;
> partition_cpus_change(cs, trialcs, &tmp);
>
> spin_lock_irq(&callback_lock);
Hi, Ridong,
Maybe, this patch does not apply to the following cases:
Step
#1> echo "root" > A1/cpuset.cpus.partition
#1> echo "0-1" > B1/cpuset.cpus
#2> echo "1-2" > A1/cpuset.cpus.exclusive -> return error
It should return success here.
Please consider the following modification.
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 52468d2c178a..b4085438368c 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -609,6 +609,9 @@ static inline bool cpus_excl_conflict(struct cpuset *cs1, struct cpuset *cs2)
cpumask_subset(cs2->cpus_allowed, cs1->exclusive_cpus))
return true;
+ if (cpumask_empty(cs1->exclusive_cpus))
+ return cpumask_intersects(cs1->cpus_allowed, cs2->cpus_allowed);
+
return false;
}
Thanks,
Sun Shaojie
On 2025/11/17 12:35, Sun Shaojie wrote:
> On 2025/11/15 09:31, Chen Ridong wrote:
>> A test scenario revealed inconsistent results based on operation order:
>> Scenario 1:
>> #cd /sys/fs/cgroup/
>> #mkdir A1
>> #mkdir B1
>> #echo 1-2 > B1/cpuset.cpus
>> #echo 0-1 > A1/cpuset.cpus
>> #echo root > A1/cpuset.cpus.partition
>> #cat A1/cpuset.cpus.partition
>> root invalid (Cpu list in cpuset.cpus not exclusive)
>>
>> Scenario 2:
>> #cd /sys/fs/cgroup/
>> #mkdir A1
>> #mkdir B1
>> #echo 1-2 > B1/cpuset.cpus
>> #echo root > A1/cpuset.cpus.partition
>> #echo 0-1 > A1/cpuset.cpus
>> #cat A1/cpuset.cpus.partition
>> root
>>
>> The second scenario produces an unexpected result: A1 should be marked
>> as invalid but is incorrectly recognized as valid. This occurs because
>> when validate_change is invoked, A1 (in root-invalid state) may
>> automatically transition to a valid partition, with non-exclusive state
>> checks against siblings, leading to incorrect validation.
>>
>> To fix this inconsistency, treat trialcs in root-invalid state as exclusive
>> during validation and set the corresponding exclusive flags, ensuring
>> consistent behavior regardless of operation order.
>>
>> Signed-off-by: Chen Ridong <chenridong@huawei.com>
>> ---
>> kernel/cgroup/cpuset.c | 19 ++++++++++++++-----
>> 1 file changed, 14 insertions(+), 5 deletions(-)
>>
>> diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
>> index daf813386260..a189f356b5f1 100644
>> --- a/kernel/cgroup/cpuset.c
>> +++ b/kernel/cgroup/cpuset.c
>> @@ -2526,6 +2526,18 @@ static void partition_cpus_change(struct cpuset *cs, struct cpuset *trialcs,
>> }
>> }
>>
>> +static int init_trialcs(struct cpuset *cs, struct cpuset *trialcs)
>> +{
>> + trialcs->prs_err = PERR_NONE;
>> + /*
>> + * If partition_root_state != 0, it may automatically change to a partition,
>> + * Therefore, we should treat trialcs as exclusive during validation
>> + */
>> + if (trialcs->partition_root_state)
>> + set_bit(CS_CPU_EXCLUSIVE, &trialcs->flags);
>> + return compute_trialcs_excpus(trialcs, cs);
>> +}
>> +
>> /**
>> * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
>> * @cs: the cpuset to consider
>> @@ -2551,9 +2563,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
>> if (alloc_tmpmasks(&tmp))
>> return -ENOMEM;
>>
>> - compute_trialcs_excpus(trialcs, cs);
>> - trialcs->prs_err = PERR_NONE;
>> -
>> + init_trialcs(cs, trialcs);
>> retval = cpus_allowed_validate_change(cs, trialcs, &tmp);
>> if (retval < 0)
>> goto out_free;
>> @@ -2612,7 +2622,7 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
>> * Reject the change if there is exclusive CPUs conflict with
>> * the siblings.
>> */
>> - if (compute_trialcs_excpus(trialcs, cs))
>> + if (init_trialcs(cs, trialcs))
>> return -EINVAL;
>>
>> /*
>> @@ -2628,7 +2638,6 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
>> if (alloc_tmpmasks(&tmp))
>> return -ENOMEM;
>>
>> - trialcs->prs_err = PERR_NONE;
>> partition_cpus_change(cs, trialcs, &tmp);
>>
>> spin_lock_irq(&callback_lock);
>
> Hi, Ridong,
>
> Maybe, this patch does not apply to the following cases:
> Step
> #1> echo "root" > A1/cpuset.cpus.partition
> #1> echo "0-1" > B1/cpuset.cpus
> #2> echo "1-2" > A1/cpuset.cpus.exclusive -> return error
> It should return success here.
>
> Please consider the following modification.
>
If A1 will automatically change to a valid partition, I think it should return error.
Thanks.
--
Best regards,
Ridong
On 2025/11/15 14:23, Chen Ridong wrote: >On 2025/11/17 12:35, Sun Shaojie wrote: >> Hi, Ridong, >> >> Maybe, this patch does not apply to the following cases: >> Step >> #1> echo "root" > A1/cpuset.cpus.partition >> #1> echo "0-1" > B1/cpuset.cpus >> #2> echo "1-2" > A1/cpuset.cpus.exclusive -> return error >> It should return success here. >> >> Please consider the following modification. >> > >If A1 will automatically change to a valid partition, I think it should return error. Hi, Ridong, A1 will not automatically change to a valid partition. Perhaps this example is more intuitive. For example: Before apply this patch: #1> echo "0-1" > B1/cpuset.cpus #2> echo "root" > A1/cpuset.cpus.partition -> A1's prstate is "root invalid" #3> echo "1-2" > A1/cpuset.cpus.exclusive Return success, and A1's prstate is "root invalid" After apply this patch: #1> echo "0-1" > B1/cpuset.cpus #2> echo "root" > A1/cpuset.cpus.partition -> A1's prstate is "root invalid" #3> echo "1-2" > A1/cpuset.cpus.exclusive Return error, and A1's prstate is "root invalid" It should return success here. Because A1's prstate is "root invalid. For this example, the behavior should remain consistent before and after applying the patch. This is because when A1 is in the "root invalid" state, its behavior is equivalent to that of a "member," meaning A1's cpuset.cpus.exclusive and B1's cpuset.cpus are allowed to overlap. Thanks, Sun Shaojie
On 2025/11/17 14:53, Sun Shaojie wrote: > On 2025/11/15 14:23, Chen Ridong wrote: >> On 2025/11/17 12:35, Sun Shaojie wrote: >>> Hi, Ridong, >>> >>> Maybe, this patch does not apply to the following cases: >>> Step >>> #1> echo "root" > A1/cpuset.cpus.partition >>> #1> echo "0-1" > B1/cpuset.cpus >>> #2> echo "1-2" > A1/cpuset.cpus.exclusive -> return error >>> It should return success here. >>> >>> Please consider the following modification. >>> >> >> If A1 will automatically change to a valid partition, I think it should return error. > > Hi, Ridong, > > A1 will not automatically change to a valid partition. > > Perhaps this example is more intuitive. > > For example: > > Before apply this patch: > #1> echo "0-1" > B1/cpuset.cpus > #2> echo "root" > A1/cpuset.cpus.partition -> A1's prstate is "root invalid" > #3> echo "1-2" > A1/cpuset.cpus.exclusive > Return success, and A1's prstate is "root invalid" > I did not apply your patch to test my patch. here are the results I obtained: # cd /sys/fs/cgroup/ # mkdir A1 # mkdir B1 # echo 0-1 > B1/cpuset.cpus # echo root > A1/cpuset.cpus.partition # cat A1/cpuset.cpus.partition root invalid (cpuset.cpus and cpuset.cpus.exclusive are empty) # echo 1-2 > A1/cpuset.cpus # cat A1/cpuset.cpus.partition root This differs from the results you provided. Never mind, let's focus on whether the rule should be relaxed in your patch. Once that's resolved, I can resubmit my patch. Let's set this patch aside for now. Thanks. -- Best regards, Ridong
On 2025/11/15 15:41, Chen Ridong wrote: >> Our product need ensure the following behavior: in cgroup-v2, user >> modifications to one cpuset should not affect the partition state of its >> sibling cpusets. This is justified and meaningful, as it aligns with the >> isolation characteristics of cgroups. >> > >This is ideal in theory, but I don’t think it’s practical in reality. > >> This can be divided into two scenarios: >> Scenario 1: Only one of A1 and B1 is "root". >> Scenario 2: Both A1 and B1 are "root". >> >> We plan to implement Scenario 1 first. This is the goal of patch v2. >> However, patch v2 is flawed because it does not strictly adhere to the >> following existing rule. >> >> However, it is worth noting that the current cgroup v2 implementation does >> not strictly adhere to the following rule either (which is also an >> objective for patch v3 to address). >> >> Rule 1: "cpuset.cpus" cannot be a subset of a sibling's "cpuset.cpus.exclusive". >> >> Using your example to illustrate. >> Step (refer to the steps in the table below) >> #1> mkdir -p A1 >> #2> echo "0-1" > A1/cpuset.cpus.exclusive >> #3> echo "root" > A1/cpuset.cpus.partition >> #4> mkdir -p B1 >> #5> echo "0" > B1/cpuset.cpus >> >> Table 1: Current result >> Step | return | A1's excl_cpus | B1's cpus | A1's prstate | B1's prstate | >> #1 | 0 | | | member | | >> #2 | 0 | 0-1 | | member | | >> #3 | 0 | 0-1 | | root | | >> #4 | 0 | 0-1 | | root | member | >> #5 | 0 | 0-1 | 0 | root invalid | member | >> > >I think this what we expect. > >> Table 2: Expected result >> Step | return | A1's excl_cpus | B1's cpus | A1's prstate | B1's prstate | >> #1 | 0 | | | member | | >> #2 | 0 | 0-1 | | member | | >> #3 | 0 | 0-1 | | root | | >> #4 | 0 | 0-1 | | root | member | >> #5 | error | 0-1 | | root | member | >> > >Step 5 should not return an error. As Longman pointed out, in cgroup-v2, setting cpuset.cpus should >never fail. Hi, Ridong, Thank you for your correction. Will update. Thanks, Sunshaojie.
© 2016 - 2026 Red Hat, Inc.