It was found that the user requested affinity via sched_setaffinity()
can be easily overwritten by other kernel subsystems without an easy way
to reset it back to what the user requested. For example, any change
to the current cpuset hierarchy may reset the cpumask of the tasks in
the affected cpusets to the default cpuset value even if those tasks
have pre-existing user requested affinity. That is especially easy to
trigger under a cgroup v2 environment where writing "+cpuset" to the
root cgroup's cgroup.subtree_control file will reset the cpus affinity
of all the processes in the system.
That is problematic in a nohz_full environment where the tasks running
in the nohz_full CPUs usually have their cpus affinity explicitly set
and will behave incorrectly if cpus affinity changes.
Fix this problem by looking at user_cpus_ptr in __set_cpus_allowed_ptr()
and using it to restrict the given cpumask unless there is no overlap. In
that case, it will fall back to the given one. The SCA_USER flag is
reused to indicate intent to set user_cpus_ptr and so user_cpus_ptr
masking should be skipped. In addition, masking should also be skipped
if any of the SCA_MIGRATE_* flags is set.
All callers of set_cpus_allowed_ptr() will be affected by this change.
A scratch cpumask is added to percpu runqueues structure for doing
additional masking when user_cpus_ptr is set.
Signed-off-by: Waiman Long <longman@redhat.com>
---
kernel/sched/core.c | 22 +++++++++++++++++-----
kernel/sched/sched.h | 3 +++
2 files changed, 20 insertions(+), 5 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c7c0425974c2..ab8e591dbaf5 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2932,6 +2932,15 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
struct rq *rq;
rq = task_rq_lock(p, &rf);
+ /*
+ * Masking should be skipped if SCA_USER or any of the SCA_MIGRATE_*
+ * flags are set.
+ */
+ if (p->user_cpus_ptr &&
+ !(flags & (SCA_USER | SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) &&
+ cpumask_and(rq->scratch_mask, new_mask, p->user_cpus_ptr))
+ new_mask = rq->scratch_mask;
+
return __set_cpus_allowed_ptr_locked(p, new_mask, flags, rq, &rf);
}
@@ -3028,7 +3037,7 @@ void force_compatible_cpus_allowed_ptr(struct task_struct *p)
}
static int
-__sched_setaffinity(struct task_struct *p, const struct cpumask *mask);
+__sched_setaffinity(struct task_struct *p, const struct cpumask *mask, int flags);
/*
* Restore the affinity of a task @p which was previously restricted by a
@@ -3045,7 +3054,7 @@ void relax_compatible_cpus_allowed_ptr(struct task_struct *p)
* Try to restore the old affinity mask with __sched_setaffinity().
* Cpuset masking will be done there too.
*/
- ret = __sched_setaffinity(p, task_user_cpus(p));
+ ret = __sched_setaffinity(p, task_user_cpus(p), 0);
WARN_ON_ONCE(ret);
}
@@ -8049,7 +8058,7 @@ int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
#endif
static int
-__sched_setaffinity(struct task_struct *p, const struct cpumask *mask)
+__sched_setaffinity(struct task_struct *p, const struct cpumask *mask, int flags)
{
int retval;
cpumask_var_t cpus_allowed, new_mask;
@@ -8069,7 +8078,7 @@ __sched_setaffinity(struct task_struct *p, const struct cpumask *mask)
if (retval)
goto out_free_new_mask;
again:
- retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK);
+ retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK | flags);
if (retval)
goto out_free_new_mask;
@@ -8134,7 +8143,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
}
cpumask_copy(user_mask, in_mask);
- retval = __sched_setaffinity(p, in_mask);
+ retval = __sched_setaffinity(p, in_mask, SCA_USER);
/*
* Save in_mask into user_cpus_ptr after a successful
@@ -9647,6 +9656,9 @@ void __init sched_init(void)
cpumask_size(), GFP_KERNEL, cpu_to_node(i));
per_cpu(select_rq_mask, i) = (cpumask_var_t)kzalloc_node(
cpumask_size(), GFP_KERNEL, cpu_to_node(i));
+ per_cpu(runqueues.scratch_mask, i) =
+ (cpumask_var_t)kzalloc_node(cpumask_size(),
+ GFP_KERNEL, cpu_to_node(i));
}
#endif /* CONFIG_CPUMASK_OFFSTACK */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ac235bc8ef08..482b702d65ea 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1159,6 +1159,9 @@ struct rq {
unsigned int core_forceidle_occupation;
u64 core_forceidle_start;
#endif
+
+ /* Scratch cpumask to be temporarily used under rq_lock */
+ cpumask_var_t scratch_mask;
};
#ifdef CONFIG_FAIR_GROUP_SCHED
--
2.31.1
On Thu, Sep 22, 2022 at 02:00:39PM -0400, Waiman Long wrote: > @@ -9647,6 +9656,9 @@ void __init sched_init(void) > cpumask_size(), GFP_KERNEL, cpu_to_node(i)); > per_cpu(select_rq_mask, i) = (cpumask_var_t)kzalloc_node( > cpumask_size(), GFP_KERNEL, cpu_to_node(i)); > + per_cpu(runqueues.scratch_mask, i) = > + (cpumask_var_t)kzalloc_node(cpumask_size(), > + GFP_KERNEL, cpu_to_node(i)); > } > #endif /* CONFIG_CPUMASK_OFFSTACK */ > That doesn't actually apply; I've made it: --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -9748,6 +9748,7 @@ void __init sched_init(void) rq->core_cookie = 0UL; #endif + zalloc_cpumask_var_node(&per_cpu(runqueues.scratch_mask, i), GFP_KERNEL, cpu_to_node(i)); } set_load_weight(&init_task, false);
On 10/7/22 06:01, Peter Zijlstra wrote: > On Thu, Sep 22, 2022 at 02:00:39PM -0400, Waiman Long wrote: >> @@ -9647,6 +9656,9 @@ void __init sched_init(void) >> cpumask_size(), GFP_KERNEL, cpu_to_node(i)); >> per_cpu(select_rq_mask, i) = (cpumask_var_t)kzalloc_node( >> cpumask_size(), GFP_KERNEL, cpu_to_node(i)); >> + per_cpu(runqueues.scratch_mask, i) = >> + (cpumask_var_t)kzalloc_node(cpumask_size(), >> + GFP_KERNEL, cpu_to_node(i)); >> } >> #endif /* CONFIG_CPUMASK_OFFSTACK */ >> > That doesn't actually apply; I've made it: > > --- a/kernel/sched/core.c > +++ b/kernel/sched/core.c > @@ -9748,6 +9748,7 @@ void __init sched_init(void) > > rq->core_cookie = 0UL; > #endif > + zalloc_cpumask_var_node(&per_cpu(runqueues.scratch_mask, i), GFP_KERNEL, cpu_to_node(i)); > } > > set_load_weight(&init_task, false); > Sorry, I should have worked on the latest tip tree instead. Thanks, Longman
On 10/7/22 10:57, Waiman Long wrote: > > On 10/7/22 06:01, Peter Zijlstra wrote: >> On Thu, Sep 22, 2022 at 02:00:39PM -0400, Waiman Long wrote: >>> @@ -9647,6 +9656,9 @@ void __init sched_init(void) >>> cpumask_size(), GFP_KERNEL, cpu_to_node(i)); >>> per_cpu(select_rq_mask, i) = (cpumask_var_t)kzalloc_node( >>> cpumask_size(), GFP_KERNEL, cpu_to_node(i)); >>> + per_cpu(runqueues.scratch_mask, i) = >>> + (cpumask_var_t)kzalloc_node(cpumask_size(), >>> + GFP_KERNEL, cpu_to_node(i)); >>> } >>> #endif /* CONFIG_CPUMASK_OFFSTACK */ >> That doesn't actually apply; I've made it: >> >> --- a/kernel/sched/core.c >> +++ b/kernel/sched/core.c >> @@ -9748,6 +9748,7 @@ void __init sched_init(void) >> rq->core_cookie = 0UL; >> #endif >> + zalloc_cpumask_var_node(&per_cpu(runqueues.scratch_mask, i), >> GFP_KERNEL, cpu_to_node(i)); >> } >> set_load_weight(&init_task, false); >> > Sorry, I should have worked on the latest tip tree instead. To be consistent with the surrounding context, it may be better to change it to + zalloc_cpumask_var_node(rq->scratch_mask, GFP_KERNEL, cpu_to_node(i)); Cheers, Longman
The following commit has been merged into the sched/core branch of tip:
Commit-ID: da019032819a1f09943d3af676892ec8c627668e
Gitweb: https://git.kernel.org/tip/da019032819a1f09943d3af676892ec8c627668e
Author: Waiman Long <longman@redhat.com>
AuthorDate: Thu, 22 Sep 2022 14:00:39 -04:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Thu, 27 Oct 2022 11:01:22 +02:00
sched: Enforce user requested affinity
It was found that the user requested affinity via sched_setaffinity()
can be easily overwritten by other kernel subsystems without an easy way
to reset it back to what the user requested. For example, any change
to the current cpuset hierarchy may reset the cpumask of the tasks in
the affected cpusets to the default cpuset value even if those tasks
have pre-existing user requested affinity. That is especially easy to
trigger under a cgroup v2 environment where writing "+cpuset" to the
root cgroup's cgroup.subtree_control file will reset the cpus affinity
of all the processes in the system.
That is problematic in a nohz_full environment where the tasks running
in the nohz_full CPUs usually have their cpus affinity explicitly set
and will behave incorrectly if cpus affinity changes.
Fix this problem by looking at user_cpus_ptr in __set_cpus_allowed_ptr()
and using it to restrict the given cpumask unless there is no overlap. In
that case, it will fall back to the given one. The SCA_USER flag is
reused to indicate intent to set user_cpus_ptr and so user_cpus_ptr
masking should be skipped. In addition, masking should also be skipped
if any of the SCA_MIGRATE_* flags is set.
All callers of set_cpus_allowed_ptr() will be affected by this change.
A scratch cpumask is added to percpu runqueues structure for doing
additional masking when user_cpus_ptr is set.
Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20220922180041.1768141-4-longman@redhat.com
---
kernel/sched/core.c | 10 ++++++++++
kernel/sched/sched.h | 3 +++
2 files changed, 13 insertions(+)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 67fb0e4..283bdbd 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2949,6 +2949,15 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
struct rq *rq;
rq = task_rq_lock(p, &rf);
+ /*
+ * Masking should be skipped if SCA_USER or any of the SCA_MIGRATE_*
+ * flags are set.
+ */
+ if (p->user_cpus_ptr &&
+ !(ctx->flags & (SCA_USER | SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) &&
+ cpumask_and(rq->scratch_mask, ctx->new_mask, p->user_cpus_ptr))
+ ctx->new_mask = rq->scratch_mask;
+
return __set_cpus_allowed_ptr_locked(p, ctx, rq, &rf);
}
@@ -9804,6 +9813,7 @@ void __init sched_init(void)
rq->core_cookie = 0UL;
#endif
+ zalloc_cpumask_var_node(&rq->scratch_mask, GFP_KERNEL, cpu_to_node(i));
}
set_load_weight(&init_task, false);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 04f571d..771f8dd 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1151,6 +1151,9 @@ struct rq {
unsigned int core_forceidle_occupation;
u64 core_forceidle_start;
#endif
+
+ /* Scratch cpumask to be temporarily used under rq_lock */
+ cpumask_var_t scratch_mask;
};
#ifdef CONFIG_FAIR_GROUP_SCHED
© 2016 - 2026 Red Hat, Inc.