Currently, cpuset_attach() synchronously waits for flush_workqueue() to
complete. The time needed to flush cpuset_migrate_mm_wq depends on how
much mm migration cpusets have initiated at that point. When cpuset.mems
of a cgroup occupying a large amount of memory is modified, it may
trigger extensive mm migration, causing cpuset_attach() to block on
flush_workqueue() for an extended period. This is dangerous because
cpuset_attach() runs inside the cgroup_mutex critical section, so all
cgroup-related operations in the system may end up blocked.
Defer the flush_workqueue() operation to a task_work that runs when the
task returns to userspace, as originally proposed by Tejun[1], so that
the flush happens after cgroup_mutex has been dropped. That way the
operation stays synchronous from the writer's point of view while no
longer blocking anyone else.
[1]: https://lore.kernel.org/cgroups/ZgMFPMjZRZCsq9Q-@slm.duckdns.org/T/#m117f606fa24f66f0823a60f211b36f24bd9e1883
Originally-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com>
---
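(Note for reviewers, not meant for the commit log: the deferral relies only
on the generic task_work API. Below is a minimal, self-contained sketch of
the pattern with hypothetical names (my_wq, defer_flush()); the actual
implementation is in the diff that follows.)

#include <linux/slab.h>
#include <linux/task_work.h>
#include <linux/workqueue.h>

static struct workqueue_struct *my_wq;	/* hypothetical workqueue */

static void flush_workfn(struct callback_head *head)
{
	/* Runs when current returns to userspace, after all locks are dropped. */
	flush_workqueue(my_wq);
	kfree(head);		/* the callback owns its allocation */
}

static void defer_flush(void)
{
	struct callback_head *cb = kzalloc(sizeof(*cb), GFP_KERNEL);

	if (!cb)
		return;		/* best effort: skip the flush on allocation failure */

	init_task_work(cb, flush_workfn);
	/* TWA_RESUME: invoke the callback on the next return to userspace. */
	if (task_work_add(current, cb, TWA_RESUME))
		kfree(cb);	/* task is exiting, the work will never run */
}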
kernel/cgroup/cpuset.c | 29 ++++++++++++++++++++++++-----
1 file changed, 24 insertions(+), 5 deletions(-)
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 3d8492581c8c4..ceb467079e41f 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -40,6 +40,7 @@
#include <linux/sched/isolation.h>
#include <linux/wait.h>
#include <linux/workqueue.h>
+#include <linux/task_work.h>
DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);
@@ -2582,9 +2583,24 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
}
}
-static void cpuset_post_attach(void)
+static void flush_migrate_mm_task_workfn(struct callback_head *head)
{
flush_workqueue(cpuset_migrate_mm_wq);
+ kfree(head);
+}
+
+static void schedule_flush_migrate_mm(void)
+{
+ struct callback_head *flush_cb;
+
+ flush_cb = kzalloc(sizeof(struct callback_head), GFP_KERNEL);
+ if (!flush_cb)
+ return;
+
+ init_task_work(flush_cb, flush_migrate_mm_task_workfn);
+
+ if (task_work_add(current, flush_cb, TWA_RESUME))
+ kfree(flush_cb);
}
/*
@@ -3141,6 +3157,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
struct cpuset *cs;
struct cpuset *oldcs = cpuset_attach_old_cs;
bool cpus_updated, mems_updated;
+ bool queue_task_work = false;
cgroup_taskset_first(tset, &css);
cs = css_cs(css);
@@ -3191,15 +3208,18 @@ static void cpuset_attach(struct cgroup_taskset *tset)
* @old_mems_allowed is the right nodesets that we
* migrate mm from.
*/
- if (is_memory_migrate(cs))
+ if (is_memory_migrate(cs)) {
cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
&cpuset_attach_nodemask_to);
- else
+ queue_task_work = true;
+ } else
mmput(mm);
}
}
out:
+ if (queue_task_work)
+ schedule_flush_migrate_mm();
cs->old_mems_allowed = cpuset_attach_nodemask_to;
if (cs->nr_migrate_dl_tasks) {
@@ -3257,7 +3277,7 @@ ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
mutex_unlock(&cpuset_mutex);
cpus_read_unlock();
if (of_cft(of)->private == FILE_MEMLIST)
- flush_workqueue(cpuset_migrate_mm_wq);
+ schedule_flush_migrate_mm();
return retval ?: nbytes;
}
@@ -3725,7 +3745,6 @@ struct cgroup_subsys cpuset_cgrp_subsys = {
.can_attach = cpuset_can_attach,
.cancel_attach = cpuset_cancel_attach,
.attach = cpuset_attach,
- .post_attach = cpuset_post_attach,
.bind = cpuset_bind,
.can_fork = cpuset_can_fork,
.cancel_fork = cpuset_cancel_fork,
--
2.20.1
On 9/4/25 3:45 AM, Chuyi Zhou wrote:
> [patch quoted above snipped]

Reviewed-by: Waiman Long <longman@redhat.com>