[PATCH 2/3] cpuset: Defer flushing of the cpuset_migrate_mm_wq to task_work

Posted by Chuyi Zhou 4 weeks ago
Currently, the cpuset attach path synchronously waits for
cpuset_migrate_mm_wq to be flushed. How long that flush takes depends on
how much mm migration cpusets have queued at that point: when the
cpuset.mems of a cgroup occupying a large amount of memory is modified,
it can trigger extensive mm migration, and the attach path then blocks
on flush_workqueue() for an extended period. This is dangerous because
the flush happens inside the cgroup_mutex critical section, so it can
end up stalling every cgroup operation in the system.
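
Roughly, the offending path before this patch (call chain abridged):

	cgroup_procs_write()
	  cgroup_kn_lock_live()              /* takes cgroup_mutex */
	  cgroup_attach_task()
	    cpuset_attach()                  /* queues work on cpuset_migrate_mm_wq */
	  cgroup_procs_write_finish()
	    cpuset_post_attach()             /* ->post_attach() */
	      flush_workqueue(cpuset_migrate_mm_wq)   /* may block for a long time */
	  cgroup_kn_unlock()                 /* only here is cgroup_mutex dropped */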

Defer the flush_workqueue() operation until the task returns to
userspace, using the task_work mechanism originally proposed by
Tejun[1], so that the flush happens only after cgroup_mutex has been
dropped. That way the operation stays synchronous for the caller while
no longer blocking anyone else.

[1]: https://lore.kernel.org/cgroups/ZgMFPMjZRZCsq9Q-@slm.duckdns.org/T/#m117f606fa24f66f0823a60f211b36f24bd9e1883
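
Nothing changes from userspace's point of view: task_work queued with
TWA_RESUME runs before the task returns to user mode, so a write to
cpuset.mems still does not return until cpuset_migrate_mm_wq has been
flushed. A minimal illustration (sketch only; hypothetical cgroup path):

	#include <fcntl.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("/sys/fs/cgroup/test/cpuset.mems", O_WRONLY);

		if (fd < 0)
			return 1;
		/*
		 * With this patch the flush no longer runs under
		 * cgroup_mutex, but write() still returns only after
		 * cpuset_migrate_mm_wq has been flushed, because the
		 * task_work runs on this task's way back to userspace.
		 */
		if (write(fd, "0", 1) < 0) {
			close(fd);
			return 1;
		}
		close(fd);
		return 0;
	}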

Originally-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com>
---
 kernel/cgroup/cpuset.c | 29 ++++++++++++++++++++++++-----
 1 file changed, 24 insertions(+), 5 deletions(-)

diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 3d8492581c8c4..ceb467079e41f 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -40,6 +40,7 @@
 #include <linux/sched/isolation.h>
 #include <linux/wait.h>
 #include <linux/workqueue.h>
+#include <linux/task_work.h>
 
 DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
 DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);
@@ -2582,9 +2583,24 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
 	}
 }
 
-static void cpuset_post_attach(void)
+static void flush_migrate_mm_task_workfn(struct callback_head *head)
 {
 	flush_workqueue(cpuset_migrate_mm_wq);
+	kfree(head);
+}
+
+static void schedule_flush_migrate_mm(void)
+{
+	struct callback_head *flush_cb;
+
+	flush_cb = kzalloc(sizeof(struct callback_head), GFP_KERNEL);
+	if (!flush_cb)
+		return;
+
+	init_task_work(flush_cb, flush_migrate_mm_task_workfn);
+
+	if (task_work_add(current, flush_cb, TWA_RESUME))
+		kfree(flush_cb);
 }
 
 /*
@@ -3141,6 +3157,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
 	struct cpuset *cs;
 	struct cpuset *oldcs = cpuset_attach_old_cs;
 	bool cpus_updated, mems_updated;
+	bool queue_task_work = false;
 
 	cgroup_taskset_first(tset, &css);
 	cs = css_cs(css);
@@ -3191,15 +3208,18 @@ static void cpuset_attach(struct cgroup_taskset *tset)
 			 * @old_mems_allowed is the right nodesets that we
 			 * migrate mm from.
 			 */
-			if (is_memory_migrate(cs))
+			if (is_memory_migrate(cs)) {
 				cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
 						  &cpuset_attach_nodemask_to);
-			else
+				queue_task_work = true;
+			} else
 				mmput(mm);
 		}
 	}
 
 out:
+	if (queue_task_work)
+		schedule_flush_migrate_mm();
 	cs->old_mems_allowed = cpuset_attach_nodemask_to;
 
 	if (cs->nr_migrate_dl_tasks) {
@@ -3257,7 +3277,7 @@ ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
 	mutex_unlock(&cpuset_mutex);
 	cpus_read_unlock();
 	if (of_cft(of)->private == FILE_MEMLIST)
-		flush_workqueue(cpuset_migrate_mm_wq);
+		schedule_flush_migrate_mm();
 	return retval ?: nbytes;
 }
 
@@ -3725,7 +3745,6 @@ struct cgroup_subsys cpuset_cgrp_subsys = {
 	.can_attach	= cpuset_can_attach,
 	.cancel_attach	= cpuset_cancel_attach,
 	.attach		= cpuset_attach,
-	.post_attach	= cpuset_post_attach,
 	.bind		= cpuset_bind,
 	.can_fork	= cpuset_can_fork,
 	.cancel_fork	= cpuset_cancel_fork,
-- 
2.20.1
Re: [PATCH 2/3] cpuset: Defer flushing of the cpuset_migrate_mm_wq to task_work
Posted by Waiman Long 4 weeks ago
On 9/4/25 3:45 AM, Chuyi Zhou wrote:
> [...]

Reviewed-by: Waiman Long <longman@redhat.com>