[PATCH] cgroup/cpuset: Support multiple source/destination cpusets using pids pattern

Ridong Chen posted 1 patch 1 day ago
kernel/cgroup/cpuset-internal.h |   7 --
kernel/cgroup/cpuset.c          | 167 ++++++++++++++++----------------
2 files changed, 84 insertions(+), 90 deletions(-)
[PATCH] cgroup/cpuset: Support multiple source/destination cpusets using pids pattern
Posted by Ridong Chen 1 day ago
When the cpuset controller is enabled/disabled in a parent cgroup, tasks
from multiple child cpusets need to be migrated. The current code only
handles a single source/destination pair.

Support multiple source/destination cpusets by adopting the per-task
processing pattern similar to the pids controller:

1) Perform per-task DL bandwidth reservation (dl_bw_alloc) directly in
   cpuset_can_attach() instead of batching into sum_migrate_dl_bw. This
   eliminates the sum_migrate_dl_bw and dl_bw_cpu fields from the cpuset
   struct.

2) Track attach_in_progress per-task per-destination cpuset to properly
   guard all involved cpusets from having their cpus/mems zeroed.

3) Use a shared cpuset_undo_attach() helper for both rollback-on-error
   in cpuset_can_attach() and for cpuset_cancel_attach().

4) Detect many-source migrations and force cpus_updated/mems_updated
   to true so all tasks get properly updated during attach.

5) Defer nr_deadline_tasks updates to cpuset_attach() (after migration
   is committed) to avoid a race with dl_rebuild_rd_accounting() that
   could see inconsistent values between can_attach and attach.

Signed-off-by: Ridong Chen <ridong.chen@linux.dev>
---
 kernel/cgroup/cpuset-internal.h |   7 --
 kernel/cgroup/cpuset.c          | 167 ++++++++++++++++----------------
 2 files changed, 84 insertions(+), 90 deletions(-)

diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h
index f7aaf01f7cd5..8f32cb97eb94 100644
--- a/kernel/cgroup/cpuset-internal.h
+++ b/kernel/cgroup/cpuset-internal.h
@@ -167,13 +167,6 @@ struct cpuset {
 	 */
 	int nr_deadline_tasks;
 	int nr_migrate_dl_tasks;
-	/* DL bandwidth that needs destination reservation for this attach. */
-	u64 sum_migrate_dl_bw;
-	/*
-	 * CPU used for temporary DL bandwidth allocation during attach;
-	 * -1 if no DL bandwidth was allocated in the current attach.
-	 */
-	int dl_bw_cpu;
 
 	/* Invalid partition error code, not lock protected */
 	enum prs_errcode prs_err;
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index e52a5a40d607..a6d96a39cdb1 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -288,7 +288,6 @@ struct cpuset top_cpuset = {
 	.flags = BIT(CS_CPU_EXCLUSIVE) |
 		 BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE),
 	.partition_root_state = PRS_ROOT,
-	.dl_bw_cpu = -1,
 };
 
 /**
@@ -580,8 +579,6 @@ static struct cpuset *dup_or_alloc_cpuset(struct cpuset *cs)
 	if (!trial)
 		return NULL;
 
-	trial->dl_bw_cpu = -1;
-
 	/* Setup cpumask pointer array */
 	cpumask_var_t *pmask[4] = {
 		&trial->cpus_allowed,
@@ -3026,31 +3023,36 @@ static int cpuset_can_attach_check(struct cpuset *cs, struct cpuset *oldcs,
 	return 0;
 }
 
-static int cpuset_reserve_dl_bw(struct cpuset *cs)
+/*
+ * Undo DL bandwidth reservations and attach_in_progress increments done
+ * in cpuset_can_attach(). Used for both rollback on error and cancel_attach.
+ * If @stop_at is non-NULL, undo only for tasks before @stop_at in the tset.
+ */
+static void cpuset_undo_attach(struct cgroup_taskset *tset,
+			       struct task_struct *stop_at)
 {
-	int cpu, ret;
-
-	if (!cs->sum_migrate_dl_bw)
-		return 0;
+	struct cgroup_subsys_state *css;
+	struct task_struct *task;
 
-	cpu = cpumask_any_and(cpu_active_mask, cs->effective_cpus);
-	if (unlikely(cpu >= nr_cpu_ids))
-		return -EINVAL;
+	cgroup_taskset_for_each(task, css, tset) {
+		struct cpuset *cs = css_cs(css);
 
-	ret = dl_bw_alloc(cpu, cs->sum_migrate_dl_bw);
-	if (ret)
-		return ret;
+		if (task == stop_at)
+			break;
 
-	cs->dl_bw_cpu = cpu;
-	return 0;
+		if (dl_task(task)) {
+			cs->nr_migrate_dl_tasks--;
+			if (dl_task_needs_bw_move(task, cs->effective_cpus)) {
+				int cpu = cpumask_any_and(cpu_active_mask,
+							 cs->effective_cpus);
+				dl_bw_free(cpu, task->dl.dl_bw);
+			}
+		}
+		dec_attach_in_progress_locked(cs);
+	}
 }
 
-static void reset_migrate_dl_data(struct cpuset *cs)
-{
-	cs->nr_migrate_dl_tasks = 0;
-	cs->sum_migrate_dl_bw = 0;
-	cs->dl_bw_cpu = -1;
-}
+static bool attach_many_sources;
 
 /* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
 static int cpuset_can_attach(struct cgroup_taskset *tset)
@@ -3067,90 +3069,73 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
 	cs = css_cs(css);
 
 	mutex_lock(&cpuset_mutex);
+	attach_many_sources = false;
 
 	/* Check to see if task is allowed in the cpuset */
 	ret = cpuset_can_attach_check(cs, oldcs, &setsched_check);
 	if (ret)
 		goto out_unlock;
 
-	/*
-	 * The cpuset_attach_old_cs is used mainly by cpuset_migrate_mm() to get
-	 * the old_mems_allowed value. There are two ways that many-to-one
-	 * cpuset migration can happen:
-	 * 1) A multithread application with threads in different cpusets is
-	 *    wholely migrated to a new cpuset.
-	 * 2) Disabling v2 cpuset controller will move all the tasks in child
-	 *    cpusets to the parent cpuset.
-	 *
-	 * In the former case, it is the mm setting of the group leader that
-	 * really matters. So cpuset_attach_old_cs should track the oldcs of the
-	 * group leader. It falls back to the oldcs of the first task if there
-	 * is no group leader in the taskset. In the latter case, effective_mems
-	 * of child cpusets must always be a subset of the parent. So no real
-	 * page migration will be necessary no matter which child cpuset is
-	 * selected as cpuset_attach_old_cs.
-	 */
 	cgroup_taskset_for_each(task, css, tset) {
+		struct cpuset *newcs = css_cs(css);
+		struct cpuset *new_oldcs = task_cs(task);
+
+		if (newcs != cs || new_oldcs != oldcs) {
+			if (new_oldcs != oldcs)
+				attach_many_sources = true;
+			cs = newcs;
+			oldcs = new_oldcs;
+			ret = cpuset_can_attach_check(cs, oldcs,
+						      &setsched_check);
+			if (ret)
+				goto out_rollback;
+		}
+
 		ret = task_can_attach(task);
 		if (ret)
-			goto out_unlock;
+			goto out_rollback;
 
-		/* Update cpuset_attach_old_cs to the latest group leader */
 		if (task == task->group_leader)
 			cpuset_attach_old_cs = task_cs(task);
 
 		if (setsched_check) {
 			ret = security_task_setscheduler(task);
 			if (ret)
-				goto out_unlock;
+				goto out_rollback;
 		}
 
 		if (dl_task(task)) {
-			/*
-			 * Count all migrating DL tasks for cpuset task accounting.
-			 * Only tasks that need a root-domain bandwidth move
-			 * contribute to sum_migrate_dl_bw.
-			 */
 			cs->nr_migrate_dl_tasks++;
-			if (dl_task_needs_bw_move(task, cs->effective_cpus))
-				cs->sum_migrate_dl_bw += task->dl.dl_bw;
+			if (dl_task_needs_bw_move(task, cs->effective_cpus)) {
+				int cpu = cpumask_any_and(cpu_active_mask,
+							 cs->effective_cpus);
+				if (unlikely(cpu >= nr_cpu_ids)) {
+					ret = -EINVAL;
+					goto out_rollback;
+				}
+				ret = dl_bw_alloc(cpu, task->dl.dl_bw);
+				if (ret)
+					goto out_rollback;
+			}
 		}
-	}
-
-	ret = cpuset_reserve_dl_bw(cs);
 
-out_unlock:
-	if (ret) {
-		reset_migrate_dl_data(cs);
-	} else {
-		/*
-		 * Mark attach is in progress.  This makes validate_change() fail
-		 * changes which zero cpus/mems_allowed.
-		 */
 		cs->attach_in_progress++;
 	}
 
+	goto out_unlock;
+
+out_rollback:
+	cpuset_undo_attach(tset, task);
+
+out_unlock:
 	mutex_unlock(&cpuset_mutex);
 	return ret;
 }
 
 static void cpuset_cancel_attach(struct cgroup_taskset *tset)
 {
-	struct cgroup_subsys_state *css;
-	struct cpuset *cs;
-
-	cgroup_taskset_first(tset, &css);
-	cs = css_cs(css);
-
 	mutex_lock(&cpuset_mutex);
-	dec_attach_in_progress_locked(cs);
-
-	if (cs->dl_bw_cpu >= 0)
-		dl_bw_free(cs->dl_bw_cpu, cs->sum_migrate_dl_bw);
-
-	if (cs->nr_migrate_dl_tasks)
-		reset_migrate_dl_data(cs);
-
+	cpuset_undo_attach(tset, NULL);
 	mutex_unlock(&cpuset_mutex);
 }
 
@@ -3232,8 +3217,15 @@ static void cpuset_attach(struct cgroup_taskset *tset)
 	mutex_lock(&cpuset_mutex);
 	queue_task_work = false;
 
-	attach_cpus_updated = !cpumask_equal(cs->effective_cpus, oldcs->effective_cpus);
-	attach_mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems);
+	if (attach_many_sources) {
+		attach_cpus_updated = true;
+		attach_mems_updated = true;
+	} else {
+		attach_cpus_updated = !cpumask_equal(cs->effective_cpus,
+						    oldcs->effective_cpus);
+		attach_mems_updated = !nodes_equal(cs->effective_mems,
+						   oldcs->effective_mems);
+	}
 
 	/*
 	 * In the default hierarchy, enabling cpuset in the child cgroups
@@ -3250,20 +3242,29 @@ static void cpuset_attach(struct cgroup_taskset *tset)
 	}
 
 	cgroup_taskset_for_each(task, css, tset)
-		cpuset_attach_task(cs, task);
+		cpuset_attach_task(css_cs(css), task);
 
 out:
 	if (queue_task_work)
 		schedule_flush_migrate_mm();
 	cs->old_mems_allowed = cpuset_attach_nodemask_to;
 
-	if (cs->nr_migrate_dl_tasks) {
-		cs->nr_deadline_tasks += cs->nr_migrate_dl_tasks;
-		oldcs->nr_deadline_tasks -= cs->nr_migrate_dl_tasks;
-		reset_migrate_dl_data(cs);
-	}
+	/*
+	 * Update nr_deadline_tasks now that migration is committed.
+	 * nr_migrate_dl_tasks was accumulated per-dst in can_attach but
+	 * nr_deadline_tasks is deferred to here to avoid a race with
+	 * dl_rebuild_rd_accounting() between can_attach and attach.
+	 */
+	cgroup_taskset_for_each(task, css, tset) {
+		struct cpuset *dst_cs = css_cs(css);
 
-	dec_attach_in_progress_locked(cs);
+		if (dst_cs->nr_migrate_dl_tasks) {
+			dst_cs->nr_deadline_tasks += dst_cs->nr_migrate_dl_tasks;
+			oldcs->nr_deadline_tasks -= dst_cs->nr_migrate_dl_tasks;
+			dst_cs->nr_migrate_dl_tasks = 0;
+		}
+		dec_attach_in_progress_locked(dst_cs);
+	}
 
 	mutex_unlock(&cpuset_mutex);
 }
-- 
2.43.0