[RFC/PATCH] sched: Support moving kthreads into cpuset cgroups
Posted by Xi Wang 9 months, 1 week ago
In theory we should be able to manage kernel tasks with cpuset
cgroups just like user tasks, which would be a flexible way to limit
interference with real-time and other sensitive workloads. This is
however not supported today: when setting cpu affinity for kthreads,
kernel code uses a simpler control path that leads directly to
__set_cpus_allowed_ptr or __kthread_bind_mask, and neither honors
cpuset restrictions.

This patch adds cpuset support for kernel tasks by merging userspace
and kernel cpu affinity control paths and applying the same
restrictions to kthreads.

The PF_NO_SETAFFINITY flag is still supported for tasks that have to
run with specific cpu affinities. The kernel ensures that kthreads with
this flag have their affinities locked and stay in the root cpuset:

If userspace moves kthreadd out of the root cpuset (see example
below), a newly forked kthread will start in a non-root cgroup as well.
If PF_NO_SETAFFINITY is detected for the kthread, it moves itself
into the root cpuset before threadfn is called. This depends on the
kthread create -> kthread bind -> wake up sequence.
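
For reference, that create -> bind -> wake up sequence looks roughly like
this on the caller's side (a sketch only; my_thread_fn and
start_pinned_worker are placeholder names, not part of this patch):

#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/sched.h>

static int my_thread_fn(void *data)
{
	while (!kthread_should_stop()) {
		set_current_state(TASK_INTERRUPTIBLE);
		schedule();
	}
	return 0;
}

static struct task_struct *start_pinned_worker(unsigned int cpu)
{
	struct task_struct *t;

	t = kthread_create(my_thread_fn, NULL, "my_worker/%u", cpu);
	if (IS_ERR(t))
		return t;
	kthread_bind(t, cpu);	/* sets PF_NO_SETAFFINITY */
	wake_up_process(t);	/* kthread() then moves the task back into the
				 * root cpuset before my_thread_fn() runs */
	return t;
}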

Since kthreads are clones of kthreadd, the typical usage pattern is:

1. Create a cpuset cgroup for kernel threads.

2. Move kthreadd to that cgroup - all newly created kthreads are
   automatically enrolled in that cgroup.

3. Move all remaining unlocked (!PF_NO_SETAFFINITY) kthreads into that
   cgroup.

After these steps, all unlocked kthreads, current and future, are
managed by the cgroup.

Command line example:

mkdir /sys/fs/cgroup/kernel
echo "+cpuset" > /sys/fs/cgroup/cgroup.subtree_control
echo "+cpuset" > /sys/fs/cgroup/kernel/cgroup.subtree_control

ktd=`pgrep -x kthreadd`; echo "move kthreadd/$ktd first"; echo $ktd > /sys/fs/cgroup/kernel/cgroup.procs
kthreads=`ps -e -o pgrp= -o pid=  | sed -ne 's/^ *0 *// p'`
for p in $kthreads; do echo "moving $p (ok to fail for locked kthreads)"; echo $p > /sys/fs/cgroup/kernel/cgroup.procs; done
echo 4-7 > /sys/fs/cgroup/kernel/cpuset.cpus
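
To verify the result (illustrative only, not part of the patch):

cat /sys/fs/cgroup/kernel/cpuset.cpus.effective
for p in `cat /sys/fs/cgroup/kernel/cgroup.procs`; do taskset -cp $p; done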

Signed-off-by: Xi Wang <xii@google.com>
---
 include/linux/kthread.h | 10 ++++-
 include/linux/sched.h   | 11 +++++
 kernel/cgroup/cpuset.c  | 31 ++++++++++++--
 kernel/kthread.c        | 89 +++++++++++++++++++++++++++++++++++---
 kernel/sched/core.c     | 95 ++++++++++++++++++++++++++++++++++++++---
 kernel/sched/sched.h    |  6 ---
 kernel/sched/syscalls.c | 63 +--------------------------
 kernel/workqueue.c      |  7 ++-
 8 files changed, 226 insertions(+), 86 deletions(-)

diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index 8d27403888ce..36215a30d7f7 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -13,6 +13,14 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
 					   int node,
 					   const char namefmt[], ...);
 
+__printf(4, 5)
+struct task_struct *kthread_create_on_node_root_cpuset(
+					   int (*threadfn)(void *data),
+					   void *data,
+					   int node,
+					   const char namefmt[], ...);
+
+
 /**
  * kthread_create - create a kthread on the current node
  * @threadfn: the function to run in the thread
@@ -27,7 +35,6 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
 #define kthread_create(threadfn, data, namefmt, arg...) \
 	kthread_create_on_node(threadfn, data, NUMA_NO_NODE, namefmt, ##arg)
 
-
 struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
 					  void *data,
 					  unsigned int cpu,
@@ -85,6 +92,7 @@ kthread_run_on_cpu(int (*threadfn)(void *data), void *data,
 void free_kthread_struct(struct task_struct *k);
 void kthread_bind(struct task_struct *k, unsigned int cpu);
 void kthread_bind_mask(struct task_struct *k, const struct cpumask *mask);
+void kthread_bind_mask_cpuset(struct task_struct *k, const struct cpumask *mask);
 int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask);
 int kthread_stop(struct task_struct *k);
 int kthread_stop_put(struct task_struct *k);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0782de6b20d5..45b912e21239 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1855,6 +1855,13 @@ extern int cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpu
 extern int task_can_attach(struct task_struct *p);
 extern int dl_bw_alloc(int cpu, u64 dl_bw);
 extern void dl_bw_free(int cpu, u64 dl_bw);
+
+#define SCA_CHECK		0x01
+#define SCA_MIGRATE_DISABLE	0x02
+#define SCA_MIGRATE_ENABLE	0x04
+#define SCA_USER		0x08
+#define SCA_NO_CPUSET	0x10
+
 #ifdef CONFIG_SMP
 
 /* do_set_cpus_allowed() - consider using set_cpus_allowed_ptr() instead */
@@ -1868,6 +1875,9 @@ extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new
  * Return: zero if successful, or a negative error code
  */
 extern int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask);
+extern int set_cpus_allowed_ptr_no_cpuset(struct task_struct *p, const struct cpumask *new_mask);
+extern int set_cpus_allowed_ptr_flags(
+	struct task_struct *p, const struct cpumask *new_mask, u32 flags);
 extern int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node);
 extern void release_user_cpus_ptr(struct task_struct *p);
 extern int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask);
@@ -1884,6 +1894,7 @@ static inline int set_cpus_allowed_ptr(struct task_struct *p, const struct cpuma
 		return -EINVAL;
 	return 0;
 }
+
 static inline int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node)
 {
 	if (src->user_cpus_ptr)
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index d0143b3dce47..ef929b349da8 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -1128,6 +1128,13 @@ void cpuset_update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus)
 	while ((task = css_task_iter_next(&it))) {
 		const struct cpumask *possible_mask = task_cpu_possible_mask(task);
 
+		/*
+		 * See also cpuset_can_attach. A thread with the flag could temporarily
+		 * reside in a non-root cpuset. Don't change its affinity.
+		 */
+		if (task->flags & PF_NO_SETAFFINITY)
+			continue;
+
 		if (top_cs) {
 			/*
 			 * Percpu kthreads in top_cpuset are ignored
@@ -3034,7 +3041,14 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
 	mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems);
 
 	cgroup_taskset_for_each(task, css, tset) {
-		ret = task_can_attach(task);
+		/*
+		 * With the kthreads in cpuset feature, kthreadd can be moved to a
+		 * non root cpuset. We want to allow a PF_NO_SETAFFINITY task to be
+		 * spawned and then moved to root, which needs to be allowed here.
+		 */
+		ret = !(cs == &top_cpuset && task->flags & PF_NO_SETAFFINITY);
+		/* Check regular threads */
+		ret = ret && task_can_attach(task);
 		if (ret)
 			goto out_unlock;
 
@@ -3127,7 +3141,7 @@ static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task)
 	 * can_attach beforehand should guarantee that this doesn't
 	 * fail.  TODO: have a better way to handle failure here
 	 */
-	WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
+	WARN_ON_ONCE(set_cpus_allowed_ptr_flags(task, cpus_attach, SCA_NO_CPUSET));
 
 	cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
 	cpuset1_update_task_spread_flags(cs, task);
@@ -3164,8 +3178,19 @@ static void cpuset_attach(struct cgroup_taskset *tset)
 
 	guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
 
-	cgroup_taskset_for_each(task, css, tset)
+	cgroup_taskset_for_each(task, css, tset) {
+		/*
+		 * See cpuset_can_attach.
+		 * With the kthreads in cpuset feature, kthreadd can be moved to a
+		 * non root cpuset. We want to allow a PF_NO_SETAFFINITY task to be
+		 * spawned and then moved to root as it starts running. Don't reset the
+		 * cpu affinity in this case because the thread could have already been
+		 * pinned to a cpu with kthread_bind and we want to preserve that.
+		 */
+		if (task->flags & PF_NO_SETAFFINITY)
+			continue;
 		cpuset_attach_task(cs, task);
+	}
 
 	/*
 	 * Change mm for all threadgroup leaders. This is expensive and may
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 77c44924cf54..2689eb67846e 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -45,6 +45,7 @@ struct kthread_create_info
 	int (*threadfn)(void *data);
 	void *data;
 	int node;
+	bool move_to_root;
 
 	/* Result passed back to kthread_create() from kthreadd. */
 	struct task_struct *result;
@@ -409,6 +410,9 @@ static void kthread_affine_node(void)
 	}
 }
 
+int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
+		       bool threadgroup);
+
 static int kthread(void *_create)
 {
 	static const struct sched_param param = { .sched_priority = 0 };
@@ -418,6 +422,7 @@ static int kthread(void *_create)
 	void *data = create->data;
 	struct completion *done;
 	struct kthread *self;
+	bool move_to_root = create->move_to_root;
 	int ret;
 
 	self = to_kthread(current);
@@ -454,6 +459,42 @@ static int kthread(void *_create)
 
 	self->started = 1;
 
+#ifdef CONFIG_CPUSETS
+	/*
+	 * With the kthreads in cgroup feature, kthreadd can be optionally put
+	 * into a non root cpuset (such that newly created kernel threads are
+	 * automatically restricted). Certain kernel threads that must be in
+	 * the root cpuset are moved to root here.
+	 *
+	 * This code is called after the schedule() above, thus kthread_bind
+	 * or kthread_bind_mask should have already been called if present.
+	 * PF_NO_SETAFFINITY set by these functions implicitly triggers the
+	 * move to root action. It can also be explicitly triggered with the
+	 * move_to_root flag.
+	 *
+	 * Potential races between the conditional and cgroup mutex lock:
+	 *
+	 * current can be out of root then moved into root before mutex lock,
+	 * which is ok because cgroup_attach_task should be able to handle
+	 * src == dst. There are checks in cgroup_migrate_prepare_dst etc.
+	 *
+	 * current can be in root then moved out of root before mutex lock,
+	 * which is also ok: For threads with PF_NO_SETAFFINITY the move is
+	 * disallowed so we can't have this race. For other threads, we allow
+	 * users to move them out of the root cgroup and there is no guarantee
+	 * on the order of actions.
+	 */
+	if ((current->flags & PF_NO_SETAFFINITY || move_to_root) &&
+	  !task_css_is_root(current, cpuset_cgrp_id)) {
+		mutex_lock(&cgroup_mutex);
+		percpu_down_write(&cgroup_threadgroup_rwsem);
+		if (cgroup_attach_task(&cpuset_cgrp_subsys.root->cgrp, current, false))
+			WARN_ONCE(1, "Cannot move newly created kernel thread to root cpuset");
+		percpu_up_write(&cgroup_threadgroup_rwsem);
+		mutex_unlock(&cgroup_mutex);
+	}
+#endif
+
 	if (!(current->flags & PF_NO_SETAFFINITY) && !self->preferred_affinity)
 		kthread_affine_node();
 
@@ -504,7 +545,8 @@ static __printf(4, 0)
 struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
 						    void *data, int node,
 						    const char namefmt[],
-						    va_list args)
+						    va_list args,
+						    bool move_to_root)
 {
 	DECLARE_COMPLETION_ONSTACK(done);
 	struct task_struct *task;
@@ -516,6 +558,7 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
 	create->threadfn = threadfn;
 	create->data = data;
 	create->node = node;
+	create->move_to_root = move_to_root;
 	create->done = &done;
 	create->full_name = kvasprintf(GFP_KERNEL, namefmt, args);
 	if (!create->full_name) {
@@ -585,14 +628,40 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
 	va_list args;
 
 	va_start(args, namefmt);
-	task = __kthread_create_on_node(threadfn, data, node, namefmt, args);
+	task = __kthread_create_on_node(threadfn, data, node, namefmt, args, false);
 	va_end(args);
 
 	return task;
 }
 EXPORT_SYMBOL(kthread_create_on_node);
 
-static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, unsigned int state)
+/*
+ * Move the newly created kthread to the root cpuset if it is not already
+ * there. This happens if kthreadd has been moved out of the root cpuset by
+ * the user. Otherwise this is the same as the regular version.
+ */
+struct task_struct *kthread_create_on_node_root_cpuset(
+					   int (*threadfn)(void *data),
+					   void *data, int node,
+					   const char namefmt[],
+					   ...)
+
+{
+	struct task_struct *task;
+	va_list args;
+
+	va_start(args, namefmt);
+	task = __kthread_create_on_node(threadfn, data, node, namefmt, args, true);
+	va_end(args);
+
+	return task;
+}
+EXPORT_SYMBOL(kthread_create_on_node_root_cpuset);
+
+
+static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask,
+  unsigned int state, bool no_setaffinity)
+
 {
 	unsigned long flags;
 
@@ -604,22 +673,28 @@ static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mas
 	/* It's safe because the task is inactive. */
 	raw_spin_lock_irqsave(&p->pi_lock, flags);
 	do_set_cpus_allowed(p, mask);
-	p->flags |= PF_NO_SETAFFINITY;
+	if (no_setaffinity)
+		p->flags |= PF_NO_SETAFFINITY;
 	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 }
 
 static void __kthread_bind(struct task_struct *p, unsigned int cpu, unsigned int state)
 {
-	__kthread_bind_mask(p, cpumask_of(cpu), state);
+	__kthread_bind_mask(p, cpumask_of(cpu), state, true);
 }
 
 void kthread_bind_mask(struct task_struct *p, const struct cpumask *mask)
 {
 	struct kthread *kthread = to_kthread(p);
-	__kthread_bind_mask(p, mask, TASK_UNINTERRUPTIBLE);
+	__kthread_bind_mask(p, mask, TASK_UNINTERRUPTIBLE, true);
 	WARN_ON_ONCE(kthread->started);
 }
 
+void kthread_bind_mask_cpuset(struct task_struct *p, const struct cpumask *mask)
+{
+	set_cpus_allowed_ptr(p, mask);
+}
+
 /**
  * kthread_bind - bind a just-created kthread to a cpu.
  * @p: thread created by kthread_create().
@@ -1044,7 +1119,7 @@ __kthread_create_worker_on_node(unsigned int flags, int node,
 	kthread_init_worker(worker);
 
 	task = __kthread_create_on_node(kthread_worker_fn, worker,
-					node, namefmt, args);
+					node, namefmt, args, true);
 	if (IS_ERR(task))
 		goto fail_task;
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 54e7d63f7785..b604a8451ba3 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2393,7 +2393,7 @@ void migrate_enable(void)
 	struct task_struct *p = current;
 	struct affinity_context ac = {
 		.new_mask  = &p->cpus_mask,
-		.flags     = SCA_MIGRATE_ENABLE,
+		.flags     = SCA_MIGRATE_ENABLE | SCA_NO_CPUSET,
 	};
 
 #ifdef CONFIG_DEBUG_PREEMPT
@@ -3153,7 +3153,7 @@ static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
  * task must not exit() & deallocate itself prematurely. The
  * call is not atomic; no spinlocks may be held.
  */
-int __set_cpus_allowed_ptr(struct task_struct *p, struct affinity_context *ctx)
+static int do_set_cpus_allowed_ptr(struct task_struct *p, struct affinity_context *ctx)
 {
 	struct rq_flags rf;
 	struct rq *rq;
@@ -3171,6 +3171,79 @@ int __set_cpus_allowed_ptr(struct task_struct *p, struct affinity_context *ctx)
 	return __set_cpus_allowed_ptr_locked(p, ctx, rq, &rf);
 }
 
+int __set_cpus_allowed_ptr(struct task_struct *p,
+				  struct affinity_context *ctx)
+{
+	int retval;
+	cpumask_var_t cpus_allowed, new_mask;
+
+	/*
+	 * Don't restrict the thread to cpuset if explicitly specified or if locked.
+	 */
+	if ((ctx->flags & SCA_NO_CPUSET) || (p->flags & PF_NO_SETAFFINITY))
+		return do_set_cpus_allowed_ptr(p, ctx);
+
+	if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
+		WARN_ONCE(!(ctx->flags & SCA_USER),
+		  "Unable to restrict kernel thread to cpuset due to low memory");
+		return -ENOMEM;
+	}
+
+	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
+		WARN_ONCE(!(ctx->flags & SCA_USER),
+		  "Unable to restrict kernel thread to cpuset due to low memory");
+		retval = -ENOMEM;
+		goto out_free_cpus_allowed;
+	}
+
+	cpuset_cpus_allowed(p, cpus_allowed);
+	cpumask_and(new_mask, ctx->new_mask, cpus_allowed);
+
+	ctx->new_mask = new_mask;
+	ctx->flags |= SCA_CHECK;
+
+	retval = dl_task_check_affinity(p, new_mask);
+	if (retval)
+		goto out_free_new_mask;
+
+	retval = do_set_cpus_allowed_ptr(p, ctx);
+	if (retval)
+		goto out_free_new_mask;
+
+	cpuset_cpus_allowed(p, cpus_allowed);
+	if (!cpumask_subset(new_mask, cpus_allowed)) {
+		/*
+		 * We must have raced with a concurrent cpuset update.
+		 * Just reset the cpumask to the cpuset's cpus_allowed.
+		 */
+		cpumask_copy(new_mask, cpus_allowed);
+
+		/*
+		 * If SCA_USER is set, a 2nd call to __set_cpus_allowed_ptr()
+		 * will restore the previous user_cpus_ptr value.
+		 *
+		 * In the unlikely event a previous user_cpus_ptr exists,
+		 * we need to further restrict the mask to what is allowed
+		 * by that old user_cpus_ptr.
+		 */
+		if (unlikely((ctx->flags & SCA_USER) && ctx->user_mask)) {
+			bool empty = !cpumask_and(new_mask, new_mask,
+						  ctx->user_mask);
+
+			if (empty)
+				cpumask_copy(new_mask, cpus_allowed);
+		}
+		__set_cpus_allowed_ptr(p, ctx);
+		retval = -EINVAL;
+	}
+
+out_free_new_mask:
+	free_cpumask_var(new_mask);
+out_free_cpus_allowed:
+	free_cpumask_var(cpus_allowed);
+	return retval;
+}
+
 int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
 {
 	struct affinity_context ac = {
@@ -3182,6 +3255,17 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
 }
 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
 
+int set_cpus_allowed_ptr_flags(struct task_struct *p, const struct cpumask *new_mask, u32 flags)
+{
+	struct affinity_context ac = {
+		.new_mask  = new_mask,
+		.flags     = flags,
+	};
+
+	return __set_cpus_allowed_ptr(p, &ac);
+}
+EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr_flags);
+
 /*
  * Change a given task's CPU affinity to the intersection of its current
  * affinity mask and @subset_mask, writing the resulting mask to @new_mask.
@@ -3283,15 +3367,15 @@ void relax_compatible_cpus_allowed_ptr(struct task_struct *p)
 {
 	struct affinity_context ac = {
 		.new_mask  = task_user_cpus(p),
-		.flags     = 0,
+		.flags     = SCA_NO_CPUSET,
 	};
 	int ret;
 
 	/*
-	 * Try to restore the old affinity mask with __sched_setaffinity().
+	 * Try to restore the old affinity mask with __set_cpus_allowed_ptr().
 	 * Cpuset masking will be done there too.
 	 */
-	ret = __sched_setaffinity(p, &ac);
+	ret = __set_cpus_allowed_ptr(p, &ac);
 	WARN_ON_ONCE(ret);
 }
 
@@ -7292,6 +7376,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
 }
 #endif
 
+
 #if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC)
 int __sched __cond_resched(void)
 {
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 91bea8d0a90b..9833432c9a75 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2576,11 +2576,6 @@ static inline bool sched_fair_runnable(struct rq *rq)
 extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
 extern struct task_struct *pick_task_idle(struct rq *rq);
 
-#define SCA_CHECK		0x01
-#define SCA_MIGRATE_DISABLE	0x02
-#define SCA_MIGRATE_ENABLE	0x04
-#define SCA_USER		0x08
-
 #ifdef CONFIG_SMP
 
 extern void update_group_capacity(struct sched_domain *sd, int cpu);
@@ -3939,7 +3934,6 @@ static inline int rt_effective_prio(struct task_struct *p, int prio)
 #endif /* !CONFIG_RT_MUTEXES */
 
 extern int __sched_setscheduler(struct task_struct *p, const struct sched_attr *attr, bool user, bool pi);
-extern int __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx);
 extern const struct sched_class *__setscheduler_class(int policy, int prio);
 extern void set_load_weight(struct task_struct *p, bool update_load);
 extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags);
diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
index 547c1f05b667..6528153c1297 100644
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -1151,67 +1151,6 @@ int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
 }
 #endif /* CONFIG_SMP */
 
-int __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx)
-{
-	int retval;
-	cpumask_var_t cpus_allowed, new_mask;
-
-	if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL))
-		return -ENOMEM;
-
-	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
-		retval = -ENOMEM;
-		goto out_free_cpus_allowed;
-	}
-
-	cpuset_cpus_allowed(p, cpus_allowed);
-	cpumask_and(new_mask, ctx->new_mask, cpus_allowed);
-
-	ctx->new_mask = new_mask;
-	ctx->flags |= SCA_CHECK;
-
-	retval = dl_task_check_affinity(p, new_mask);
-	if (retval)
-		goto out_free_new_mask;
-
-	retval = __set_cpus_allowed_ptr(p, ctx);
-	if (retval)
-		goto out_free_new_mask;
-
-	cpuset_cpus_allowed(p, cpus_allowed);
-	if (!cpumask_subset(new_mask, cpus_allowed)) {
-		/*
-		 * We must have raced with a concurrent cpuset update.
-		 * Just reset the cpumask to the cpuset's cpus_allowed.
-		 */
-		cpumask_copy(new_mask, cpus_allowed);
-
-		/*
-		 * If SCA_USER is set, a 2nd call to __set_cpus_allowed_ptr()
-		 * will restore the previous user_cpus_ptr value.
-		 *
-		 * In the unlikely event a previous user_cpus_ptr exists,
-		 * we need to further restrict the mask to what is allowed
-		 * by that old user_cpus_ptr.
-		 */
-		if (unlikely((ctx->flags & SCA_USER) && ctx->user_mask)) {
-			bool empty = !cpumask_and(new_mask, new_mask,
-						  ctx->user_mask);
-
-			if (empty)
-				cpumask_copy(new_mask, cpus_allowed);
-		}
-		__set_cpus_allowed_ptr(p, ctx);
-		retval = -EINVAL;
-	}
-
-out_free_new_mask:
-	free_cpumask_var(new_mask);
-out_free_cpus_allowed:
-	free_cpumask_var(cpus_allowed);
-	return retval;
-}
-
 long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
 {
 	struct affinity_context ac;
@@ -1252,7 +1191,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
 		.flags     = SCA_USER,
 	};
 
-	retval = __sched_setaffinity(p, &ac);
+	retval = __set_cpus_allowed_ptr(p, &ac);
 	kfree(ac.user_mask);
 
 	return retval;
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index f9ef467020cf..d51c0716674e 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2813,7 +2813,10 @@ static struct worker *create_worker(struct worker_pool *pool)
 		}
 
 		set_user_nice(worker->task, pool->attrs->nice);
-		kthread_bind_mask(worker->task, pool_allowed_cpus(pool));
+		if (!pool || (!worker->rescue_wq && pool->cpu >= 0))
+			kthread_bind_mask(worker->task, pool_allowed_cpus(pool));
+		else
+			kthread_bind_mask_cpuset(worker->task, pool_allowed_cpus(pool));
 	}
 
 	/* successful, attach the worker to the pool */
@@ -5587,7 +5590,7 @@ static int init_rescuer(struct workqueue_struct *wq)
 	if (wq->flags & WQ_UNBOUND)
 		kthread_bind_mask(rescuer->task, unbound_effective_cpumask(wq));
 	else
-		kthread_bind_mask(rescuer->task, cpu_possible_mask);
+		kthread_bind_mask_cpuset(rescuer->task, cpu_possible_mask);
 	wake_up_process(rescuer->task);
 
 	return 0;
Re: [RFC/PATCH] sched: Support moving kthreads into cpuset cgroups
Posted by Michal Koutný 9 months ago
Hello.

On Tue, May 06, 2025 at 11:35:32AM -0700, Xi Wang <xii@google.com> wrote:
> In theory we should be able to manage kernel tasks with cpuset
> cgroups just like user tasks, would be a flexible way to limit
> interferences to real-time and other sensitive workloads.

I can see that this might be good for PF_USER_WORKER types of kernel
tasks. However, generic kernel tasks are spawned by the kernel, which
knows/demands where they should run, and therefore they should not be
subject to cpuset restrictions. When limiting interference is the
concern, there's CPU isolation for that.

The migratable kthreadd seems too coarse-grained an approach to me (also
when compared with CPU isolation).
I'd mostly echo Tejun's comment [1].

Regards,
Michal

[1] https://lore.kernel.org/r/aBqmmtST-_9oM9rF@slm.duckdns.org/
Re: [RFC/PATCH] sched: Support moving kthreads into cpuset cgroups
Posted by Xi Wang 9 months ago
On Mon, May 12, 2025 at 3:36 AM Michal Koutný <mkoutny@suse.com> wrote:
>
> Hello.
>
> On Tue, May 06, 2025 at 11:35:32AM -0700, Xi Wang <xii@google.com> wrote:
> > In theory we should be able to manage kernel tasks with cpuset
> > cgroups just like user tasks, would be a flexible way to limit
> > interferences to real-time and other sensitive workloads.
>
> I can see that this might be good for PF_USER_WORKER type of kernel
> tasks. However, generic kernel tasks are spawned by kernel who
> knows/demands which should run where and therefore they should not be
> subject to cpuset restrictions. When limiting interference is
> considered, there's CPU isolation for that.
>
> The migratable kthreadd seems too coarse grained approach to me (also
> when compared with CPU isolation).
> I'd mostly echo Tejun's comment [1].
>
> Regards,
> Michal
>
> [1] https://lore.kernel.org/r/aBqmmtST-_9oM9rF@slm.duckdns.org/

The kernel doesn't actually have the best information because it doesn't know
which user threads are more important. Giving the root user more power for
fine-tuning is a common practice.

The most important reason for moving kthreadd is to prevent a kthread from
interfering with important user threads right after forking. It can still be
moved to other cgroups later. Moving kthreadd allows better control rather than
worse control.

CPU isolation is more narrowly focused compared to generic cpuset based control.

-Xi
Re: [RFC/PATCH] sched: Support moving kthreads into cpuset cgroups
Posted by Tejun Heo 9 months ago
Hello,

On Tue, May 06, 2025 at 11:35:32AM -0700, Xi Wang wrote:
> In theory we should be able to manage kernel tasks with cpuset
> cgroups just like user tasks, would be a flexible way to limit
> interferences to real-time and other sensitive workloads. This is
> however not supported today: When setting cpu affinity for kthreads,
> kernel code uses a simpler control path that directly lead to
> __set_cpus_allowed_ptr or __ktread_bind_mask. Neither honors cpuset
> restrictions.
> 
> This patch adds cpuset support for kernel tasks by merging userspace
> and kernel cpu affinity control paths and applying the same
> restrictions to kthreads.
> 
> The PF_NO_SETAFFINITY flag is still supported for tasks that have to
> run with certain cpu affinities. Kernel ensures kthreads with this
> flag have their affinities locked and they stay in the root cpuset:
> 
> If userspace moves kthreadd out of the root cpuset (see example
> below), a newly forked kthread will be in a non root cgroup as well.
> If PF_NO_SETAFFINITY is detected for the kthread, it will move itself
> into the root cpuset before the threadfn is called. This does depend
> on the kthread create -> kthread bind -> wake up sequence.

Can you describe the use cases in detail? This is not in line with the
overall direction. e.g. We're making cpuset work with housekeeping mechanism
and tell workqueue which CPUs can be used for unbound execution and kthreads
which are closely tied to userspace activities are spawned into the same
cgroups as the user thread and subject to usual resource control.

There are a lot of risks in subjecting arbitrary kthreads to all cgroup
resource controls and just allowing cpuset doesn't seem like a great idea.
Integration through housekeeping makes a lot more sense to me. Note that
even for just cpuset thread level control doesn't really work that well. All
kthreads are forked by kthreadd. If you move the kthreadd into a cgroup, all
kthreads including kworkers for all workqueues will be spawned there. The
granularity of control isn't much better than going through housekeeping.

Thanks.

-- 
tejun
Re: [RFC/PATCH] sched: Support moving kthreads into cpuset cgroups
Posted by Xi Wang 9 months ago
On Tue, May 6, 2025 at 5:17 PM Tejun Heo <tj@kernel.org> wrote:
>
> Hello,
>
> On Tue, May 06, 2025 at 11:35:32AM -0700, Xi Wang wrote:
> > In theory we should be able to manage kernel tasks with cpuset
> > cgroups just like user tasks, would be a flexible way to limit
> > interferences to real-time and other sensitive workloads. This is
> > however not supported today: When setting cpu affinity for kthreads,
> > kernel code uses a simpler control path that directly lead to
> > __set_cpus_allowed_ptr or __ktread_bind_mask. Neither honors cpuset
> > restrictions.
> >
> > This patch adds cpuset support for kernel tasks by merging userspace
> > and kernel cpu affinity control paths and applying the same
> > restrictions to kthreads.
> >
> > The PF_NO_SETAFFINITY flag is still supported for tasks that have to
> > run with certain cpu affinities. Kernel ensures kthreads with this
> > flag have their affinities locked and they stay in the root cpuset:
> >
> > If userspace moves kthreadd out of the root cpuset (see example
> > below), a newly forked kthread will be in a non root cgroup as well.
> > If PF_NO_SETAFFINITY is detected for the kthread, it will move itself
> > into the root cpuset before the threadfn is called. This does depend
> > on the kthread create -> kthread bind -> wake up sequence.
>
> Can you describe the use cases in detail? This is not in line with the
> overall direction. e.g. We're making cpuset work with housekeeping mechanism
> and tell workqueue which CPUs can be used for unbound execution and kthreads
> which are closely tied to userspace activities are spawned into the same
> cgroups as the user thread and subject to usual resource control.
>
> There are a lot of risks in subjecting arbitrary kthreads to all cgroup
> resource controls and just allowing cpuset doesn't seem like a great idea.
> Integration through housekeeping makes a lot more sense to me. Note that
> even for just cpuset thread level control doesn't really work that well. All
> kthreads are forked by kthreadd. If you move the kthreadd into a cgroup, all
> kthreads includling kworkers for all workqueues will be spawned there. The
> granularity of control isn't much better than going through housekeeping.

For the use cases, there are two major requirements at the moment:

Dynamic cpu affinity based isolation: CPUs running latency sensitive threads
(vcpu threads) can change over time. We'd like to configure kernel thread
affinity at run time too. Changing cpu affinity at run time requires cpumask
calculations and thread migrations. Sharing cpuset code would be nice.

Support numa based memory daemon affinity: We'd like to restrict kernel memory
daemons but maintain their numa affinity at the same time. cgroup hierarchies
can be helpful, e.g. create kernel, kernel/node0 and kernel/node1 and move the
daemons to the right cgroup.
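
A rough sketch of that layout (paths, cpu ranges and node numbers below are
only illustrative):

mkdir -p /sys/fs/cgroup/kernel/node0 /sys/fs/cgroup/kernel/node1
echo "+cpuset" > /sys/fs/cgroup/kernel/cgroup.subtree_control
echo 0-7  > /sys/fs/cgroup/kernel/node0/cpuset.cpus
echo 0    > /sys/fs/cgroup/kernel/node0/cpuset.mems
echo 8-15 > /sys/fs/cgroup/kernel/node1/cpuset.cpus
echo 1    > /sys/fs/cgroup/kernel/node1/cpuset.mems
echo `pgrep -x kswapd0` > /sys/fs/cgroup/kernel/node0/cgroup.procs
echo `pgrep -x kswapd1` > /sys/fs/cgroup/kernel/node1/cgroup.procs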

Workqueue coverage is optional. kworker threads can use their separate
mechanisms too.

Since the goal is isolation, we'd like to restrict as many kthreads as possible,
even the ones that don't directly interact with user applications.

The kthreadd case is handled - a new kthread can be forked inside a non root
cgroup, but based on flags it can move itself to the root cgroup before threadfn
is called.

-Xi
Re: [RFC/PATCH] sched: Support moving kthreads into cpuset cgroups
Posted by Frederic Weisbecker 9 months ago
Le Tue, May 06, 2025 at 08:43:57PM -0700, Xi Wang a écrit :
> On Tue, May 6, 2025 at 5:17 PM Tejun Heo <tj@kernel.org> wrote:
> For the use cases, there are two major requirements at the moment:
> 
> Dynamic cpu affinity based isolation: CPUs running latency sensitive threads
> (vcpu threads) can change over time. We'd like to configure kernel thread
> affinity at run time too.

I would expect such latency sensitive application to run on isolated
partitions. And those already don't pull unbound kthreads.

> Changing cpu affinity at run time requires cpumask
> calculations and thread migrations. Sharing cpuset code would be nice.

There is already some (recent) affinity management in the kthread subsystem.
A list of kthreads having a preferred affinity (but !PF_NO_SETAFFINITY)
is maintained and automatically handled against hotplug events and housekeeping
state.
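
For reference, a rough sketch of using that interface (my_thread_fn,
start_numa_worker and the node number are placeholders):

	static void start_numa_worker(void)
	{
		struct task_struct *t;

		t = kthread_create(my_thread_fn, NULL, "my_kthread");
		if (IS_ERR(t))
			return;
		/* Preferred (not hard) affinity; the kthread core keeps it in
		 * sync with hotplug events and the housekeeping state. */
		kthread_affine_preferred(t, cpumask_of_node(0));
		wake_up_process(t);
	}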

> 
> Support numa based memory daemon affinity: We'd like to restrict kernel memory
> daemons but maintain their numa affinity at the same time. cgroup hierarchies
> can be helpful, e.g. create kernel, kernel/node0 and kernel/node1 and move the
> daemons to the right cgroup.

The kthread subsystem also handles node affinity. See kswapd / kcompactd. And it
takes care of that while still honouring isolated / isolcpus partitions:

      d1a89197589c ("kthread: Default affine kthread to its preferred NUMA node")

> 
> Workqueue coverage is optional. kworker threads can use their separate
> mechanisms too.
> 
> Since the goal is isolation, we'd like to restrict as many kthreads as possible,
> even the ones that don't directly interact with user applications.
> 
> The kthreadd case is handled - a new kthread can be forked inside a non root
> cgroup, but based on flags it can move itself to the root cgroup before threadfn
> is called.

kthreadd and other kthreads that don't have a preferred affinity are also
affine outside isolcpus/nohz_full. And since isolated cpuset partitions
create NULL domains, those kthreads won't run there either.

What am I missing?

Thanks.

-- 
Frederic Weisbecker
SUSE Labs
Re: [RFC/PATCH] sched: Support moving kthreads into cpuset cgroups
Posted by Xi Wang 9 months ago
On Wed, May 7, 2025 at 7:11 AM Frederic Weisbecker <frederic@kernel.org> wrote:
>
> Le Tue, May 06, 2025 at 08:43:57PM -0700, Xi Wang a écrit :
> > On Tue, May 6, 2025 at 5:17 PM Tejun Heo <tj@kernel.org> wrote:
> > For the use cases, there are two major requirements at the moment:
> >
> > Dynamic cpu affinity based isolation: CPUs running latency sensitive threads
> > (vcpu threads) can change over time. We'd like to configure kernel thread
> > affinity at run time too.
>
> I would expect such latency sensitive application to run on isolated
> partitions. And those already don't pull unbound kthreads.
>
> > Changing cpu affinity at run time requires cpumask
> > calculations and thread migrations. Sharing cpuset code would be nice.
>
> There is already some (recent) affinity management in the kthread subsystem.
> A list of kthreads having a preferred affinity (but !PF_NO_SETAFFINITY)
> is maintained and automatically handled against hotplug events and housekeeping
> state.
>
> >
> > Support numa based memory daemon affinity: We'd like to restrict kernel memory
> > daemons but maintain their numa affinity at the same time. cgroup hierarchies
> > can be helpful, e.g. create kernel, kernel/node0 and kernel/node1 and move the
> > daemons to the right cgroup.
>
> The kthread subsystem also handles node affinity. See kswapd / kcompactd. And it
> takes care of that while still honouring isolated / isolcpus partitions:
>
>       d1a89197589c ("kthread: Default affine kthread to its preferred NUMA node")
>
> >
> > Workqueue coverage is optional. kworker threads can use their separate
> > mechanisms too.
> >
> > Since the goal is isolation, we'd like to restrict as many kthreads as possible,
> > even the ones that don't directly interact with user applications.
> >
> > The kthreadd case is handled - a new kthread can be forked inside a non root
> > cgroup, but based on flags it can move itself to the root cgroup before threadfn
> > is called.
>
> kthreadd and other kthreads that don't have a preferred affinity are also
> affine outside isolcpus/nohz_full. And since isolated cpuset partitions
> create NULL domains, those kthreads won't run there either.
>
> What am I missing?

Overall I think your arguments depend on kernel and application threads being
significantly different for cpu affinity management, but there isn't enough
evidence for that. If cpuset is a bad idea for kernel threads, it's probably not
a good idea for user threads either. Maybe we should just remove cpuset from
the kernel and let application threads go with boot-time global variables and
set their own cpu affinities.

-Xi
Re: [RFC/PATCH] sched: Support moving kthreads into cpuset cgroups
Posted by Tejun Heo 9 months ago
Hello,

On Wed, May 07, 2025 at 10:23:24AM -0700, Xi Wang wrote:
> Overall I think your arguments depend on kernel and application threads are
> significantly different for cpu affinity management, but there isn't enough
> evidence for it. If cpuset is a bad idea for kernel threads it's probably not
> a good idea for user threads either. Maybe we should just remove cpuset from
> kernel and let applications threads go with boot time global variables and
> set their own cpu affinities.

I can't tell whether you're making a good faith argument. Even if you are,
you're making one bold claim without much substance and then jumping to the
other extreme based on that. This isn't a productive way to discuss these
things.

Thanks.

-- 
tejun
Re: [RFC/PATCH] sched: Support moving kthreads into cpuset cgroups
Posted by Xi Wang 9 months ago
On Wed, May 7, 2025 at 10:36 AM Tejun Heo <tj@kernel.org> wrote:
>
> Hello,
>
> On Wed, May 07, 2025 at 10:23:24AM -0700, Xi Wang wrote:
> > Overall I think your arguments depend on kernel and application threads are
> > significantly different for cpu affinity management, but there isn't enough
> > evidence for it. If cpuset is a bad idea for kernel threads it's probably not
> > a good idea for user threads either. Maybe we should just remove cpuset from
> > kernel and let applications threads go with boot time global variables and
> > set their own cpu affinities.
>
> I can't tell whether you're making a good faith argument. Even if you are,
> you're making one bold claim without much substance and then jumping to the
> other extreme based on that. This isn't a productive way to discuss these
> things.
>
> Thanks.
>
> --
> tejun

Yes, this is still a serious technical discussion. Frederic made several "we
can't have b because we already have / are working on a" statements which were
not very actionable. Reducing to a particular case is a quick way to simplify.
I'd prefer to focus more on higher-level technical tradeoffs.

Overall, compartmentalization limits resource (cpu) sharing, which limits
overcommit and thus efficiency. cpumask restrictions are not ideal but are
sometimes necessary. Dynamically configurable cpumasks are better than
statically reserved cpus.

I do think the cgroup tree structure sometimes helps and we don't have to use
it for all cases.

-Xi
Re: [RFC/PATCH] sched: Support moving kthreads into cpuset cgroups
Posted by Frederic Weisbecker 9 months ago
Le Wed, May 07, 2025 at 01:07:16PM -0700, Xi Wang a écrit :
> On Wed, May 7, 2025 at 10:36 AM Tejun Heo <tj@kernel.org> wrote:
> >
> > Hello,
> >
> > On Wed, May 07, 2025 at 10:23:24AM -0700, Xi Wang wrote:
> > > Overall I think your arguments depend on kernel and application threads are
> > > significantly different for cpu affinity management, but there isn't enough
> > > evidence for it. If cpuset is a bad idea for kernel threads it's probably not
> > > a good idea for user threads either. Maybe we should just remove cpuset from
> > > kernel and let applications threads go with boot time global variables and
> > > set their own cpu affinities.
> >
> > I can't tell whether you're making a good faith argument. Even if you are,
> > you're making one bold claim without much substance and then jumping to the
> > other extreme based on that. This isn't a productive way to discuss these
> > things.
> >
> > Thanks.
> >
> > --
> > tejun
> 
> Yes this is still serious technical discussion. Frederic made several "we can't
> have b because we already have / are working on a" statements which were not
> very actionable. Deducing to a particular case is a quick way to simplify.

I referred to a particular case (isolation) because you said this is your
usecase. You still haven't explained us why the current affinity management for
kthreads doesn't work for you.

> I'd prefer to focus more on higher level technical tradeoffs.
> 
> Overall compartmentalization limits resource (cpu) sharing which limits
> overcommit thus efficiency.
> cpumask restrictions are not ideal but sometimes
> necessary. Dynamically configurable cpumasks are better than statically
> reserved cpus.

For which usecase?

> I do think the cgroup tree structure sometimes helps and we don't have to use
> it for all cases.

Also kernel threads are special beasts; even some !PF_NO_SETAFFINITY kthreads
have actual affinity preferences. If they can go through cpusets, this must be
dealt with. And admins will need to know about those affinity preferences for
each kthread.

Also, do we want to be able to expose all the cgroup limits to kthreads? Even
if only cpuset is allowed to have kthreads, does cpuset.mems make
sense to be exposed, for example?

If your issue is ever resolved through cpusets, this will have to be maintained
forever with all those subtleties in mind.

I tend to think that CPU isolation is a very straightforward cpusets usecase:
no load balancing, NULL domains and tasks usually don't compete much for the
CPU since the point is to not be disturbed anyway.

And NULL domains already exclude kernel threads, dynamically. So please give
us a compelling reason for doing this.

Thanks.

-- 
Frederic Weisbecker
SUSE Labs
Re: [RFC/PATCH] sched: Support moving kthreads into cpuset cgroups
Posted by Xi Wang 9 months ago
On Wed, May 7, 2025 at 5:08 PM Frederic Weisbecker <frederic@kernel.org> wrote:
>
> Le Wed, May 07, 2025 at 01:07:16PM -0700, Xi Wang a écrit :
> > On Wed, May 7, 2025 at 10:36 AM Tejun Heo <tj@kernel.org> wrote:
> > >
> > > Hello,
> > >
> > > On Wed, May 07, 2025 at 10:23:24AM -0700, Xi Wang wrote:
> > > > Overall I think your arguments depend on kernel and application threads are
> > > > significantly different for cpu affinity management, but there isn't enough
> > > > evidence for it. If cpuset is a bad idea for kernel threads it's probably not
> > > > a good idea for user threads either. Maybe we should just remove cpuset from
> > > > kernel and let applications threads go with boot time global variables and
> > > > set their own cpu affinities.
> > >
> > > I can't tell whether you're making a good faith argument. Even if you are,
> > > you're making one bold claim without much substance and then jumping to the
> > > other extreme based on that. This isn't a productive way to discuss these
> > > things.
> > >
> > > Thanks.
> > >
> > > --
> > > tejun
> >
> > Yes this is still serious technical discussion. Frederic made several "we can't
> > have b because we already have / are working on a" statements which were not
> > very actionable. Deducing to a particular case is a quick way to simplify.
>
> I referred to a particular case (isolation) because you said this is your
> usecase. You still haven't explained us why the current affinity management for
> kthreads doesn't work for you.
>
> > I'd prefer to focus more on higher level technical tradeoffs.
> >
> > Overall compartmentalization limits resource (cpu) sharing which limits
> > overcommit thus efficiency.
> > cpumask restrictions are not ideal but sometimes
> > necessary. Dynamically configurable cpumasks are better than statically
> > reserved cpus.
>
> For which usecase?
>
> > I do think the cgroup tree structure sometimes helps and we don't have to use
> > it for all cases.
>
> Also kernel threads are special beasts, even some !PF_NO_SETAFFINTIY kthreads
> have actual affinity preferences. If they can go through cpusets, this must be
> dealt with. And admins will need to know about those affinity preferences for
> each kthreads.
>
> Also do we want to be able to expose all the cgroup limits to kthreads? Even
> if only cpusets is allowed to have kthreads, does cpusets.mems make
> sense to be exposed for example?
>
> If your issue is ever resolved through cpusets, this will have to be maintained
> forever with all those subtleties in mind.
>
> I tend to think that CPU isolation is a very straightforward cpusets usecase:
> no load balancing, NULL domains and tasks usually don't compete much for the
> CPU since the point is to not be disturbed anyway.
>
> And NULL domains already exclude kernel threads, dynamically. So please give
> us a compelling reason for doing this.
>
> Thanks.
>
> --
> Frederic Weisbecker
> SUSE Labs

I think our problem spaces are different. Perhaps your problems are closer to
hard real-time systems but our problems are about improving latency of existing
systems while maintaining efficiency (max supported cpu util).

For hard real-time systems we sometimes throw cores at the problem and run no
more than one thread per cpu. But if we want efficiency we have to share cpus
with scheduling policies. Disconnecting the cpu scheduler with isolcpus results
in losing too much of the machine capacity. CPU scheduling is needed for both
kernel and userspace threads.

For our use case we need to move kernel threads away from certain vcpu threads,
but other vcpu threads can share cpus with kernel threads. The ratio changes
from time to time. Permanently putting aside a few cpus results in a reduction
in machine capacity.

The PF_NO_SETAFFINITY case is already handled by the patch. These threads will
run in the root cgroup with their affinities unchanged, just like before.

The original justification for the cpuset feature is quoted below, and the
reasons are still applicable:

"The management of large computer systems, with many processors (CPUs), complex
memory cache hierarchies and multiple Memory Nodes having non-uniform access
times (NUMA) presents additional challenges for the efficient scheduling and
memory placement of processes."

"But larger systems, which benefit more from careful processor and memory
placement to reduce memory access times and contention.."

"These subsets, or “soft partitions” must be able to be dynamically adjusted, as
the job mix changes, without impacting other concurrently executing jobs."

https://docs.kernel.org/admin-guide/cgroup-v1/cpusets.html

-Xi
Re: [RFC/PATCH] sched: Support moving kthreads into cpuset cgroups
Posted by Waiman Long 9 months ago
On 5/8/25 1:51 PM, Xi Wang wrote:
> I think our problem spaces are different. Perhaps your problems are closer to
> hard real-time systems but our problems are about improving latency of existing
> systems while maintaining efficiency (max supported cpu util).
>
> For hard real-time systems we sometimes throw cores at the problem and run no
> more than one thread per cpu. But if we want efficiency we have to share cpus
> with scheduling policies. Disconnecting the cpu scheduler with isolcpus results
> in losing too much of the machine capacity. CPU scheduling is needed for both
> kernel and userspace threads.
>
> For our use case we need to move kernel threads away from certain vcpu threads,
> but other vcpu threads can share cpus with kernel threads. The ratio changes
> from time to time. Permanently putting aside a few cpus results in a reduction
> in machine capacity.
>
> The PF_NO_SETAFFINTIY case is already handled by the patch. These threads will
> run in root cgroup with affinities just like before.
>
> The original justifications for the cpuset feature is here and the reasons are
> still applicable:
>
> "The management of large computer systems, with many processors (CPUs), complex
> memory cache hierarchies and multiple Memory Nodes having non-uniform access
> times (NUMA) presents additional challenges for the efficient scheduling and
> memory placement of processes."
>
> "But larger systems, which benefit more from careful processor and memory
> placement to reduce memory access times and contention.."
>
> "These subsets, or “soft partitions” must be able to be dynamically adjusted, as
> the job mix changes, without impacting other concurrently executing jobs."
>
> https://docs.kernel.org/admin-guide/cgroup-v1/cpusets.html
>
> -Xi
>
If you create a cpuset root partition, we are pushing some kthreads
away from the CPUs dedicated to the newly created partition, which has its
own scheduling domain separate from the cgroup root. I do realize that
the current way of excluding only per-cpu kthreads isn't quite right, so
I sent out a new patch to extend this to all the PF_NO_SETAFFINITY kthreads.

So instead of putting kthreads into the dedicated cpuset, we keep them in
the root cgroup, and we can create a separate cpuset partition to run the
workload without interference from the background kthreads. Will that
functionality suit your current need?
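
For reference, such a partition is set up roughly like this (the path, cpu
range and $WORKLOAD_PID are placeholders; cpuset must be enabled in the
root's cgroup.subtree_control):

mkdir /sys/fs/cgroup/workload
echo 4-7 > /sys/fs/cgroup/workload/cpuset.cpus
echo root > /sys/fs/cgroup/workload/cpuset.cpus.partition
echo $WORKLOAD_PID > /sys/fs/cgroup/workload/cgroup.procs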

Cheers,
Longman

Re: [RFC/PATCH] sched: Support moving kthreads into cpuset cgroups
Posted by Xi Wang 9 months ago
On Thu, May 8, 2025 at 12:35 PM Waiman Long <llong@redhat.com> wrote:
>
> On 5/8/25 1:51 PM, Xi Wang wrote:
> > I think our problem spaces are different. Perhaps your problems are closer to
> > hard real-time systems but our problems are about improving latency of existing
> > systems while maintaining efficiency (max supported cpu util).
> >
> > For hard real-time systems we sometimes throw cores at the problem and run no
> > more than one thread per cpu. But if we want efficiency we have to share cpus
> > with scheduling policies. Disconnecting the cpu scheduler with isolcpus results
> > in losing too much of the machine capacity. CPU scheduling is needed for both
> > kernel and userspace threads.
> >
> > For our use case we need to move kernel threads away from certain vcpu threads,
> > but other vcpu threads can share cpus with kernel threads. The ratio changes
> > from time to time. Permanently putting aside a few cpus results in a reduction
> > in machine capacity.
> >
> > The PF_NO_SETAFFINTIY case is already handled by the patch. These threads will
> > run in root cgroup with affinities just like before.
> >
> > The original justifications for the cpuset feature is here and the reasons are
> > still applicable:
> >
> > "The management of large computer systems, with many processors (CPUs), complex
> > memory cache hierarchies and multiple Memory Nodes having non-uniform access
> > times (NUMA) presents additional challenges for the efficient scheduling and
> > memory placement of processes."
> >
> > "But larger systems, which benefit more from careful processor and memory
> > placement to reduce memory access times and contention.."
> >
> > "These subsets, or “soft partitions” must be able to be dynamically adjusted, as
> > the job mix changes, without impacting other concurrently executing jobs."
> >
> > https://docs.kernel.org/admin-guide/cgroup-v1/cpusets.html
> >
> > -Xi
> >
> If you create a cpuset root partition, we are pushing some kthreads
> aways from CPUs dedicated to the newly created partition which has its
> own scheduling domain separate from the cgroup root. I do realize that
> the current way of excluding only per cpu kthreads isn't quite right. So
> I send out a new patch to extend to all the PF_NO_SETAFFINITY kthreads.
>
> So instead of putting kthreads into the dedicated cpuset, we still keep
> them in the root cgroup. Instead we can create a separate cpuset
> partition to run the workload without interference from the background
> kthreads. Will that functionality suit your current need?
>
> Cheers,
> Longman
>

It's likely a major improvement over a fixed partition but maybe still not fully
flexible. I am not familiar with cpuset partitions but I wonder if the following
case can be supported:

Starting from
16 cpus
Root has cpu 0-3, 8-15
Job A has cpu 4-7 exclusive
Kernel threads cannot run on cpus 4-7, which is good.

Now add best-effort Job C, which runs under SCHED_IDLE and rarely enters kernel
mode. As we expect C can be easily preempted, we allow it to share cpus with A
and with kernel threads to maximize throughput. Is there a layout that supports
the requirements below?

Job C threads on cpu 0-15
Job A threads on cpu 4-7
No kernel threads on cpu 4-7

-Xi
Re: [RFC/PATCH] sched: Support moving kthreads into cpuset cgroups
Posted by Waiman Long 9 months ago
On 5/8/25 6:39 PM, Xi Wang wrote:
> On Thu, May 8, 2025 at 12:35 PM Waiman Long <llong@redhat.com> wrote:
>> On 5/8/25 1:51 PM, Xi Wang wrote:
>>> I think our problem spaces are different. Perhaps your problems are closer to
>>> hard real-time systems but our problems are about improving latency of existing
>>> systems while maintaining efficiency (max supported cpu util).
>>>
>>> For hard real-time systems we sometimes throw cores at the problem and run no
>>> more than one thread per cpu. But if we want efficiency we have to share cpus
>>> with scheduling policies. Disconnecting the cpu scheduler with isolcpus results
>>> in losing too much of the machine capacity. CPU scheduling is needed for both
>>> kernel and userspace threads.
>>>
>>> For our use case we need to move kernel threads away from certain vcpu threads,
>>> but other vcpu threads can share cpus with kernel threads. The ratio changes
>>> from time to time. Permanently putting aside a few cpus results in a reduction
>>> in machine capacity.
>>>
>>> The PF_NO_SETAFFINTIY case is already handled by the patch. These threads will
>>> run in root cgroup with affinities just like before.
>>>
>>> The original justifications for the cpuset feature is here and the reasons are
>>> still applicable:
>>>
>>> "The management of large computer systems, with many processors (CPUs), complex
>>> memory cache hierarchies and multiple Memory Nodes having non-uniform access
>>> times (NUMA) presents additional challenges for the efficient scheduling and
>>> memory placement of processes."
>>>
>>> "But larger systems, which benefit more from careful processor and memory
>>> placement to reduce memory access times and contention.."
>>>
>>> "These subsets, or “soft partitions” must be able to be dynamically adjusted, as
>>> the job mix changes, without impacting other concurrently executing jobs."
>>>
>>> https://docs.kernel.org/admin-guide/cgroup-v1/cpusets.html
>>>
>>> -Xi
>>>
>> If you create a cpuset root partition, we are pushing some kthreads
>> aways from CPUs dedicated to the newly created partition which has its
>> own scheduling domain separate from the cgroup root. I do realize that
>> the current way of excluding only per cpu kthreads isn't quite right. So
>> I send out a new patch to extend to all the PF_NO_SETAFFINITY kthreads.
>>
>> So instead of putting kthreads into the dedicated cpuset, we still keep
>> them in the root cgroup. Instead we can create a separate cpuset
>> partition to run the workload without interference from the background
>> kthreads. Will that functionality suit your current need?
>>
>> Cheers,
>> Longman
>>
> It's likely a major improvement over a fixed partition but maybe still not fully
> flexible. I am not familiar with cpuset partitions but I wonder if the following
> case can be supported:
>
> Starting from
> 16 cpus
> Root has cpu 0-3, 8-15
> Job A has cpu 4-7 exclusive
> Kernel threads cannot run on cpu 4-8 which is good.
There will still be some kernel threads with PF_NO_SETAFFINITY flag set.

>
> Now adding best effort Job B, which is under SCHED_IDLE and rarely enters kernel
> mode. As we expect C can be easily preempted we allow it to share cpus with A
> and kernel threads to maximize throughput. Is there a layout that supports the
> requirements below?
>
> Job C threads on cpu 0-15

A task/thread can only be in one cpuset, so it cannot span all the CPUs.
However, if there are multiple threads within the process, some of the
threads can be moved to a different cpuset since cpuset is a threaded
controller. With proper per-thread placement, you can have a job with
threads spanning all the CPUs.

Cheers,
Longman

> Job A threads on cpu 4-7
> No kernel threads on cpu 4-7
>
> -Xi
>

Re: [RFC/PATCH] sched: Support moving kthreads into cpuset cgroups
Posted by Xi Wang 9 months ago
On Thu, May 8, 2025 at 5:30 PM Waiman Long <llong@redhat.com> wrote:
>
> On 5/8/25 6:39 PM, Xi Wang wrote:
> > On Thu, May 8, 2025 at 12:35 PM Waiman Long <llong@redhat.com> wrote:
> >> On 5/8/25 1:51 PM, Xi Wang wrote:
> >>> I think our problem spaces are different. Perhaps your problems are closer to
> >>> hard real-time systems but our problems are about improving latency of existing
> >>> systems while maintaining efficiency (max supported cpu util).
> >>>
> >>> For hard real-time systems we sometimes throw cores at the problem and run no
> >>> more than one thread per cpu. But if we want efficiency we have to share cpus
> >>> with scheduling policies. Disconnecting the cpu scheduler with isolcpus results
> >>> in losing too much of the machine capacity. CPU scheduling is needed for both
> >>> kernel and userspace threads.
> >>>
> >>> For our use case we need to move kernel threads away from certain vcpu threads,
> >>> but other vcpu threads can share cpus with kernel threads. The ratio changes
> >>> from time to time. Permanently putting aside a few cpus results in a reduction
> >>> in machine capacity.
> >>>
> >>> The PF_NO_SETAFFINITY case is already handled by the patch. These threads will
> >>> run in the root cgroup with the same affinities as before.
> >>>
> >>> The original justification for the cpuset feature is here and the reasons are
> >>> still applicable:
> >>>
> >>> "The management of large computer systems, with many processors (CPUs), complex
> >>> memory cache hierarchies and multiple Memory Nodes having non-uniform access
> >>> times (NUMA) presents additional challenges for the efficient scheduling and
> >>> memory placement of processes."
> >>>
> >>> "But larger systems, which benefit more from careful processor and memory
> >>> placement to reduce memory access times and contention.."
> >>>
> >>> "These subsets, or “soft partitions” must be able to be dynamically adjusted, as
> >>> the job mix changes, without impacting other concurrently executing jobs."
> >>>
> >>> https://docs.kernel.org/admin-guide/cgroup-v1/cpusets.html
> >>>
> >>> -Xi
> >>>
> >> If you create a cpuset root partition, we are pushing some kthreads
> >> away from CPUs dedicated to the newly created partition, which has its
> >> own scheduling domain separate from the cgroup root. I do realize that
> >> the current way of excluding only per-cpu kthreads isn't quite right, so
> >> I sent out a new patch to extend that to all the PF_NO_SETAFFINITY kthreads.
> >>
> >> So instead of putting kthreads into the dedicated cpuset, we still keep
> >> them in the root cgroup, and we can then create a separate cpuset
> >> partition to run the workload without interference from the background
> >> kthreads. Will that functionality suit your current need?
> >>
> >> Cheers,
> >> Longman
> >>
> > It's likely a major improvement over a fixed partition but maybe still not fully
> > flexible. I am not familiar with cpuset partitions but I wonder if the following
> > case can be supported:
> >
> > Starting from
> > 16 cpus
> > Root has cpu 0-3, 8-15
> > Job A has cpu 4-7 exclusive
> > Kernel threads cannot run on cpu 4-7, which is good.
> There will still be some kernel threads with the PF_NO_SETAFFINITY flag set.
>
> >
> > Now adding a best-effort Job C, which is under SCHED_IDLE and rarely enters kernel
> > mode. As we expect C can be easily preempted, we allow it to share cpus with A
> > and with kernel threads to maximize throughput. Is there a layout that supports
> > the requirements below?
> >
> > Job C threads on cpu 0-15
>
> A task/thread can only be in one cpuset, so a single task cannot span all
> the CPUs. However, if there are multiple threads within the process, some
> of the threads can be moved to a different cpuset, since cpuset is a
> threaded controller. With proper per-thread placement, you can have a job
> with threads spanning all the CPUs.
>
> Cheers,
> Longman
>
> > Job A threads on cpu 4-7
> > No kernel threads on cpu 4-7
> >
> > -Xi
> >
>

Partitions cannot have overlapping cpus but regular cpusets can. This is
probably where regular cpusets are still more flexible.
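
For reference, a partition root is declared through cpuset.cpus.partition
and its cpus must then be exclusive to it (untested sketch, the cpu range
is only an example):

mkdir /sys/fs/cgroup/joba
echo 4-7 > /sys/fs/cgroup/joba/cpuset.cpus
echo root > /sys/fs/cgroup/joba/cpuset.cpus.partition  # rejected/invalid if 4-7 overlap a sibling
cat /sys/fs/cgroup/joba/cpuset.cpus.partition          # reads "root" once it takes effect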

-Xi
Re: [RFC/PATCH] sched: Support moving kthreads into cpuset cgroups
Posted by Waiman Long 9 months, 1 week ago
On 5/6/25 2:35 PM, Xi Wang wrote:
> In theory we should be able to manage kernel tasks with cpuset
> cgroups just like user tasks, would be a flexible way to limit
> interferences to real-time and other sensitive workloads. This is
> however not supported today: When setting cpu affinity for kthreads,
> kernel code uses a simpler control path that directly lead to
> __set_cpus_allowed_ptr or __ktread_bind_mask. Neither honors cpuset
> restrictions.
>
> This patch adds cpuset support for kernel tasks by merging userspace
> and kernel cpu affinity control paths and applying the same
> restrictions to kthreads.
>
> The PF_NO_SETAFFINITY flag is still supported for tasks that have to
> run with certain cpu affinities. Kernel ensures kthreads with this
> flag have their affinities locked and they stay in the root cpuset:
>
> If userspace moves kthreadd out of the root cpuset (see example
> below), a newly forked kthread will be in a non root cgroup as well.
> If PF_NO_SETAFFINITY is detected for the kthread, it will move itself
> into the root cpuset before the threadfn is called. This does depend
> on the kthread create -> kthread bind -> wake up sequence.
>
> Since kthreads are clones of kthreadd, the typical usage pattern is:
>
> Create a cpuset cgroup for kernel threads.
>
> Move kthreadd to that cgroup - all new newly created kthreads are
> automatically enrolled into that cgroup.
>
> Move all remaining unlocked (!PF_NO_SETAFFINITY) kthreads into that
> group.
>
> After these steps, all unlocked kthreads are managed by the cgroup,
> including current and future kthreads.
>
> Command line example:
>
> mkdir /sys/fs/cgroup/kernel
> echo "+cpuset" > /sys/fs/cgroup/cgroup.subtree_control
> echo "+cpuset" > /sys/fs/cgroup/kernel/cgroup.subtree_control
>
> ktd=`pgrep -x kthreadd`; echo "move kthreadd/$ktd first"; echo $ktd > /dev/cgroup/cpuset/kernel/tasks
> kthreads=`ps -e -o pgrp= -o pid=  | sed -ne 's/^ *0 *// p'`
> for p in $kthreads; do echo "moving $p (ok to fail for locked kthreads)"; echo $p > /sys/fs/cgroup/kernel/cgroup.procs; done
> echo 4-7 > /sys/fs/cgroup/kernel/cpuset.cpus
>
> Signed-off-by: Xi Wang <xii@google.com>
> ---
>   include/linux/kthread.h | 10 ++++-
>   include/linux/sched.h   | 11 +++++
>   kernel/cgroup/cpuset.c  | 31 ++++++++++++--
>   kernel/kthread.c        | 89 +++++++++++++++++++++++++++++++++++---
>   kernel/sched/core.c     | 95 ++++++++++++++++++++++++++++++++++++++---
>   kernel/sched/sched.h    |  6 ---
>   kernel/sched/syscalls.c | 63 +--------------------------
>   kernel/workqueue.c      |  7 ++-
>   8 files changed, 226 insertions(+), 86 deletions(-)
>
> diff --git a/include/linux/kthread.h b/include/linux/kthread.h
> index 8d27403888ce..36215a30d7f7 100644
> --- a/include/linux/kthread.h
> +++ b/include/linux/kthread.h
> @@ -13,6 +13,14 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
>   					   int node,
>   					   const char namefmt[], ...);
>   
> +__printf(4, 5)
> +struct task_struct *kthread_create_on_node_root_cpuset(
> +					   int (*threadfn)(void *data),
> +					   void *data,
> +					   int node,
> +					   const char namefmt[], ...);
> +
> +
>   /**
>    * kthread_create - create a kthread on the current node
>    * @threadfn: the function to run in the thread
> @@ -27,7 +35,6 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
>   #define kthread_create(threadfn, data, namefmt, arg...) \
>   	kthread_create_on_node(threadfn, data, NUMA_NO_NODE, namefmt, ##arg)
>   
> -
>   struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
>   					  void *data,
>   					  unsigned int cpu,
> @@ -85,6 +92,7 @@ kthread_run_on_cpu(int (*threadfn)(void *data), void *data,
>   void free_kthread_struct(struct task_struct *k);
>   void kthread_bind(struct task_struct *k, unsigned int cpu);
>   void kthread_bind_mask(struct task_struct *k, const struct cpumask *mask);
> +void kthread_bind_mask_cpuset(struct task_struct *k, const struct cpumask *mask);
>   int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask);
>   int kthread_stop(struct task_struct *k);
>   int kthread_stop_put(struct task_struct *k);
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 0782de6b20d5..45b912e21239 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1855,6 +1855,13 @@ extern int cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpu
>   extern int task_can_attach(struct task_struct *p);
>   extern int dl_bw_alloc(int cpu, u64 dl_bw);
>   extern void dl_bw_free(int cpu, u64 dl_bw);
> +
> +#define SCA_CHECK		0x01
> +#define SCA_MIGRATE_DISABLE	0x02
> +#define SCA_MIGRATE_ENABLE	0x04
> +#define SCA_USER		0x08
> +#define SCA_NO_CPUSET	0x10
> +
>   #ifdef CONFIG_SMP
>   
>   /* do_set_cpus_allowed() - consider using set_cpus_allowed_ptr() instead */
> @@ -1868,6 +1875,9 @@ extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new
>    * Return: zero if successful, or a negative error code
>    */
>   extern int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask);
> +extern int set_cpus_allowed_ptr_no_cpuset(struct task_struct *p, const struct cpumask *new_mask);
> +extern int set_cpus_allowed_ptr_flags(
> +	struct task_struct *p, const struct cpumask *new_mask, u32 flags);
>   extern int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node);
>   extern void release_user_cpus_ptr(struct task_struct *p);
>   extern int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask);
> @@ -1884,6 +1894,7 @@ static inline int set_cpus_allowed_ptr(struct task_struct *p, const struct cpuma
>   		return -EINVAL;
>   	return 0;
>   }
> +
>   static inline int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node)
>   {
>   	if (src->user_cpus_ptr)
> diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
> index d0143b3dce47..ef929b349da8 100644
> --- a/kernel/cgroup/cpuset.c
> +++ b/kernel/cgroup/cpuset.c
> @@ -1128,6 +1128,13 @@ void cpuset_update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus)
>   	while ((task = css_task_iter_next(&it))) {
>   		const struct cpumask *possible_mask = task_cpu_possible_mask(task);
>   
> +		/*
> +		 * See also cpuset_can_attach. A thread with the flag could temporarily
> +		 * reside in a non root cpuset. Don't change its affinity.
> +		 */
> +		if (task->flags & PF_NO_SETAFFINITY)
> +			continue;
> +
>   		if (top_cs) {
>   			/*
>   			 * Percpu kthreads in top_cpuset are ignored
> @@ -3034,7 +3041,14 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
>   	mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems);
>   
>   	cgroup_taskset_for_each(task, css, tset) {
> -		ret = task_can_attach(task);
> +		/*
> +		 * With the kthreads in cpuset feature, kthreadd can be moved to a
> +		 * non root cpuset. We want to allow a PF_NO_SETAFFINITY task to be
> +		 * spawned and then moved to root, which needs to be allowed here.
> +		 */
> +		ret = !(cs == &top_cpuset && task->flags & PF_NO_SETAFFINITY);
> +		/* Check regular threads */
> +		ret = ret && task_can_attach(task);
>   		if (ret)
>   			goto out_unlock;
>   
> @@ -3127,7 +3141,7 @@ static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task)
>   	 * can_attach beforehand should guarantee that this doesn't
>   	 * fail.  TODO: have a better way to handle failure here
>   	 */
> -	WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
> +	WARN_ON_ONCE(set_cpus_allowed_ptr_flags(task, cpus_attach, SCA_NO_CPUSET));
>   
>   	cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
>   	cpuset1_update_task_spread_flags(cs, task);
> @@ -3164,8 +3178,19 @@ static void cpuset_attach(struct cgroup_taskset *tset)
>   
>   	guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
>   
> -	cgroup_taskset_for_each(task, css, tset)
> +	cgroup_taskset_for_each(task, css, tset) {
> +		/*
> +		 * See cpuset_can_attach.
> +		 * With the kthreads in cpuset feature, kthreadd can be moved to a
> +		 * non root cpuset. We want to allow a PF_NO_SETAFFINITY task to be
> +		 * spawned and then moved to root as it starts running. Don't reset the
> +		 * cpu affinity in this case because the thread could have already been
> +		 * pinned to a cpu with kthread_bind and we want to preserve that.
> +		 */
> +		if (task->flags & PF_NO_SETAFFINITY)
> +			continue;
>   		cpuset_attach_task(cs, task);
> +	}
>   
>   	/*
>   	 * Change mm for all threadgroup leaders. This is expensive and may
> diff --git a/kernel/kthread.c b/kernel/kthread.c
> index 77c44924cf54..2689eb67846e 100644
> --- a/kernel/kthread.c
> +++ b/kernel/kthread.c
> @@ -45,6 +45,7 @@ struct kthread_create_info
>   	int (*threadfn)(void *data);
>   	void *data;
>   	int node;
> +	bool move_to_root;
>   
>   	/* Result passed back to kthread_create() from kthreadd. */
>   	struct task_struct *result;
> @@ -409,6 +410,9 @@ static void kthread_affine_node(void)
>   	}
>   }
>   
> +int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
> +		       bool threadgroup);
> +
>   static int kthread(void *_create)
>   {
>   	static const struct sched_param param = { .sched_priority = 0 };
> @@ -418,6 +422,7 @@ static int kthread(void *_create)
>   	void *data = create->data;
>   	struct completion *done;
>   	struct kthread *self;
> +	bool move_to_root = create->move_to_root;
>   	int ret;
>   
>   	self = to_kthread(current);
> @@ -454,6 +459,42 @@ static int kthread(void *_create)
>   
>   	self->started = 1;
>   
> +#ifdef CONFIG_CPUSETS
> +	/*
> +	 * With the kthreads in cgroup feature, kthreadd can be optionally put
> +	 * into a non root cpuset (such that newly created kernel threads are
> +	 * automatically restricted). Certain kernel threads that must be in
> +	 * the root cpuset are moved to root here.
> +	 *
> +	 * This code is called after the schedule() above, thus kthread_bind
> +	 * or kthread_bind_mask should have already been called if present.
> +	 * PF_NO_SETAFFINITY set by these functions implicitly triggers the
> +	 * move to root action. It can also be explicitly triggered with the
> +	 * move_to_root flag.
> +	 *
> +	 * Potential races between the conditional and cgroup mutex lock:
> +	 *
> +	 * current can be out of root then moved into root before mutex lock,
> +	 * which is ok because cgroup_attach_task should be able to handle
> +	 * src == dst. There are checks in cgroup_migrate_prepare_dst etc.
> +	 *
> +	 * current can be in root then moved out of root before mutex lock,
> +	 * which is also ok: For threads with PF_NO_SETAFFINITY the move is
> +	 * disallowed so we can't have this race. For other threads, we allow
> +	 * users to move them out of the root cgroup and there is no guarantee
> +	 * on the order of actions.
> +	 */
> +	if ((current->flags & PF_NO_SETAFFINITY || move_to_root) &&
> +	  !task_css_is_root(current, cpuset_cgrp_id)) {
> +		mutex_lock(&cgroup_mutex);
> +		percpu_down_write(&cgroup_threadgroup_rwsem);
> +		if (cgroup_attach_task(&cpuset_cgrp_subsys.root->cgrp, current, false))
> +			WARN_ONCE(1, "Cannot move newly created kernel thread to root cpuset");
> +		percpu_up_write(&cgroup_threadgroup_rwsem);
> +		mutex_unlock(&cgroup_mutex);
> +	}
> +#endif
> +
>   	if (!(current->flags & PF_NO_SETAFFINITY) && !self->preferred_affinity)
>   		kthread_affine_node();
>   
> @@ -504,7 +545,8 @@ static __printf(4, 0)
>   struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
>   						    void *data, int node,
>   						    const char namefmt[],
> -						    va_list args)
> +						    va_list args,
> +						    bool move_to_root)
>   {
>   	DECLARE_COMPLETION_ONSTACK(done);
>   	struct task_struct *task;
> @@ -516,6 +558,7 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
>   	create->threadfn = threadfn;
>   	create->data = data;
>   	create->node = node;
> +	create->move_to_root = move_to_root;
>   	create->done = &done;
>   	create->full_name = kvasprintf(GFP_KERNEL, namefmt, args);
>   	if (!create->full_name) {
> @@ -585,14 +628,40 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
>   	va_list args;
>   
>   	va_start(args, namefmt);
> -	task = __kthread_create_on_node(threadfn, data, node, namefmt, args);
> +	task = __kthread_create_on_node(threadfn, data, node, namefmt, args, false);
>   	va_end(args);
>   
>   	return task;
>   }
>   EXPORT_SYMBOL(kthread_create_on_node);
>   
> -static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, unsigned int state)
> +/*
> + * Move the newly created kthread to root cpuset if it is not already there.
> + * This happens if kthreadd is moved out of root cpuset by user. Otherwise same
> + * as the regular version.
> + */
> +struct task_struct *kthread_create_on_node_root_cpuset(
> +					   int (*threadfn)(void *data),
> +					   void *data, int node,
> +					   const char namefmt[],
> +					   ...)
> +
> +{
> +	struct task_struct *task;
> +	va_list args;
> +
> +	va_start(args, namefmt);
> +	task = __kthread_create_on_node(threadfn, data, node, namefmt, args, true);
> +	va_end(args);
> +
> +	return task;
> +}
> +EXPORT_SYMBOL(kthread_create_on_node_root_cpuset);
> +
> +
> +static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask,
> +  unsigned int state, bool no_setaffinity)
> +
>   {
>   	unsigned long flags;
>   
> @@ -604,22 +673,28 @@ static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mas
>   	/* It's safe because the task is inactive. */
>   	raw_spin_lock_irqsave(&p->pi_lock, flags);
>   	do_set_cpus_allowed(p, mask);
> -	p->flags |= PF_NO_SETAFFINITY;
> +	if (no_setaffinity)
> +		p->flags |= PF_NO_SETAFFINITY;
>   	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
>   }
>   
>   static void __kthread_bind(struct task_struct *p, unsigned int cpu, unsigned int state)
>   {
> -	__kthread_bind_mask(p, cpumask_of(cpu), state);
> +	__kthread_bind_mask(p, cpumask_of(cpu), state, true);
>   }
>   
>   void kthread_bind_mask(struct task_struct *p, const struct cpumask *mask)
>   {
>   	struct kthread *kthread = to_kthread(p);
> -	__kthread_bind_mask(p, mask, TASK_UNINTERRUPTIBLE);
> +	__kthread_bind_mask(p, mask, TASK_UNINTERRUPTIBLE, true);
>   	WARN_ON_ONCE(kthread->started);
>   }
>   
> +void kthread_bind_mask_cpuset(struct task_struct *p, const struct cpumask *mask)
> +{
> +	set_cpus_allowed_ptr(p, mask);
> +}
> +
>   /**
>    * kthread_bind - bind a just-created kthread to a cpu.
>    * @p: thread created by kthread_create().
> @@ -1044,7 +1119,7 @@ __kthread_create_worker_on_node(unsigned int flags, int node,
>   	kthread_init_worker(worker);
>   
>   	task = __kthread_create_on_node(kthread_worker_fn, worker,
> -					node, namefmt, args);
> +					node, namefmt, args, true);
>   	if (IS_ERR(task))
>   		goto fail_task;
>   
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 54e7d63f7785..b604a8451ba3 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -2393,7 +2393,7 @@ void migrate_enable(void)
>   	struct task_struct *p = current;
>   	struct affinity_context ac = {
>   		.new_mask  = &p->cpus_mask,
> -		.flags     = SCA_MIGRATE_ENABLE,
> +		.flags     = SCA_MIGRATE_ENABLE | SCA_NO_CPUSET,
>   	};
>   
>   #ifdef CONFIG_DEBUG_PREEMPT
> @@ -3153,7 +3153,7 @@ static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
>    * task must not exit() & deallocate itself prematurely. The
>    * call is not atomic; no spinlocks may be held.
>    */
> -int __set_cpus_allowed_ptr(struct task_struct *p, struct affinity_context *ctx)
> +static int do_set_cpus_allowed_ptr(struct task_struct *p, struct affinity_context *ctx)
>   {
>   	struct rq_flags rf;
>   	struct rq *rq;
> @@ -3171,6 +3171,79 @@ int __set_cpus_allowed_ptr(struct task_struct *p, struct affinity_context *ctx)
>   	return __set_cpus_allowed_ptr_locked(p, ctx, rq, &rf);
>   }
>   
> +int __set_cpus_allowed_ptr(struct task_struct *p,
> +				  struct affinity_context *ctx)
The __set_cpus_allowed_ptr() function is almost the same as 
__sched_setaffinity(). Please break the moving and renaming parts out 
into a separate patch to make it easier to review.
> +{
> +	int retval;
> +	cpumask_var_t cpus_allowed, new_mask;
> +
> +	/*
> +	 * Don't restrict the thread to cpuset if explicitly specified or if locked.
> +	 */
> +	if ((ctx->flags & SCA_NO_CPUSET) || (p->flags & PF_NO_SETAFFINITY))
> +		return do_set_cpus_allowed_ptr(p, ctx);

Why would you allow a PF_NO_SETAFFINITY task to change its affinity?
What exactly is the purpose of the SCA_NO_CPUSET flag?

Cheers,
Longman

> +
> +	if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
> +		WARN_ONCE(!(ctx->flags & SCA_USER),
> +		  "Unable to restrict kernel thread to cpuset due to low memory");
> +		return -ENOMEM;
> +	}
> +
> +	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
> +		WARN_ONCE(!(ctx->flags & SCA_USER),
> +		  "Unable to restrict kernel thread to cpuset due to low memory");
> +		retval = -ENOMEM;
> +		goto out_free_cpus_allowed;
> +	}
> +
> +	cpuset_cpus_allowed(p, cpus_allowed);
> +	cpumask_and(new_mask, ctx->new_mask, cpus_allowed);
> +
> +	ctx->new_mask = new_mask;
> +	ctx->flags |= SCA_CHECK;
> +
> +	retval = dl_task_check_affinity(p, new_mask);
> +	if (retval)
> +		goto out_free_new_mask;
> +
> +	retval = do_set_cpus_allowed_ptr(p, ctx);
> +	if (retval)
> +		goto out_free_new_mask;
> +
> +	cpuset_cpus_allowed(p, cpus_allowed);
> +	if (!cpumask_subset(new_mask, cpus_allowed)) {
> +		/*
> +		 * We must have raced with a concurrent cpuset update.
> +		 * Just reset the cpumask to the cpuset's cpus_allowed.
> +		 */
> +		cpumask_copy(new_mask, cpus_allowed);
> +
> +		/*
> +		 * If SCA_USER is set, a 2nd call to __set_cpus_allowed_ptr()
> +		 * will restore the previous user_cpus_ptr value.
> +		 *
> +		 * In the unlikely event a previous user_cpus_ptr exists,
> +		 * we need to further restrict the mask to what is allowed
> +		 * by that old user_cpus_ptr.
> +		 */
> +		if (unlikely((ctx->flags & SCA_USER) && ctx->user_mask)) {
> +			bool empty = !cpumask_and(new_mask, new_mask,
> +						  ctx->user_mask);
> +
> +			if (empty)
> +				cpumask_copy(new_mask, cpus_allowed);
> +		}
> +		__set_cpus_allowed_ptr(p, ctx);
> +		retval = -EINVAL;
> +	}
> +
> +out_free_new_mask:
> +	free_cpumask_var(new_mask);
> +out_free_cpus_allowed:
> +	free_cpumask_var(cpus_allowed);
> +	return retval;
> +}
> +
>   int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
>   {
>   	struct affinity_context ac = {
> @@ -3182,6 +3255,17 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
>   }
>   EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
>   
> +int set_cpus_allowed_ptr_flags(struct task_struct *p, const struct cpumask *new_mask, u32 flags)
> +{
> +	struct affinity_context ac = {
> +		.new_mask  = new_mask,
> +		.flags     = flags,
> +	};
> +
> +	return __set_cpus_allowed_ptr(p, &ac);
> +}
> +EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr_flags);
> +
>   /*
>    * Change a given task's CPU affinity to the intersection of its current
>    * affinity mask and @subset_mask, writing the resulting mask to @new_mask.
> @@ -3283,15 +3367,15 @@ void relax_compatible_cpus_allowed_ptr(struct task_struct *p)
>   {
>   	struct affinity_context ac = {
>   		.new_mask  = task_user_cpus(p),
> -		.flags     = 0,
> +		.flags     = SCA_NO_CPUSET,
>   	};
>   	int ret;
>   
>   	/*
> -	 * Try to restore the old affinity mask with __sched_setaffinity().
> +	 * Try to restore the old affinity mask with __set_cpus_allowed_ptr().
>   	 * Cpuset masking will be done there too.
>   	 */
> -	ret = __sched_setaffinity(p, &ac);
> +	ret = __set_cpus_allowed_ptr(p, &ac);
>   	WARN_ON_ONCE(ret);
>   }
>   
> @@ -7292,6 +7376,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
>   }
>   #endif
>   
> +
>   #if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC)
>   int __sched __cond_resched(void)
>   {
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 91bea8d0a90b..9833432c9a75 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -2576,11 +2576,6 @@ static inline bool sched_fair_runnable(struct rq *rq)
>   extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
>   extern struct task_struct *pick_task_idle(struct rq *rq);
>   
> -#define SCA_CHECK		0x01
> -#define SCA_MIGRATE_DISABLE	0x02
> -#define SCA_MIGRATE_ENABLE	0x04
> -#define SCA_USER		0x08
> -
>   #ifdef CONFIG_SMP
>   
>   extern void update_group_capacity(struct sched_domain *sd, int cpu);
> @@ -3939,7 +3934,6 @@ static inline int rt_effective_prio(struct task_struct *p, int prio)
>   #endif /* !CONFIG_RT_MUTEXES */
>   
>   extern int __sched_setscheduler(struct task_struct *p, const struct sched_attr *attr, bool user, bool pi);
> -extern int __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx);
>   extern const struct sched_class *__setscheduler_class(int policy, int prio);
>   extern void set_load_weight(struct task_struct *p, bool update_load);
>   extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags);
> diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
> index 547c1f05b667..6528153c1297 100644
> --- a/kernel/sched/syscalls.c
> +++ b/kernel/sched/syscalls.c
> @@ -1151,67 +1151,6 @@ int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
>   }
>   #endif /* CONFIG_SMP */
>   
> -int __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx)
> -{
> -	int retval;
> -	cpumask_var_t cpus_allowed, new_mask;
> -
> -	if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL))
> -		return -ENOMEM;
> -
> -	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
> -		retval = -ENOMEM;
> -		goto out_free_cpus_allowed;
> -	}
> -
> -	cpuset_cpus_allowed(p, cpus_allowed);
> -	cpumask_and(new_mask, ctx->new_mask, cpus_allowed);
> -
> -	ctx->new_mask = new_mask;
> -	ctx->flags |= SCA_CHECK;
> -
> -	retval = dl_task_check_affinity(p, new_mask);
> -	if (retval)
> -		goto out_free_new_mask;
> -
> -	retval = __set_cpus_allowed_ptr(p, ctx);
> -	if (retval)
> -		goto out_free_new_mask;
> -
> -	cpuset_cpus_allowed(p, cpus_allowed);
> -	if (!cpumask_subset(new_mask, cpus_allowed)) {
> -		/*
> -		 * We must have raced with a concurrent cpuset update.
> -		 * Just reset the cpumask to the cpuset's cpus_allowed.
> -		 */
> -		cpumask_copy(new_mask, cpus_allowed);
> -
> -		/*
> -		 * If SCA_USER is set, a 2nd call to __set_cpus_allowed_ptr()
> -		 * will restore the previous user_cpus_ptr value.
> -		 *
> -		 * In the unlikely event a previous user_cpus_ptr exists,
> -		 * we need to further restrict the mask to what is allowed
> -		 * by that old user_cpus_ptr.
> -		 */
> -		if (unlikely((ctx->flags & SCA_USER) && ctx->user_mask)) {
> -			bool empty = !cpumask_and(new_mask, new_mask,
> -						  ctx->user_mask);
> -
> -			if (empty)
> -				cpumask_copy(new_mask, cpus_allowed);
> -		}
> -		__set_cpus_allowed_ptr(p, ctx);
> -		retval = -EINVAL;
> -	}
> -
> -out_free_new_mask:
> -	free_cpumask_var(new_mask);
> -out_free_cpus_allowed:
> -	free_cpumask_var(cpus_allowed);
> -	return retval;
> -}
> -
>   long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
>   {
>   	struct affinity_context ac;
> @@ -1252,7 +1191,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
>   		.flags     = SCA_USER,
>   	};
>   
> -	retval = __sched_setaffinity(p, &ac);
> +	retval = __set_cpus_allowed_ptr(p, &ac);
>   	kfree(ac.user_mask);
>   
>   	return retval;
> diff --git a/kernel/workqueue.c b/kernel/workqueue.c
> index f9ef467020cf..d51c0716674e 100644
> --- a/kernel/workqueue.c
> +++ b/kernel/workqueue.c
> @@ -2813,7 +2813,10 @@ static struct worker *create_worker(struct worker_pool *pool)
>   		}
>   
>   		set_user_nice(worker->task, pool->attrs->nice);
> -		kthread_bind_mask(worker->task, pool_allowed_cpus(pool));
> +		if (!pool || (!worker->rescue_wq && pool->cpu >= 0))
> +			kthread_bind_mask(worker->task, pool_allowed_cpus(pool));
> +		else
> +			kthread_bind_mask_cpuset(worker->task, pool_allowed_cpus(pool));
>   	}
>   
>   	/* successful, attach the worker to the pool */
> @@ -5587,7 +5590,7 @@ static int init_rescuer(struct workqueue_struct *wq)
>   	if (wq->flags & WQ_UNBOUND)
>   		kthread_bind_mask(rescuer->task, unbound_effective_cpumask(wq));
>   	else
> -		kthread_bind_mask(rescuer->task, cpu_possible_mask);
> +		kthread_bind_mask_cpuset(rescuer->task, cpu_possible_mask);
>   	wake_up_process(rescuer->task);
>   
>   	return 0;
>
Re: [RFC/PATCH] sched: Support moving kthreads into cpuset cgroups
Posted by Xi Wang 9 months ago
On Tue, May 6, 2025 at 12:58 PM Waiman Long <llong@redhat.com> wrote:
>
> On 5/6/25 2:35 PM, Xi Wang wrote:
> > In theory we should be able to manage kernel tasks with cpuset
> > cgroups just like user tasks, would be a flexible way to limit
> > interferences to real-time and other sensitive workloads. This is
> > however not supported today: When setting cpu affinity for kthreads,
> > kernel code uses a simpler control path that directly lead to
> > __set_cpus_allowed_ptr or __ktread_bind_mask. Neither honors cpuset
> > restrictions.
> >
> > This patch adds cpuset support for kernel tasks by merging userspace
> > and kernel cpu affinity control paths and applying the same
> > restrictions to kthreads.
> >
> > The PF_NO_SETAFFINITY flag is still supported for tasks that have to
> > run with certain cpu affinities. Kernel ensures kthreads with this
> > flag have their affinities locked and they stay in the root cpuset:
> >
> > If userspace moves kthreadd out of the root cpuset (see example
> > below), a newly forked kthread will be in a non root cgroup as well.
> > If PF_NO_SETAFFINITY is detected for the kthread, it will move itself
> > into the root cpuset before the threadfn is called. This does depend
> > on the kthread create -> kthread bind -> wake up sequence.
> >
> > Since kthreads are clones of kthreadd, the typical usage pattern is:
> >
> > Create a cpuset cgroup for kernel threads.
> >
> > Move kthreadd to that cgroup - all new newly created kthreads are
> > automatically enrolled into that cgroup.
> >
> > Move all remaining unlocked (!PF_NO_SETAFFINITY) kthreads into that
> > group.
> >
> > After these steps, all unlocked kthreads are managed by the cgroup,
> > including current and future kthreads.
> >
> > Command line example:
> >
> > mkdir /sys/fs/cgroup/kernel
> > echo "+cpuset" > /sys/fs/cgroup/cgroup.subtree_control
> > echo "+cpuset" > /sys/fs/cgroup/kernel/cgroup.subtree_control
> >
> > ktd=`pgrep -x kthreadd`; echo "move kthreadd/$ktd first"; echo $ktd > /dev/cgroup/cpuset/kernel/tasks
> > kthreads=`ps -e -o pgrp= -o pid=  | sed -ne 's/^ *0 *// p'`
> > for p in $kthreads; do echo "moving $p (ok to fail for locked kthreads)"; echo $p > /sys/fs/cgroup/kernel/cgroup.procs; done
> > echo 4-7 > /sys/fs/cgroup/kernel/cpuset.cpus
> >
> > Signed-off-by: Xi Wang <xii@google.com>
> > ---
> >   include/linux/kthread.h | 10 ++++-
> >   include/linux/sched.h   | 11 +++++
> >   kernel/cgroup/cpuset.c  | 31 ++++++++++++--
> >   kernel/kthread.c        | 89 +++++++++++++++++++++++++++++++++++---
> >   kernel/sched/core.c     | 95 ++++++++++++++++++++++++++++++++++++++---
> >   kernel/sched/sched.h    |  6 ---
> >   kernel/sched/syscalls.c | 63 +--------------------------
> >   kernel/workqueue.c      |  7 ++-
> >   8 files changed, 226 insertions(+), 86 deletions(-)
> >
> > diff --git a/include/linux/kthread.h b/include/linux/kthread.h
> > index 8d27403888ce..36215a30d7f7 100644
> > --- a/include/linux/kthread.h
> > +++ b/include/linux/kthread.h
> > @@ -13,6 +13,14 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
> >                                          int node,
> >                                          const char namefmt[], ...);
> >
> > +__printf(4, 5)
> > +struct task_struct *kthread_create_on_node_root_cpuset(
> > +                                        int (*threadfn)(void *data),
> > +                                        void *data,
> > +                                        int node,
> > +                                        const char namefmt[], ...);
> > +
> > +
> >   /**
> >    * kthread_create - create a kthread on the current node
> >    * @threadfn: the function to run in the thread
> > @@ -27,7 +35,6 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
> >   #define kthread_create(threadfn, data, namefmt, arg...) \
> >       kthread_create_on_node(threadfn, data, NUMA_NO_NODE, namefmt, ##arg)
> >
> > -
> >   struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
> >                                         void *data,
> >                                         unsigned int cpu,
> > @@ -85,6 +92,7 @@ kthread_run_on_cpu(int (*threadfn)(void *data), void *data,
> >   void free_kthread_struct(struct task_struct *k);
> >   void kthread_bind(struct task_struct *k, unsigned int cpu);
> >   void kthread_bind_mask(struct task_struct *k, const struct cpumask *mask);
> > +void kthread_bind_mask_cpuset(struct task_struct *k, const struct cpumask *mask);
> >   int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask);
> >   int kthread_stop(struct task_struct *k);
> >   int kthread_stop_put(struct task_struct *k);
> > diff --git a/include/linux/sched.h b/include/linux/sched.h
> > index 0782de6b20d5..45b912e21239 100644
> > --- a/include/linux/sched.h
> > +++ b/include/linux/sched.h
> > @@ -1855,6 +1855,13 @@ extern int cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpu
> >   extern int task_can_attach(struct task_struct *p);
> >   extern int dl_bw_alloc(int cpu, u64 dl_bw);
> >   extern void dl_bw_free(int cpu, u64 dl_bw);
> > +
> > +#define SCA_CHECK            0x01
> > +#define SCA_MIGRATE_DISABLE  0x02
> > +#define SCA_MIGRATE_ENABLE   0x04
> > +#define SCA_USER             0x08
> > +#define SCA_NO_CPUSET        0x10
> > +
> >   #ifdef CONFIG_SMP
> >
> >   /* do_set_cpus_allowed() - consider using set_cpus_allowed_ptr() instead */
> > @@ -1868,6 +1875,9 @@ extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new
> >    * Return: zero if successful, or a negative error code
> >    */
> >   extern int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask);
> > +extern int set_cpus_allowed_ptr_no_cpuset(struct task_struct *p, const struct cpumask *new_mask);
> > +extern int set_cpus_allowed_ptr_flags(
> > +     struct task_struct *p, const struct cpumask *new_mask, u32 flags);
> >   extern int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node);
> >   extern void release_user_cpus_ptr(struct task_struct *p);
> >   extern int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask);
> > @@ -1884,6 +1894,7 @@ static inline int set_cpus_allowed_ptr(struct task_struct *p, const struct cpuma
> >               return -EINVAL;
> >       return 0;
> >   }
> > +
> >   static inline int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node)
> >   {
> >       if (src->user_cpus_ptr)
> > diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
> > index d0143b3dce47..ef929b349da8 100644
> > --- a/kernel/cgroup/cpuset.c
> > +++ b/kernel/cgroup/cpuset.c
> > @@ -1128,6 +1128,13 @@ void cpuset_update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus)
> >       while ((task = css_task_iter_next(&it))) {
> >               const struct cpumask *possible_mask = task_cpu_possible_mask(task);
> >
> > +             /*
> > +              * See also cpuset_can_attach. A thread with the flag could temporarily
> > +              * reside in a non root cpuset. Don't change its affinity.
> > +              */
> > +             if (task->flags & PF_NO_SETAFFINITY)
> > +                     continue;
> > +
> >               if (top_cs) {
> >                       /*
> >                        * Percpu kthreads in top_cpuset are ignored
> > @@ -3034,7 +3041,14 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
> >       mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems);
> >
> >       cgroup_taskset_for_each(task, css, tset) {
> > -             ret = task_can_attach(task);
> > +             /*
> > +              * With the kthreads in cpuset feature, kthreadd can be moved to a
> > +              * non root cpuset. We want to allow a PF_NO_SETAFFINITY task to be
> > +              * spawned and then moved to root, which needs to be allowed here.
> > +              */
> > +             ret = !(cs == &top_cpuset && task->flags & PF_NO_SETAFFINITY);
> > +             /* Check regular threads */
> > +             ret = ret && task_can_attach(task);
> >               if (ret)
> >                       goto out_unlock;
> >
> > @@ -3127,7 +3141,7 @@ static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task)
> >        * can_attach beforehand should guarantee that this doesn't
> >        * fail.  TODO: have a better way to handle failure here
> >        */
> > -     WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
> > +     WARN_ON_ONCE(set_cpus_allowed_ptr_flags(task, cpus_attach, SCA_NO_CPUSET));
> >
> >       cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
> >       cpuset1_update_task_spread_flags(cs, task);
> > @@ -3164,8 +3178,19 @@ static void cpuset_attach(struct cgroup_taskset *tset)
> >
> >       guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
> >
> > -     cgroup_taskset_for_each(task, css, tset)
> > +     cgroup_taskset_for_each(task, css, tset) {
> > +             /*
> > +              * See cpuset_can_attach.
> > +              * With the kthreads in cpuset feature, kthreadd can be moved to a
> > +              * non root cpuset. We want to allow a PF_NO_SETAFFINITY task to be
> > +              * spawned and then moved to root as it starts running. Don't reset the
> > +              * cpu affinity in this case because the thread could have already been
> > +              * pinned to a cpu with kthread_bind and we want to preserve that.
> > +              */
> > +             if (task->flags & PF_NO_SETAFFINITY)
> > +                     continue;
> >               cpuset_attach_task(cs, task);
> > +     }
> >
> >       /*
> >        * Change mm for all threadgroup leaders. This is expensive and may
> > diff --git a/kernel/kthread.c b/kernel/kthread.c
> > index 77c44924cf54..2689eb67846e 100644
> > --- a/kernel/kthread.c
> > +++ b/kernel/kthread.c
> > @@ -45,6 +45,7 @@ struct kthread_create_info
> >       int (*threadfn)(void *data);
> >       void *data;
> >       int node;
> > +     bool move_to_root;
> >
> >       /* Result passed back to kthread_create() from kthreadd. */
> >       struct task_struct *result;
> > @@ -409,6 +410,9 @@ static void kthread_affine_node(void)
> >       }
> >   }
> >
> > +int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
> > +                    bool threadgroup);
> > +
> >   static int kthread(void *_create)
> >   {
> >       static const struct sched_param param = { .sched_priority = 0 };
> > @@ -418,6 +422,7 @@ static int kthread(void *_create)
> >       void *data = create->data;
> >       struct completion *done;
> >       struct kthread *self;
> > +     bool move_to_root = create->move_to_root;
> >       int ret;
> >
> >       self = to_kthread(current);
> > @@ -454,6 +459,42 @@ static int kthread(void *_create)
> >
> >       self->started = 1;
> >
> > +#ifdef CONFIG_CPUSETS
> > +     /*
> > +      * With the kthreads in cgroup feature, kthreadd can be optionally put
> > +      * into a non root cpuset (such that newly created kernel threads are
> > +      * automatically restricted). Certain kernel threads that must be in
> > +      * the root cpuset are moved to root here.
> > +      *
> > +      * This code is called after the schedule() above, thus kthread_bind
> > +      * or kthread_bind_mask should have already been called if present.
> > +      * PF_NO_SETAFFINITY set by these functions implicitly triggers the
> > +      * move to root action. It can also be explicitly triggered with the
> > +      * move_to_root flag.
> > +      *
> > +      * Potential races between the conditional and cgroup mutex lock:
> > +      *
> > +      * current can be out of root then moved into root before mutex lock,
> > +      * which is ok because cgroup_attach_task should be able to handle
> > +      * src == dst. There are checks in cgroup_migrate_prepare_dst etc.
> > +      *
> > +      * current can be in root then moved out of root before mutex lock,
> > +      * which is also ok: For threads with PF_NO_SETAFFINITY the move is
> > +      * disallowed so we can't have this race. For other threads, we allow
> > +      * users to move them out of the root cgroup and there is no guarantee
> > +      * on the order of actions.
> > +      */
> > +     if ((current->flags & PF_NO_SETAFFINITY || move_to_root) &&
> > +       !task_css_is_root(current, cpuset_cgrp_id)) {
> > +             mutex_lock(&cgroup_mutex);
> > +             percpu_down_write(&cgroup_threadgroup_rwsem);
> > +             if (cgroup_attach_task(&cpuset_cgrp_subsys.root->cgrp, current, false))
> > +                     WARN_ONCE(1, "Cannot move newly created kernel thread to root cpuset");
> > +             percpu_up_write(&cgroup_threadgroup_rwsem);
> > +             mutex_unlock(&cgroup_mutex);
> > +     }
> > +#endif
> > +
> >       if (!(current->flags & PF_NO_SETAFFINITY) && !self->preferred_affinity)
> >               kthread_affine_node();
> >
> > @@ -504,7 +545,8 @@ static __printf(4, 0)
> >   struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
> >                                                   void *data, int node,
> >                                                   const char namefmt[],
> > -                                                 va_list args)
> > +                                                 va_list args,
> > +                                                 bool move_to_root)
> >   {
> >       DECLARE_COMPLETION_ONSTACK(done);
> >       struct task_struct *task;
> > @@ -516,6 +558,7 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
> >       create->threadfn = threadfn;
> >       create->data = data;
> >       create->node = node;
> > +     create->move_to_root = move_to_root;
> >       create->done = &done;
> >       create->full_name = kvasprintf(GFP_KERNEL, namefmt, args);
> >       if (!create->full_name) {
> > @@ -585,14 +628,40 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
> >       va_list args;
> >
> >       va_start(args, namefmt);
> > -     task = __kthread_create_on_node(threadfn, data, node, namefmt, args);
> > +     task = __kthread_create_on_node(threadfn, data, node, namefmt, args, false);
> >       va_end(args);
> >
> >       return task;
> >   }
> >   EXPORT_SYMBOL(kthread_create_on_node);
> >
> > -static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, unsigned int state)
> > +/*
> > + * Move the newly created kthread to root cpuset if it is not already there.
> > + * This happens if kthreadd is moved out of root cpuset by user. Otherwise same
> > + * as the regular version.
> > + */
> > +struct task_struct *kthread_create_on_node_root_cpuset(
> > +                                        int (*threadfn)(void *data),
> > +                                        void *data, int node,
> > +                                        const char namefmt[],
> > +                                        ...)
> > +
> > +{
> > +     struct task_struct *task;
> > +     va_list args;
> > +
> > +     va_start(args, namefmt);
> > +     task = __kthread_create_on_node(threadfn, data, node, namefmt, args, true);
> > +     va_end(args);
> > +
> > +     return task;
> > +}
> > +EXPORT_SYMBOL(kthread_create_on_node_root_cpuset);
> > +
> > +
> > +static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask,
> > +  unsigned int state, bool no_setaffinity)
> > +
> >   {
> >       unsigned long flags;
> >
> > @@ -604,22 +673,28 @@ static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mas
> >       /* It's safe because the task is inactive. */
> >       raw_spin_lock_irqsave(&p->pi_lock, flags);
> >       do_set_cpus_allowed(p, mask);
> > -     p->flags |= PF_NO_SETAFFINITY;
> > +     if (no_setaffinity)
> > +             p->flags |= PF_NO_SETAFFINITY;
> >       raw_spin_unlock_irqrestore(&p->pi_lock, flags);
> >   }
> >
> >   static void __kthread_bind(struct task_struct *p, unsigned int cpu, unsigned int state)
> >   {
> > -     __kthread_bind_mask(p, cpumask_of(cpu), state);
> > +     __kthread_bind_mask(p, cpumask_of(cpu), state, true);
> >   }
> >
> >   void kthread_bind_mask(struct task_struct *p, const struct cpumask *mask)
> >   {
> >       struct kthread *kthread = to_kthread(p);
> > -     __kthread_bind_mask(p, mask, TASK_UNINTERRUPTIBLE);
> > +     __kthread_bind_mask(p, mask, TASK_UNINTERRUPTIBLE, true);
> >       WARN_ON_ONCE(kthread->started);
> >   }
> >
> > +void kthread_bind_mask_cpuset(struct task_struct *p, const struct cpumask *mask)
> > +{
> > +     set_cpus_allowed_ptr(p, mask);
> > +}
> > +
> >   /**
> >    * kthread_bind - bind a just-created kthread to a cpu.
> >    * @p: thread created by kthread_create().
> > @@ -1044,7 +1119,7 @@ __kthread_create_worker_on_node(unsigned int flags, int node,
> >       kthread_init_worker(worker);
> >
> >       task = __kthread_create_on_node(kthread_worker_fn, worker,
> > -                                     node, namefmt, args);
> > +                                     node, namefmt, args, true);
> >       if (IS_ERR(task))
> >               goto fail_task;
> >
> > diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> > index 54e7d63f7785..b604a8451ba3 100644
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -2393,7 +2393,7 @@ void migrate_enable(void)
> >       struct task_struct *p = current;
> >       struct affinity_context ac = {
> >               .new_mask  = &p->cpus_mask,
> > -             .flags     = SCA_MIGRATE_ENABLE,
> > +             .flags     = SCA_MIGRATE_ENABLE | SCA_NO_CPUSET,
> >       };
> >
> >   #ifdef CONFIG_DEBUG_PREEMPT
> > @@ -3153,7 +3153,7 @@ static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
> >    * task must not exit() & deallocate itself prematurely. The
> >    * call is not atomic; no spinlocks may be held.
> >    */
> > -int __set_cpus_allowed_ptr(struct task_struct *p, struct affinity_context *ctx)
> > +static int do_set_cpus_allowed_ptr(struct task_struct *p, struct affinity_context *ctx)
> >   {
> >       struct rq_flags rf;
> >       struct rq *rq;
> > @@ -3171,6 +3171,79 @@ int __set_cpus_allowed_ptr(struct task_struct *p, struct affinity_context *ctx)
> >       return __set_cpus_allowed_ptr_locked(p, ctx, rq, &rf);
> >   }
> >
> > +int __set_cpus_allowed_ptr(struct task_struct *p,
> > +                               struct affinity_context *ctx)
> The __set_cpus_allowed_ptr() function is almost the same as
> __sched_setaffinity(). Please break the moving and renaming parts out
> into a separate patch to make it easier to review.
> > +{
> > +     int retval;
> > +     cpumask_var_t cpus_allowed, new_mask;
> > +
> > +     /*
> > +      * Don't restrict the thread to cpuset if explicitly specified or if locked.
> > +      */
> > +     if ((ctx->flags & SCA_NO_CPUSET) || (p->flags & PF_NO_SETAFFINITY))
> > +             return do_set_cpus_allowed_ptr(p, ctx);
>
> Why would you allow a PF_NO_SETAFFINITY task to change its affinity?
> What exactly is the purpose of the SCA_NO_CPUSET flag?

A PF_NO_SETAFFINITY task still needs to have its affinity changed once after
creation - kthread_create doesn't take an affinity parameter, so we have the
kthread_create -> kthread_bind -> wake up sequence for these tasks.

SCA_NO_CPUSET means: skip the cpuset-based restriction and keep the behavior
from before this patch, i.e. a passthrough mode for __set_cpus_allowed_ptr.

> > +
> > +     if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
> > +             WARN_ONCE(!(ctx->flags & SCA_USER),
> > +               "Unable to restrict kernel thread to cpuset due to low memory");
> > +             return -ENOMEM;
> > +     }
> > +
> > +     if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
> > +             WARN_ONCE(!(ctx->flags & SCA_USER),
> > +               "Unable to restrict kernel thread to cpuset due to low memory");
> > +             retval = -ENOMEM;
> > +             goto out_free_cpus_allowed;
> > +     }
> > +
> > +     cpuset_cpus_allowed(p, cpus_allowed);
> > +     cpumask_and(new_mask, ctx->new_mask, cpus_allowed);
> > +
> > +     ctx->new_mask = new_mask;
> > +     ctx->flags |= SCA_CHECK;
> > +
> > +     retval = dl_task_check_affinity(p, new_mask);
> > +     if (retval)
> > +             goto out_free_new_mask;
> > +
> > +     retval = do_set_cpus_allowed_ptr(p, ctx);
> > +     if (retval)
> > +             goto out_free_new_mask;
> > +
> > +     cpuset_cpus_allowed(p, cpus_allowed);
> > +     if (!cpumask_subset(new_mask, cpus_allowed)) {
> > +             /*
> > +              * We must have raced with a concurrent cpuset update.
> > +              * Just reset the cpumask to the cpuset's cpus_allowed.
> > +              */
> > +             cpumask_copy(new_mask, cpus_allowed);
> > +
> > +             /*
> > +              * If SCA_USER is set, a 2nd call to __set_cpus_allowed_ptr()
> > +              * will restore the previous user_cpus_ptr value.
> > +              *
> > +              * In the unlikely event a previous user_cpus_ptr exists,
> > +              * we need to further restrict the mask to what is allowed
> > +              * by that old user_cpus_ptr.
> > +              */
> > +             if (unlikely((ctx->flags & SCA_USER) && ctx->user_mask)) {
> > +                     bool empty = !cpumask_and(new_mask, new_mask,
> > +                                               ctx->user_mask);
> > +
> > +                     if (empty)
> > +                             cpumask_copy(new_mask, cpus_allowed);
> > +             }
> > +             __set_cpus_allowed_ptr(p, ctx);
> > +             retval = -EINVAL;
> > +     }
> > +
> > +out_free_new_mask:
> > +     free_cpumask_var(new_mask);
> > +out_free_cpus_allowed:
> > +     free_cpumask_var(cpus_allowed);
> > +     return retval;
> > +}
> > +
> >   int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
> >   {
> >       struct affinity_context ac = {
> > @@ -3182,6 +3255,17 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
> >   }
> >   EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
> >
> > +int set_cpus_allowed_ptr_flags(struct task_struct *p, const struct cpumask *new_mask, u32 flags)
> > +{
> > +     struct affinity_context ac = {
> > +             .new_mask  = new_mask,
> > +             .flags     = flags,
> > +     };
> > +
> > +     return __set_cpus_allowed_ptr(p, &ac);
> > +}
> > +EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr_flags);
> > +
> >   /*
> >    * Change a given task's CPU affinity to the intersection of its current
> >    * affinity mask and @subset_mask, writing the resulting mask to @new_mask.
> > @@ -3283,15 +3367,15 @@ void relax_compatible_cpus_allowed_ptr(struct task_struct *p)
> >   {
> >       struct affinity_context ac = {
> >               .new_mask  = task_user_cpus(p),
> > -             .flags     = 0,
> > +             .flags     = SCA_NO_CPUSET,
> >       };
> >       int ret;
> >
> >       /*
> > -      * Try to restore the old affinity mask with __sched_setaffinity().
> > +      * Try to restore the old affinity mask with __set_cpus_allowed_ptr().
> >        * Cpuset masking will be done there too.
> >        */
> > -     ret = __sched_setaffinity(p, &ac);
> > +     ret = __set_cpus_allowed_ptr(p, &ac);
> >       WARN_ON_ONCE(ret);
> >   }
> >
> > @@ -7292,6 +7376,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
> >   }
> >   #endif
> >
> > +
> >   #if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC)
> >   int __sched __cond_resched(void)
> >   {
> > diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> > index 91bea8d0a90b..9833432c9a75 100644
> > --- a/kernel/sched/sched.h
> > +++ b/kernel/sched/sched.h
> > @@ -2576,11 +2576,6 @@ static inline bool sched_fair_runnable(struct rq *rq)
> >   extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
> >   extern struct task_struct *pick_task_idle(struct rq *rq);
> >
> > -#define SCA_CHECK            0x01
> > -#define SCA_MIGRATE_DISABLE  0x02
> > -#define SCA_MIGRATE_ENABLE   0x04
> > -#define SCA_USER             0x08
> > -
> >   #ifdef CONFIG_SMP
> >
> >   extern void update_group_capacity(struct sched_domain *sd, int cpu);
> > @@ -3939,7 +3934,6 @@ static inline int rt_effective_prio(struct task_struct *p, int prio)
> >   #endif /* !CONFIG_RT_MUTEXES */
> >
> >   extern int __sched_setscheduler(struct task_struct *p, const struct sched_attr *attr, bool user, bool pi);
> > -extern int __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx);
> >   extern const struct sched_class *__setscheduler_class(int policy, int prio);
> >   extern void set_load_weight(struct task_struct *p, bool update_load);
> >   extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags);
> > diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
> > index 547c1f05b667..6528153c1297 100644
> > --- a/kernel/sched/syscalls.c
> > +++ b/kernel/sched/syscalls.c
> > @@ -1151,67 +1151,6 @@ int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
> >   }
> >   #endif /* CONFIG_SMP */
> >
> > -int __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx)
> > -{
> > -     int retval;
> > -     cpumask_var_t cpus_allowed, new_mask;
> > -
> > -     if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL))
> > -             return -ENOMEM;
> > -
> > -     if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
> > -             retval = -ENOMEM;
> > -             goto out_free_cpus_allowed;
> > -     }
> > -
> > -     cpuset_cpus_allowed(p, cpus_allowed);
> > -     cpumask_and(new_mask, ctx->new_mask, cpus_allowed);
> > -
> > -     ctx->new_mask = new_mask;
> > -     ctx->flags |= SCA_CHECK;
> > -
> > -     retval = dl_task_check_affinity(p, new_mask);
> > -     if (retval)
> > -             goto out_free_new_mask;
> > -
> > -     retval = __set_cpus_allowed_ptr(p, ctx);
> > -     if (retval)
> > -             goto out_free_new_mask;
> > -
> > -     cpuset_cpus_allowed(p, cpus_allowed);
> > -     if (!cpumask_subset(new_mask, cpus_allowed)) {
> > -             /*
> > -              * We must have raced with a concurrent cpuset update.
> > -              * Just reset the cpumask to the cpuset's cpus_allowed.
> > -              */
> > -             cpumask_copy(new_mask, cpus_allowed);
> > -
> > -             /*
> > -              * If SCA_USER is set, a 2nd call to __set_cpus_allowed_ptr()
> > -              * will restore the previous user_cpus_ptr value.
> > -              *
> > -              * In the unlikely event a previous user_cpus_ptr exists,
> > -              * we need to further restrict the mask to what is allowed
> > -              * by that old user_cpus_ptr.
> > -              */
> > -             if (unlikely((ctx->flags & SCA_USER) && ctx->user_mask)) {
> > -                     bool empty = !cpumask_and(new_mask, new_mask,
> > -                                               ctx->user_mask);
> > -
> > -                     if (empty)
> > -                             cpumask_copy(new_mask, cpus_allowed);
> > -             }
> > -             __set_cpus_allowed_ptr(p, ctx);
> > -             retval = -EINVAL;
> > -     }
> > -
> > -out_free_new_mask:
> > -     free_cpumask_var(new_mask);
> > -out_free_cpus_allowed:
> > -     free_cpumask_var(cpus_allowed);
> > -     return retval;
> > -}
> > -
> >   long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
> >   {
> >       struct affinity_context ac;
> > @@ -1252,7 +1191,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
> >               .flags     = SCA_USER,
> >       };
> >
> > -     retval = __sched_setaffinity(p, &ac);
> > +     retval = __set_cpus_allowed_ptr(p, &ac);
> >       kfree(ac.user_mask);
> >
> >       return retval;
> > diff --git a/kernel/workqueue.c b/kernel/workqueue.c
> > index f9ef467020cf..d51c0716674e 100644
> > --- a/kernel/workqueue.c
> > +++ b/kernel/workqueue.c
> > @@ -2813,7 +2813,10 @@ static struct worker *create_worker(struct worker_pool *pool)
> >               }
> >
> >               set_user_nice(worker->task, pool->attrs->nice);
> > -             kthread_bind_mask(worker->task, pool_allowed_cpus(pool));
> > +             if (!pool || (!worker->rescue_wq && pool->cpu >= 0))
> > +                     kthread_bind_mask(worker->task, pool_allowed_cpus(pool));
> > +             else
> > +                     kthread_bind_mask_cpuset(worker->task, pool_allowed_cpus(pool));
> >       }
> >
> >       /* successful, attach the worker to the pool */
> > @@ -5587,7 +5590,7 @@ static int init_rescuer(struct workqueue_struct *wq)
> >       if (wq->flags & WQ_UNBOUND)
> >               kthread_bind_mask(rescuer->task, unbound_effective_cpumask(wq));
> >       else
> > -             kthread_bind_mask(rescuer->task, cpu_possible_mask);
> > +             kthread_bind_mask_cpuset(rescuer->task, cpu_possible_mask);
> >       wake_up_process(rescuer->task);
> >
> >       return 0;
> >
>