[PATCH v2] cgroup: avoid css_set_lock in cgroup_css_set_fork()
Posted by Mateusz Guzik 2 weeks, 2 days ago
In the stock kernel the css_set_lock is taken three times during the
thread life cycle, turning it into the primary bottleneck in fork-heavy
workloads.

The acquire in preparation for clone can be avoided with a sequence
counter, which in turn pushes the lock down.

This accounts for only a 6% speedup when creating threads in parallel
on 20 cores, as most of the contention shifts to pidmap_lock.

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
---

v2:
- change comment about clone_seq
- raw_write_seqcount* -> write_seqcount
- just loop on failed seq check
- don't bump it on task exit

 kernel/cgroup/cgroup-internal.h | 11 +++++--
 kernel/cgroup/cgroup.c          | 54 +++++++++++++++++++++++++--------
 2 files changed, 49 insertions(+), 16 deletions(-)

diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 22051b4f1ccb..04a3aadcbc7f 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -194,6 +194,9 @@ static inline bool notify_on_release(const struct cgroup *cgrp)
 	return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
 }
 
+/*
+ * refcounted get/put for css_set objects
+ */
 void put_css_set_locked(struct css_set *cset);
 
 static inline void put_css_set(struct css_set *cset)
@@ -213,14 +216,16 @@ static inline void put_css_set(struct css_set *cset)
 	spin_unlock_irqrestore(&css_set_lock, flags);
 }
 
-/*
- * refcounted get/put for css_set objects
- */
 static inline void get_css_set(struct css_set *cset)
 {
 	refcount_inc(&cset->refcount);
 }
 
+static inline bool get_css_set_not_zero(struct css_set *cset)
+{
+	return refcount_inc_not_zero(&cset->refcount);
+}
+
 bool cgroup_ssid_enabled(int ssid);
 bool cgroup_on_dfl(const struct cgroup *cgrp);
 
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 94788bd1fdf0..0053582b9b56 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -87,7 +87,14 @@
  * cgroup.h can use them for lockdep annotations.
  */
 DEFINE_MUTEX(cgroup_mutex);
-DEFINE_SPINLOCK(css_set_lock);
+__cacheline_aligned DEFINE_SPINLOCK(css_set_lock);
+
+/*
+ * css_set_for_clone_seq synchronizes access to task_struct::cgroups
+ * and cgroup::kill_seq used on the clone path
+ */
+static __cacheline_aligned seqcount_spinlock_t css_set_for_clone_seq =
+	SEQCNT_SPINLOCK_ZERO(css_set_for_clone_seq, &css_set_lock);
 
 #if (defined CONFIG_PROVE_RCU || defined CONFIG_LOCKDEP)
 EXPORT_SYMBOL_GPL(cgroup_mutex);
@@ -907,6 +914,7 @@ static void css_set_skip_task_iters(struct css_set *cset,
  * @from_cset: css_set @task currently belongs to (may be NULL)
  * @to_cset: new css_set @task is being moved to (may be NULL)
  * @use_mg_tasks: move to @to_cset->mg_tasks instead of ->tasks
+ * @skip_clone_seq: don't bump css_set_for_clone_seq
  *
  * Move @task from @from_cset to @to_cset.  If @task didn't belong to any
  * css_set, @from_cset can be NULL.  If @task is being disassociated
@@ -918,13 +926,16 @@ static void css_set_skip_task_iters(struct css_set *cset,
  */
 static void css_set_move_task(struct task_struct *task,
 			      struct css_set *from_cset, struct css_set *to_cset,
-			      bool use_mg_tasks)
+			      bool use_mg_tasks, bool skip_clone_seq)
 {
 	lockdep_assert_held(&css_set_lock);
 
 	if (to_cset && !css_set_populated(to_cset))
 		css_set_update_populated(to_cset, true);
 
+	if (!skip_clone_seq)
+		write_seqcount_begin(&css_set_for_clone_seq);
+
 	if (from_cset) {
 		WARN_ON_ONCE(list_empty(&task->cg_list));
 
@@ -949,6 +960,9 @@ static void css_set_move_task(struct task_struct *task,
 		list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks :
 							     &to_cset->tasks);
 	}
+
+	if (!skip_clone_seq)
+		write_seqcount_end(&css_set_for_clone_seq);
 }
 
 /*
@@ -2723,7 +2737,7 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
 
 			get_css_set(to_cset);
 			to_cset->nr_tasks++;
-			css_set_move_task(task, from_cset, to_cset, true);
+			css_set_move_task(task, from_cset, to_cset, true, false);
 			from_cset->nr_tasks--;
 			/*
 			 * If the source or destination cgroup is frozen,
@@ -4183,7 +4197,9 @@ static void __cgroup_kill(struct cgroup *cgrp)
 	lockdep_assert_held(&cgroup_mutex);
 
 	spin_lock_irq(&css_set_lock);
+	write_seqcount_begin(&css_set_for_clone_seq);
 	cgrp->kill_seq++;
+	write_seqcount_end(&css_set_for_clone_seq);
 	spin_unlock_irq(&css_set_lock);
 
 	css_task_iter_start(&cgrp->self, CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED, &it);
@@ -6696,14 +6712,26 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
 
 	cgroup_threadgroup_change_begin(current);
 
-	spin_lock_irq(&css_set_lock);
-	cset = task_css_set(current);
-	get_css_set(cset);
-	if (kargs->cgrp)
-		kargs->kill_seq = kargs->cgrp->kill_seq;
-	else
-		kargs->kill_seq = cset->dfl_cgrp->kill_seq;
-	spin_unlock_irq(&css_set_lock);
+	for (;;) {
+		unsigned seq = raw_read_seqcount_begin(&css_set_for_clone_seq);
+		bool got_ref = false;
+		rcu_read_lock();
+		cset = task_css_set(current);
+		if (kargs->cgrp)
+			kargs->kill_seq = kargs->cgrp->kill_seq;
+		else
+			kargs->kill_seq = cset->dfl_cgrp->kill_seq;
+		if (get_css_set_not_zero(cset))
+			got_ref = true;
+		rcu_read_unlock();
+		if (unlikely(!got_ref || read_seqcount_retry(&css_set_for_clone_seq, seq))) {
+			if (got_ref)
+				put_css_set(cset);
+			cpu_relax();
+			continue;
+		}
+		break;
+	}
 
 	if (!(kargs->flags & CLONE_INTO_CGROUP)) {
 		kargs->cset = cset;
@@ -6907,7 +6935,7 @@ void cgroup_post_fork(struct task_struct *child,
 
 		WARN_ON_ONCE(!list_empty(&child->cg_list));
 		cset->nr_tasks++;
-		css_set_move_task(child, NULL, cset, false);
+		css_set_move_task(child, NULL, cset, false, true);
 	} else {
 		put_css_set(cset);
 		cset = NULL;
@@ -6995,7 +7023,7 @@ static void do_cgroup_task_dead(struct task_struct *tsk)
 
 	WARN_ON_ONCE(list_empty(&tsk->cg_list));
 	cset = task_css_set(tsk);
-	css_set_move_task(tsk, cset, NULL, false);
+	css_set_move_task(tsk, cset, NULL, false, true);
 	cset->nr_tasks--;
 	/* matches the signal->live check in css_task_iter_advance() */
 	if (thread_group_leader(tsk) && atomic_read(&tsk->signal->live))
-- 
2.48.1
Re: [PATCH v2] cgroup: avoid css_set_lock in cgroup_css_set_fork()
Posted by Mateusz Guzik 1 week, 3 days ago
ping? I need cgroups out of the way for further scalability work in fork + exit

Re: [PATCH v2] cgroup: avoid css_set_lock in cgroup_css_set_fork()
Posted by Michal Koutný 1 week, 3 days ago
On Tue, Jan 27, 2026 at 03:30:14PM +0100, Mateusz Guzik <mjguzik@gmail.com> wrote:
> ping? I need cgroups out of the way for further scalability work in fork+ exit

I got stuck on the following:
- possible implementation with (simpler?) rwlock,
- effect of css_set_lock in cgroup_post_fork().

I want to try some measurements of the latter since I assume that
limits the effect of the elision in cgroup_css_set_fork(), doesn't it?
(IIUC, you'd see it again if you reduced the pidmap_lock contention.)

Regards,
Michal
Re: [PATCH v2] cgroup: avoid css_set_lock in cgroup_css_set_fork()
Posted by Mateusz Guzik 1 week, 3 days ago
On Tue, Jan 27, 2026 at 6:27 PM Michal Koutný <mkoutny@suse.com> wrote:
>
> On Tue, Jan 27, 2026 at 03:30:14PM +0100, Mateusz Guzik <mjguzik@gmail.com> wrote:
> > ping? I need cgroups out of the way for further scalability work in fork+ exit
>
> I got stuck on following:
> - possible implementation with (simpler?) rwlock,
> - effect of css_set_lock in cgroup_post_fork().
>
> I want to try some measurements of the latter since I assume that
> limits the effect of the elision in cgroup_css_set_fork(), doesn't it?
> (IIUC, you'd see it again if you reduced the pidmap_lock contention.)
>

Not sure what you mean here.

If what you mean is merely converting css_set_lock into a rwlock and
read-locking in the spot handled with the seqcount in my patch, that's
a no-go -- frequent reader vs writer transitions kill performance in
their own right and you still have 3 contention spots, i.e., this will
remain the primary bottleneck.

To reiterate, in the kernel as currently found in -next, the top of
the profile is still the pidmap lock.

There is a patch to remove most of the overhead under the lock:
https://lore.kernel.org/linux-fsdevel/20260120-work-pidfs-rhashtable-v2-1-d593c4d0f576@kernel.org/

It may need some tidy ups but for the purpose of this discussion we
can pretend it landed.

With that in place top of the profile is the css set lock.

With *this* patch in place we are back to pidmap, which is then
followed by tasklist.

clone + exit codepaths are globally serialized as follows:
- pidmap -- once per side
- tasklist -- once on clone, twice on exit
- cgroups -- twice on clone, once on exit
- some lock for random harvesting, can probably just go

In principle with enough effort(tm) one can introduce finer-grained
locking for all of these, but I suspect it is not warranted and I'm
not going to do it, especially so for the tasklist lock.

So I very much expect the clone + exit pair will remain globally
serialized; it's a question of the nature of said serialization.

I think a sensible goal is serialization at most once per side (if
possible) and even then sanitized hold time.

The tasklist thing may be too problematic to only take twice, but even
then I can trivially reduce the hold time. If 3 spots have to remain,
it will be the new top. If I work it down to two, who knows.

So yes, css very much can be considered a problem here.

This comment:
> - effect of css_set_lock in cgroup_post_fork().

... I don't get at all.

Stability of cgroup placement aside, to my reading the lock is needed
in part to serialize addition of the task to the cgroup list. No
matter what, this will have to be serialized both ways with the same
thing.

Perhaps said stability can be assured in other ways and the list can
be decomposed, but that's some complexity which is not warranted.
Re: [PATCH v2] cgroup: avoid css_set_lock in cgroup_css_set_fork()
Posted by Michal Koutný 1 week, 1 day ago
On Tue, Jan 27, 2026 at 07:18:44PM +0100, Mateusz Guzik <mjguzik@gmail.com> wrote:
> Not sure what you mean here.

Let me add a patch for illustration.
Does that clarify? (How would that change your watched metric?)

...
> It may need some tidy ups but for the purpose of this discussion we
> can pretend it landed.

That I follow.

> clone + exit codepaths are globally serialized as follows:
> - pidmap -- once per side
> - tasklist -- once on clone, twice on exit
> - cgroups -- twice on clone, once on exit
> - some lock for random harvesting, can probably just go
...
> So ye, css very much can be considered a problem here.

Acknowledged.

> 
> This comment:
> > - effect of css_set_lock in cgroup_post_fork().
> 
> ... I don't get whatsoever.

I meant that css_set_lock is taken a 2nd time in cgroup_post_fork()
(and it remains there even after this rework).

And I'm wondering whether the removal only in cgroup_css_set_fork()
improves parallelism because, before the patch, the tasks are queued on
the first css_set_lock and serialized through the first critical
section, so by the time they arrive at the second critical section in
cgroup_post_fork() their arrival rate is already reduced. Hence the
2nd pass through the critical section should be less contended (w/out
waiting).
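
A userspace approximation of that measurement could look like the
sketch below (hypothetical, pthreads; lock_a/lock_b loosely stand in
for the two css_set_lock acquisitions on the clone path, the constants
are arbitrary). If the queueing effect is real, the accumulated wait on
lock_b should stay near zero even when lock_a is heavily contended:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <time.h>

#define NTHREADS 20
#define ITERS    100000

static pthread_mutex_t lock_a = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t lock_b = PTHREAD_MUTEX_INITIALIZER;
static _Atomic long long wait_a_ns, wait_b_ns;

static long long now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec;
}

static void timed_section(pthread_mutex_t *lock, _Atomic long long *wait)
{
	long long t = now_ns();

	pthread_mutex_lock(lock);
	atomic_fetch_add(wait, now_ns() - t);
	for (volatile int i = 0; i < 100; i++)	/* work under the lock */
		;
	pthread_mutex_unlock(lock);
}

static void *worker(void *arg)
{
	(void)arg;
	for (int i = 0; i < ITERS; i++) {
		timed_section(&lock_a, &wait_a_ns);	/* ~cgroup_css_set_fork() */
		timed_section(&lock_b, &wait_b_ns);	/* ~cgroup_post_fork() */
	}
	return NULL;
}

int main(void)
{
	pthread_t tids[NTHREADS];

	for (int i = 0; i < NTHREADS; i++)
		pthread_create(&tids[i], NULL, worker, NULL);
	for (int i = 0; i < NTHREADS; i++)
		pthread_join(tids[i], NULL);

	printf("wait on lock_a: %lld ms, lock_b: %lld ms\n",
	       wait_a_ns / 1000000, wait_b_ns / 1000000);
	return 0;
}

Of course the kernel-side picture has more going on (IRQ disabling, the
other css_set_lock users), so this only tests the queueing intuition,
not the actual fork path.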

I understand it's good to reduce the overall hold time of (every
mentioned) lock but I'm unsure how much it helps to eliminate one of
the two css_set_lock passes on the clone path.

> 
> Stability of cgroup placement aside, to my reading the lock is needed
> in part to serialize addition of the task to the cgroup list. No
> matter what this will have to be serialized both ways with the same
> thing.

Yes, the modification of css_set->tasks list still needs css_set_lock in
post fork.

> 
> Perhaps said stability can be assured in other ways and the list can
> be decomposed, but that's some complexity which is not warranted.

The decomposition is not obvious to me :-/


--- 8< ---
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index bc892e3b37eea..a176fd60ba08f 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -413,11 +413,13 @@ static inline void cgroup_unlock(void)
  * as locks used during the cgroup_subsys::attach() methods.
  */
 #ifdef CONFIG_PROVE_RCU
+extern rwlock_t css_set_clone_lock;
 #define task_css_set_check(task, __c)					\
 	rcu_dereference_check((task)->cgroups,				\
 		rcu_read_lock_sched_held() ||				\
 		lockdep_is_held(&cgroup_mutex) ||			\
 		lockdep_is_held(&css_set_lock) ||			\
+		lockdep_is_held(&css_set_clone_lock) ||			\
 		((task)->flags & PF_EXITING) || (__c))
 #else
 #define task_css_set_check(task, __c)					\
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 5f0d33b049102..4e28e922e5668 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -83,15 +83,21 @@
  * css_set_lock protects task->cgroups pointer, the list of css_set
  * objects, and the chain of tasks off each css_set.
  *
+ * css_set_clone_lock synchronizes access to task->cgroups and cgroup->kill_seq
+ * instead of css_set_lock on clone path
+ *
  * These locks are exported if CONFIG_PROVE_RCU so that accessors in
  * cgroup.h can use them for lockdep annotations.
  */
 DEFINE_MUTEX(cgroup_mutex);
-DEFINE_SPINLOCK(css_set_lock);
+__cacheline_aligned DEFINE_SPINLOCK(css_set_lock);
+__cacheline_aligned DEFINE_RWLOCK(css_set_clone_lock);
+
 
 #if (defined CONFIG_PROVE_RCU || defined CONFIG_LOCKDEP)
 EXPORT_SYMBOL_GPL(cgroup_mutex);
 EXPORT_SYMBOL_GPL(css_set_lock);
+EXPORT_SYMBOL_GPL(css_set_clone_lock);
 #endif
 
 struct blocking_notifier_head cgroup_lifetime_notifier =
@@ -901,6 +907,10 @@ static void css_set_skip_task_iters(struct css_set *cset,
 		css_task_iter_skip(it, task);
 }
 
+enum css_set_move_flags {
+	CSET_MOVE_USE_MG_TASKS	= (1 << 0),
+	CSET_MOVE_SKIP_LOCK	= (1 << 1),
+};
 /**
  * css_set_move_task - move a task from one css_set to another
  * @task: task being moved
@@ -918,13 +928,23 @@ static void css_set_skip_task_iters(struct css_set *cset,
  */
 static void css_set_move_task(struct task_struct *task,
 			      struct css_set *from_cset, struct css_set *to_cset,
-			      bool use_mg_tasks)
+			      enum css_set_move_flags flags)
 {
+	/*
+	 * Self task cannot move and clone concurrently and disassociation
+	 * doesn't modify task->cgroups that's relevant for clone
+	 */
+	bool skip_clone_lock = task == current || to_cset == NULL;
+	skip_clone_lock |= flags & CSET_MOVE_SKIP_LOCK;
+
 	lockdep_assert_held(&css_set_lock);
 
 	if (to_cset && !css_set_populated(to_cset))
 		css_set_update_populated(to_cset, true);
 
+	if (!skip_clone_lock)
+		write_lock(&css_set_clone_lock);
+
 	if (from_cset) {
 		WARN_ON_ONCE(list_empty(&task->cg_list));
 
@@ -946,9 +966,13 @@ static void css_set_move_task(struct task_struct *task,
 		WARN_ON_ONCE(task->flags & PF_EXITING);
 
 		cgroup_move_task(task, to_cset);
-		list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks :
-							     &to_cset->tasks);
+		list_add_tail(&task->cg_list,
+			      (flags & CSET_MOVE_USE_MG_TASKS) ? &to_cset->mg_tasks :
+								 &to_cset->tasks);
 	}
+
+	if (!skip_clone_lock)
+		write_unlock(&css_set_clone_lock);
 }
 
 /*
@@ -2723,7 +2747,7 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
 
 			get_css_set(to_cset);
 			to_cset->nr_tasks++;
-			css_set_move_task(task, from_cset, to_cset, true);
+			css_set_move_task(task, from_cset, to_cset, CSET_MOVE_USE_MG_TASKS);
 			from_cset->nr_tasks--;
 			/*
 			 * If the source or destination cgroup is frozen,
@@ -4183,7 +4207,9 @@ static void __cgroup_kill(struct cgroup *cgrp)
 	lockdep_assert_held(&cgroup_mutex);
 
 	spin_lock_irq(&css_set_lock);
+	write_lock(&css_set_clone_lock);
 	cgrp->kill_seq++;
+	write_unlock(&css_set_clone_lock);
 	spin_unlock_irq(&css_set_lock);
 
 	css_task_iter_start(&cgrp->self, CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED, &it);
@@ -6696,14 +6722,15 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
 
 	cgroup_threadgroup_change_begin(current);
 
-	spin_lock_irq(&css_set_lock);
+	read_lock(&css_set_clone_lock);
+
 	cset = task_css_set(current);
 	get_css_set(cset);
 	if (kargs->cgrp)
 		kargs->kill_seq = kargs->cgrp->kill_seq;
 	else
 		kargs->kill_seq = cset->dfl_cgrp->kill_seq;
-	spin_unlock_irq(&css_set_lock);
+	read_unlock(&css_set_clone_lock);
 
 	if (!(kargs->flags & CLONE_INTO_CGROUP)) {
 		kargs->cset = cset;
@@ -6893,6 +6920,7 @@ void cgroup_post_fork(struct task_struct *child,
 	cset = kargs->cset;
 	kargs->cset = NULL;
 
+	// XXX could this be read_lock(css_set_clone_lock) ?
 	spin_lock_irq(&css_set_lock);
 
 	/* init tasks are special, only link regular threads */
@@ -6907,7 +6935,8 @@ void cgroup_post_fork(struct task_struct *child,
 
 		WARN_ON_ONCE(!list_empty(&child->cg_list));
 		cset->nr_tasks++;
-		css_set_move_task(child, NULL, cset, false);
+		/* child cannot run (another) clone, skip lock */
+		css_set_move_task(child, NULL, cset, CSET_MOVE_SKIP_LOCK);
 	} else {
 		put_css_set(cset);
 		cset = NULL;
@@ -6995,7 +7024,7 @@ static void do_cgroup_task_dead(struct task_struct *tsk)
 
 	WARN_ON_ONCE(list_empty(&tsk->cg_list));
 	cset = task_css_set(tsk);
-	css_set_move_task(tsk, cset, NULL, false);
+	css_set_move_task(tsk, cset, NULL, CSET_MOVE_SKIP_LOCK);
 	cset->nr_tasks--;
 	/* matches the signal->live check in css_task_iter_advance() */
 	if (thread_group_leader(tsk) && atomic_read(&tsk->signal->live))