[PATCH] cgroup: avoid css_set_lock in cgroup_css_set_fork()
Posted by Mateusz Guzik 2 weeks, 3 days ago
In the stock kernel, the css_set_lock is taken three times during a
thread's life cycle, turning it into the primary bottleneck in
fork-heavy workloads.

The acquire in preparation for clone can be avoided with a sequence
counter, which in turn pushes the lock down the contention profile.

This accounts for only a 6% speedup when creating threads in parallel
on 20 cores, as most of the contention shifts to pidmap_lock (from
about 740k ops/s to 790k ops/s).

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
---

I don't really care for cgroups, I merely need the thing out of the way
for fork. If someone wants to handle this differently, I'm not going to
argue as long as the bottleneck is taken care of.

On the stock kernel pidmap_lock is still the biggest problem, but
there is a patch to fix it:
https://lore.kernel.org/linux-fsdevel/CAGudoHFuhbkJ+8iA92LYPmphBboJB7sxxC2L7A8OtBXA22UXzA@mail.gmail.com/T/#m832ac70f5e8f5ea14e69ca78459578d687efdd9f

... afterwards it is cgroups, and the commit message above was written
as if that patch had already landed.

With the patch below, contention is back on pidmap_lock.

 kernel/cgroup/cgroup-internal.h | 11 ++++--
 kernel/cgroup/cgroup.c          | 60 ++++++++++++++++++++++++++-------
 2 files changed, 55 insertions(+), 16 deletions(-)

diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 22051b4f1ccb..04a3aadcbc7f 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -194,6 +194,9 @@ static inline bool notify_on_release(const struct cgroup *cgrp)
 	return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
 }
 
+/*
+ * refcounted get/put for css_set objects
+ */
 void put_css_set_locked(struct css_set *cset);
 
 static inline void put_css_set(struct css_set *cset)
@@ -213,14 +216,16 @@ static inline void put_css_set(struct css_set *cset)
 	spin_unlock_irqrestore(&css_set_lock, flags);
 }
 
-/*
- * refcounted get/put for css_set objects
- */
 static inline void get_css_set(struct css_set *cset)
 {
 	refcount_inc(&cset->refcount);
 }
 
+static inline bool get_css_set_not_zero(struct css_set *cset)
+{
+	return refcount_inc_not_zero(&cset->refcount);
+}
+
 bool cgroup_ssid_enabled(int ssid);
 bool cgroup_on_dfl(const struct cgroup *cgrp);
 
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 94788bd1fdf0..16d2a8d204e8 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -87,7 +87,12 @@
  * cgroup.h can use them for lockdep annotations.
  */
 DEFINE_MUTEX(cgroup_mutex);
-DEFINE_SPINLOCK(css_set_lock);
+__cacheline_aligned DEFINE_SPINLOCK(css_set_lock);
+/*
+ * css_set_for_clone_seq is used to allow lockless operation in cgroup_css_set_fork()
+ */
+static __cacheline_aligned seqcount_spinlock_t css_set_for_clone_seq =
+	SEQCNT_SPINLOCK_ZERO(css_set_for_clone_seq, &css_set_lock);
 
 #if (defined CONFIG_PROVE_RCU || defined CONFIG_LOCKDEP)
 EXPORT_SYMBOL_GPL(cgroup_mutex);
@@ -907,6 +912,7 @@ static void css_set_skip_task_iters(struct css_set *cset,
  * @from_cset: css_set @task currently belongs to (may be NULL)
  * @to_cset: new css_set @task is being moved to (may be NULL)
  * @use_mg_tasks: move to @to_cset->mg_tasks instead of ->tasks
+ * @is_clone: indicator whether @task is in the middle of a clone
  *
  * Move @task from @from_cset to @to_cset.  If @task didn't belong to any
  * css_set, @from_cset can be NULL.  If @task is being disassociated
@@ -918,13 +924,16 @@ static void css_set_skip_task_iters(struct css_set *cset,
  */
 static void css_set_move_task(struct task_struct *task,
 			      struct css_set *from_cset, struct css_set *to_cset,
-			      bool use_mg_tasks)
+			      bool use_mg_tasks, bool is_clone)
 {
 	lockdep_assert_held(&css_set_lock);
 
 	if (to_cset && !css_set_populated(to_cset))
 		css_set_update_populated(to_cset, true);
 
+	if (!is_clone)
+		raw_write_seqcount_begin(&css_set_for_clone_seq);
+
 	if (from_cset) {
 		WARN_ON_ONCE(list_empty(&task->cg_list));
 
@@ -949,6 +958,9 @@ static void css_set_move_task(struct task_struct *task,
 		list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks :
 							     &to_cset->tasks);
 	}
+
+	if (!is_clone)
+		raw_write_seqcount_end(&css_set_for_clone_seq);
 }
 
 /*
@@ -2723,7 +2735,7 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
 
 			get_css_set(to_cset);
 			to_cset->nr_tasks++;
-			css_set_move_task(task, from_cset, to_cset, true);
+			css_set_move_task(task, from_cset, to_cset, true, false);
 			from_cset->nr_tasks--;
 			/*
 			 * If the source or destination cgroup is frozen,
@@ -4183,7 +4195,9 @@ static void __cgroup_kill(struct cgroup *cgrp)
 	lockdep_assert_held(&cgroup_mutex);
 
 	spin_lock_irq(&css_set_lock);
+	raw_write_seqcount_begin(&css_set_for_clone_seq);
 	cgrp->kill_seq++;
+	raw_write_seqcount_end(&css_set_for_clone_seq);
 	spin_unlock_irq(&css_set_lock);
 
 	css_task_iter_start(&cgrp->self, CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED, &it);
@@ -6690,20 +6704,40 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
 	struct cgroup *dst_cgrp = NULL;
 	struct css_set *cset;
 	struct super_block *sb;
+	bool need_lock;
 
 	if (kargs->flags & CLONE_INTO_CGROUP)
 		cgroup_lock();
 
 	cgroup_threadgroup_change_begin(current);
 
-	spin_lock_irq(&css_set_lock);
-	cset = task_css_set(current);
-	get_css_set(cset);
-	if (kargs->cgrp)
-		kargs->kill_seq = kargs->cgrp->kill_seq;
-	else
-		kargs->kill_seq = cset->dfl_cgrp->kill_seq;
-	spin_unlock_irq(&css_set_lock);
+	need_lock = true;
+	scoped_guard(rcu) {
+		unsigned seq = raw_read_seqcount_begin(&css_set_for_clone_seq);
+		cset = task_css_set(current);
+		if (unlikely(!cset || !get_css_set_not_zero(cset)))
+			break;
+		if (kargs->cgrp)
+			kargs->kill_seq = kargs->cgrp->kill_seq;
+		else
+			kargs->kill_seq = cset->dfl_cgrp->kill_seq;
+		if (read_seqcount_retry(&css_set_for_clone_seq, seq)) {
+			put_css_set(cset);
+			break;
+		}
+		need_lock = false;
+	}
+
+	if (unlikely(need_lock)) {
+		spin_lock_irq(&css_set_lock);
+		cset = task_css_set(current);
+		get_css_set(cset);
+		if (kargs->cgrp)
+			kargs->kill_seq = kargs->cgrp->kill_seq;
+		else
+			kargs->kill_seq = cset->dfl_cgrp->kill_seq;
+		spin_unlock_irq(&css_set_lock);
+	}
 
 	if (!(kargs->flags & CLONE_INTO_CGROUP)) {
 		kargs->cset = cset;
@@ -6907,7 +6941,7 @@ void cgroup_post_fork(struct task_struct *child,
 
 		WARN_ON_ONCE(!list_empty(&child->cg_list));
 		cset->nr_tasks++;
-		css_set_move_task(child, NULL, cset, false);
+		css_set_move_task(child, NULL, cset, false, true);
 	} else {
 		put_css_set(cset);
 		cset = NULL;
@@ -6995,7 +7029,7 @@ static void do_cgroup_task_dead(struct task_struct *tsk)
 
 	WARN_ON_ONCE(list_empty(&tsk->cg_list));
 	cset = task_css_set(tsk);
-	css_set_move_task(tsk, cset, NULL, false);
+	css_set_move_task(tsk, cset, NULL, false, false);
 	cset->nr_tasks--;
 	/* matches the signal->live check in css_task_iter_advance() */
 	if (thread_group_leader(tsk) && atomic_read(&tsk->signal->live))
-- 
2.48.1
Re: [PATCH] cgroup: avoid css_set_lock in cgroup_css_set_fork()
Posted by Michal Koutný 2 weeks, 3 days ago
Hi.

On Tue, Jan 20, 2026 at 06:08:59PM +0100, Mateusz Guzik <mjguzik@gmail.com> wrote:
> In the stock kernel, the css_set_lock is taken three times during a
> thread's life cycle, turning it into the primary bottleneck in
> fork-heavy workloads.
> 
> The acquire in preparation for clone can be avoided with a sequence
> counter, which in turn pushes the lock down the contention profile.

I think this can work in principle. Thanks for digging into that.
I'd like to clarify a few details though so that the reasoning behind
the change is complete.

> This accounts for only a 6% speedup when creating threads in parallel
> on 20 cores, as most of the contention shifts to pidmap_lock (from
> about 740k ops/s to 790k ops/s).

BTW what code/benchmark do you use to measure this?

> 
> Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
> ---
> 
> I don't really care for cgroups, I merely need the thing out of the way
> for fork. If someone wants to handle this differently, I'm not going to
> argue as long as the bottleneck is taken care of.
> 
> On the stock kernel pidmap_lock is still the biggest problem, but
> there is a patch to fix it:
> https://lore.kernel.org/linux-fsdevel/CAGudoHFuhbkJ+8iA92LYPmphBboJB7sxxC2L7A8OtBXA22UXzA@mail.gmail.com/T/#m832ac70f5e8f5ea14e69ca78459578d687efdd9f
> 
> ... afterwards it is cgroups, and the commit message above was written
> as if that patch had already landed.
> 
> With the patch below, contention is back on pidmap_lock.
> 
>  kernel/cgroup/cgroup-internal.h | 11 ++++--
>  kernel/cgroup/cgroup.c          | 60 ++++++++++++++++++++++++++-------
>  2 files changed, 55 insertions(+), 16 deletions(-)
> 
> diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
> index 22051b4f1ccb..04a3aadcbc7f 100644
> --- a/kernel/cgroup/cgroup-internal.h
> +++ b/kernel/cgroup/cgroup-internal.h
> @@ -194,6 +194,9 @@ static inline bool notify_on_release(const struct cgroup *cgrp)
>  	return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
>  }
>  
> +/*
> + * refcounted get/put for css_set objects
> + */
>  void put_css_set_locked(struct css_set *cset);
>  
>  static inline void put_css_set(struct css_set *cset)
> @@ -213,14 +216,16 @@ static inline void put_css_set(struct css_set *cset)
>  	spin_unlock_irqrestore(&css_set_lock, flags);
>  }
>  
> -/*
> - * refcounted get/put for css_set objects
> - */
>  static inline void get_css_set(struct css_set *cset)
>  {
>  	refcount_inc(&cset->refcount);
>  }
>  
> +static inline bool get_css_set_not_zero(struct css_set *cset)
> +{
> +	return refcount_inc_not_zero(&cset->refcount);
> +}
> +
>  bool cgroup_ssid_enabled(int ssid);
>  bool cgroup_on_dfl(const struct cgroup *cgrp);
>  
> diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
> index 94788bd1fdf0..16d2a8d204e8 100644
> --- a/kernel/cgroup/cgroup.c
> +++ b/kernel/cgroup/cgroup.c
> @@ -87,7 +87,12 @@
>   * cgroup.h can use them for lockdep annotations.
>   */
>  DEFINE_MUTEX(cgroup_mutex);
> -DEFINE_SPINLOCK(css_set_lock);
> +__cacheline_aligned DEFINE_SPINLOCK(css_set_lock);
> +/*
> + * css_set_for_clone_seq is used to allow lockless operation in cgroup_css_set_fork()
> + */
> +static __cacheline_aligned seqcount_spinlock_t css_set_for_clone_seq =
> +	SEQCNT_SPINLOCK_ZERO(css_set_for_clone_seq, &css_set_lock);

Maybe a better comment:
	css_set_for_clone_seq synchronizes access to task_struct::cgroup
	or cgroup::kill_seq used on clone path

BTW why the __cacheline_aligned? Have you observed cacheline contention
with that cgroup_mutex or anything else?

>  
>  #if (defined CONFIG_PROVE_RCU || defined CONFIG_LOCKDEP)
>  EXPORT_SYMBOL_GPL(cgroup_mutex);
> @@ -907,6 +912,7 @@ static void css_set_skip_task_iters(struct css_set *cset,
>   * @from_cset: css_set @task currently belongs to (may be NULL)
>   * @to_cset: new css_set @task is being moved to (may be NULL)
>   * @use_mg_tasks: move to @to_cset->mg_tasks instead of ->tasks
> > + * @is_clone: indicator whether @task is in the middle of a clone
>   *
>   * Move @task from @from_cset to @to_cset.  If @task didn't belong to any
>   * css_set, @from_cset can be NULL.  If @task is being disassociated
> @@ -918,13 +924,16 @@ static void css_set_skip_task_iters(struct css_set *cset,
>   */
>  static void css_set_move_task(struct task_struct *task,
>  			      struct css_set *from_cset, struct css_set *to_cset,
> -			      bool use_mg_tasks)
> +			      bool use_mg_tasks, bool is_clone)

I think the is_clone arg could be dropped. The harm from bumping the
write seq from the other places should be negligible. But it could be
optimized by just checking whether to_cset is non-NULL, as that is the
kind of migration that would invalidate the value read on the clone
side.


>  {
>  	lockdep_assert_held(&css_set_lock);
>  
>  	if (to_cset && !css_set_populated(to_cset))
>  		css_set_update_populated(to_cset, true);
>  
> +	if (!is_clone)
> +		raw_write_seqcount_begin(&css_set_for_clone_seq);

BTW, what is the reason to use the raw_ flavor of the seqcount
functions? (I think it's good to have lockdep covering our backs.)

> +
>  	if (from_cset) {
>  		WARN_ON_ONCE(list_empty(&task->cg_list));
>  
> @@ -949,6 +958,9 @@ static void css_set_move_task(struct task_struct *task,
>  		list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks :
>  							     &to_cset->tasks);
>  	}
> +
> +	if (!is_clone)
> +		raw_write_seqcount_end(&css_set_for_clone_seq);
>  }
>  
>  /*
> @@ -2723,7 +2735,7 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
>  
>  			get_css_set(to_cset);
>  			to_cset->nr_tasks++;
> -			css_set_move_task(task, from_cset, to_cset, true);
> +			css_set_move_task(task, from_cset, to_cset, true, false);

I'm afraid this should also do the write locking.
(To synchronize migration and forking.) But it's an alternate
formulation of the to_cset guard above.

>  			from_cset->nr_tasks--;
>  			/*
>  			 * If the source or destination cgroup is frozen,
> @@ -4183,7 +4195,9 @@ static void __cgroup_kill(struct cgroup *cgrp)
>  	lockdep_assert_held(&cgroup_mutex);
>  
>  	spin_lock_irq(&css_set_lock);
> +	raw_write_seqcount_begin(&css_set_for_clone_seq);
>  	cgrp->kill_seq++;
> +	raw_write_seqcount_end(&css_set_for_clone_seq);
>  	spin_unlock_irq(&css_set_lock);
>  
>  	css_task_iter_start(&cgrp->self, CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED, &it);
> @@ -6690,20 +6704,40 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
>  	struct cgroup *dst_cgrp = NULL;
>  	struct css_set *cset;
>  	struct super_block *sb;
> +	bool need_lock;
>  
>  	if (kargs->flags & CLONE_INTO_CGROUP)
>  		cgroup_lock();
>  
>  	cgroup_threadgroup_change_begin(current);
>  
> -	spin_lock_irq(&css_set_lock);
> -	cset = task_css_set(current);
> -	get_css_set(cset);
> -	if (kargs->cgrp)
> -		kargs->kill_seq = kargs->cgrp->kill_seq;
> -	else
> -		kargs->kill_seq = cset->dfl_cgrp->kill_seq;
> -	spin_unlock_irq(&css_set_lock);
> +	need_lock = true;
> +	scoped_guard(rcu) {
> +		unsigned seq = raw_read_seqcount_begin(&css_set_for_clone_seq);
> +		cset = task_css_set(current);
> +		if (unlikely(!cset || !get_css_set_not_zero(cset)))
> +			break;
> +		if (kargs->cgrp)
> +			kargs->kill_seq = kargs->cgrp->kill_seq;
> +		else
> +			kargs->kill_seq = cset->dfl_cgrp->kill_seq;
> +		if (read_seqcount_retry(&css_set_for_clone_seq, seq)) {
> +			put_css_set(cset);
> +			break;
> +		}
> +		need_lock = false;

I see that this construction of using read_seqcount_retry() only once
and then falling back to the spinlock is quite uncommon.
Assuming the relevant writers are properly enclosed within the
seqcount, is there a reason to use this double approach instead of
"spinning" inside the seqcount's read section? (As is usual with
seqcount reader sides.)
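
I.e. roughly (an untested sketch, keeping your fallback to the locked
path for a dying cset):

	need_lock = true;
	scoped_guard(rcu) {
		for (;;) {
			unsigned seq = read_seqcount_begin(&css_set_for_clone_seq);

			cset = task_css_set(current);
			if (unlikely(!cset || !get_css_set_not_zero(cset)))
				break;
			if (kargs->cgrp)
				kargs->kill_seq = kargs->cgrp->kill_seq;
			else
				kargs->kill_seq = cset->dfl_cgrp->kill_seq;
			if (!read_seqcount_retry(&css_set_for_clone_seq, seq)) {
				need_lock = false;
				break;
			}
			/* raced with a writer, drop the ref and retry */
			put_css_set(cset);
		}
	}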


> +	}
> +
> +	if (unlikely(need_lock)) {
> +		spin_lock_irq(&css_set_lock);
> +		cset = task_css_set(current);
> +		get_css_set(cset);
> +		if (kargs->cgrp)
> +			kargs->kill_seq = kargs->cgrp->kill_seq;
> +		else
> +			kargs->kill_seq = cset->dfl_cgrp->kill_seq;
> +		spin_unlock_irq(&css_set_lock);
> +	}
>  
>  	if (!(kargs->flags & CLONE_INTO_CGROUP)) {
>  		kargs->cset = cset;
> @@ -6907,7 +6941,7 @@ void cgroup_post_fork(struct task_struct *child,
>  
>  		WARN_ON_ONCE(!list_empty(&child->cg_list));
>  		cset->nr_tasks++;
> -		css_set_move_task(child, NULL, cset, false);
> +		css_set_move_task(child, NULL, cset, false, true);
>  	} else {
>  		put_css_set(cset);
>  		cset = NULL;
> @@ -6995,7 +7029,7 @@ static void do_cgroup_task_dead(struct task_struct *tsk)
>  
>  	WARN_ON_ONCE(list_empty(&tsk->cg_list));
>  	cset = task_css_set(tsk);
> -	css_set_move_task(tsk, cset, NULL, false);
> +	css_set_move_task(tsk, cset, NULL, false, false);
>  	cset->nr_tasks--;
>  	/* matches the signal->live check in css_task_iter_advance() */
>  	if (thread_group_leader(tsk) && atomic_read(&tsk->signal->live))
> -- 
> 2.48.1
> 
Re: [PATCH] cgroup: avoid css_set_lock in cgroup_css_set_fork()
Posted by Mateusz Guzik 2 weeks, 2 days ago
On Wed, Jan 21, 2026 at 12:01 PM Michal Koutný <mkoutny@suse.com> wrote:
>
> Hi.
>
> On Tue, Jan 20, 2026 at 06:08:59PM +0100, Mateusz Guzik <mjguzik@gmail.com> wrote:
> > In the stock kernel, the css_set_lock is taken three times during a
> > thread's life cycle, turning it into the primary bottleneck in
> > fork-heavy workloads.
> >
> > The acquire in preparation for clone can be avoided with a sequence
> > counter, which in turn pushes the lock down the contention profile.
>
> I think this can work in principle. Thanks for digging into that.
> I'd like to clarify a few details though so that the reasoning behind
> the change is complete.
>
> > This accounts for only a 6% speedup when creating threads in parallel
> > on 20 cores, as most of the contention shifts to pidmap_lock (from
> > about 740k ops/s to 790k ops/s).
>
> BTW what code/benchmark do you use to measure this?
>

see https://lore.kernel.org/linux-mm/20251206131955.780557-1-mjguzik@gmail.com/

> >
> > Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
> > ---
> >
> > I don't really care for cgroups, I merely need the thing out of the way
> > for fork. If someone wants to handle this differently, I'm not going to
> > argue as long as the bottleneck is taken care of.
> >
> > On the stock kernel pidmap_lock is still the biggest problem, but
> > there is a patch to fix it:
> > https://lore.kernel.org/linux-fsdevel/CAGudoHFuhbkJ+8iA92LYPmphBboJB7sxxC2L7A8OtBXA22UXzA@mail.gmail.com/T/#m832ac70f5e8f5ea14e69ca78459578d687efdd9f
> >
> > ... afterwards it is cgroups, and the commit message above was written
> > as if that patch had already landed.
> >
> > With the patch below, contention is back on pidmap_lock.
> >
> >  kernel/cgroup/cgroup-internal.h | 11 ++++--
> >  kernel/cgroup/cgroup.c          | 60 ++++++++++++++++++++++++++-------
> >  2 files changed, 55 insertions(+), 16 deletions(-)
> >
> > diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
> > index 22051b4f1ccb..04a3aadcbc7f 100644
> > --- a/kernel/cgroup/cgroup-internal.h
> > +++ b/kernel/cgroup/cgroup-internal.h
> > @@ -194,6 +194,9 @@ static inline bool notify_on_release(const struct cgroup *cgrp)
> >       return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
> >  }
> >
> > +/*
> > + * refcounted get/put for css_set objects
> > + */
> >  void put_css_set_locked(struct css_set *cset);
> >
> >  static inline void put_css_set(struct css_set *cset)
> > @@ -213,14 +216,16 @@ static inline void put_css_set(struct css_set *cset)
> >       spin_unlock_irqrestore(&css_set_lock, flags);
> >  }
> >
> > -/*
> > - * refcounted get/put for css_set objects
> > - */
> >  static inline void get_css_set(struct css_set *cset)
> >  {
> >       refcount_inc(&cset->refcount);
> >  }
> >
> > +static inline bool get_css_set_not_zero(struct css_set *cset)
> > +{
> > +     return refcount_inc_not_zero(&cset->refcount);
> > +}
> > +
> >  bool cgroup_ssid_enabled(int ssid);
> >  bool cgroup_on_dfl(const struct cgroup *cgrp);
> >
> > diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
> > index 94788bd1fdf0..16d2a8d204e8 100644
> > --- a/kernel/cgroup/cgroup.c
> > +++ b/kernel/cgroup/cgroup.c
> > @@ -87,7 +87,12 @@
> >   * cgroup.h can use them for lockdep annotations.
> >   */
> >  DEFINE_MUTEX(cgroup_mutex);
> > -DEFINE_SPINLOCK(css_set_lock);
> > +__cacheline_aligned DEFINE_SPINLOCK(css_set_lock);
> > +/*
> > + * css_set_for_clone_seq is used to allow lockless operation in cgroup_css_set_fork()
> > + */
> > +static __cacheline_aligned seqcount_spinlock_t css_set_for_clone_seq =
> > +     SEQCNT_SPINLOCK_ZERO(css_set_for_clone_seq, &css_set_lock);
>
> Maybe a better comment:
>         css_set_for_clone_seq synchronizes access to task_struct::cgroup
>         or cgroup::kill_seq used on clone path
>

will patch it in

> BTW why the __cacheline_aligned? Have you observed cacheline contention
> with that cgroup_mutex or anything else?
>

It's future-proofing to avoid false sharing from unrelated changes
down the road. I could perhaps annotate it with __read_mostly instead.

> >
> >  #if (defined CONFIG_PROVE_RCU || defined CONFIG_LOCKDEP)
> >  EXPORT_SYMBOL_GPL(cgroup_mutex);
> > @@ -907,6 +912,7 @@ static void css_set_skip_task_iters(struct css_set *cset,
> >   * @from_cset: css_set @task currently belongs to (may be NULL)
> >   * @to_cset: new css_set @task is being moved to (may be NULL)
> >   * @use_mg_tasks: move to @to_cset->mg_tasks instead of ->tasks
> > + * @is_clone: indicator whether @task is in the middle of a clone
> >   *
> >   * Move @task from @from_cset to @to_cset.  If @task didn't belong to any
> >   * css_set, @from_cset can be NULL.  If @task is being disassociated
> > @@ -918,13 +924,16 @@ static void css_set_skip_task_iters(struct css_set *cset,
> >   */
> >  static void css_set_move_task(struct task_struct *task,
> >                             struct css_set *from_cset, struct css_set *to_cset,
> > -                           bool use_mg_tasks)
> > +                           bool use_mg_tasks, bool is_clone)
>
> I think the is_clone arg could be dropped. The harm from incrementing
> write_seq from other places should be negligible. But it could be
> optimized by just looking at to_cset (not being NULL) as that's the
> migration that'd invalidate clone's value.
>

to_cset is not NULL on clone, where the task being moved can't be in
the middle of forking anyway, meaning there is a special case here
which does *not* invalidate the seq.

This made me realize that my current patch fails to skip the seq change
for the transition the other way -- clearing the cset for a dying
process is also a case where the task can't be forking, so there is no
need to invalidate the seq there either.

Anyhow, the routine is called on every clone and exit, and unconditional
seq writes there very much would be a problem as they increase cacheline
bounces with the css_set lock held.

Bottom line, the code needs to be able to spot the two special cases of
assigning a cset for the first time and clearing it for the last time.
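
For example (an untested sketch, meant to replace the is_clone argument
in css_set_move_task(); __cgroup_kill() would still bump the count on
its own for kill_seq):

	/*
	 * Only a migration with both endpoints set can invalidate a
	 * concurrent lockless read in cgroup_css_set_fork(). The first
	 * assignment on clone and the final clear on exit concern a task
	 * which cannot be forking at that point.
	 */
	bool bump_seq = from_cset && to_cset;

	if (bump_seq)
		write_seqcount_begin(&css_set_for_clone_seq);

	/* ... the existing cg_list manipulation ... */

	if (bump_seq)
		write_seqcount_end(&css_set_for_clone_seq);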

>
> >  {
> >       lockdep_assert_held(&css_set_lock);
> >
> >       if (to_cset && !css_set_populated(to_cset))
> >               css_set_update_populated(to_cset, true);
> >
> > +     if (!is_clone)
> > +             raw_write_seqcount_begin(&css_set_for_clone_seq);
>
> BTW What is the reason to use raw_ flavor of the seqcount functions?
> (I think it's good to have lockdep covering our backs.)

I mindlessly copied the usage from dcache.c, will patch it.

>
> > +
> >       if (from_cset) {
> >               WARN_ON_ONCE(list_empty(&task->cg_list));
> >
> > @@ -949,6 +958,9 @@ static void css_set_move_task(struct task_struct *task,
> >               list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks :
> >                                                            &to_cset->tasks);
> >       }
> > +
> > +     if (!is_clone)
> > +             raw_write_seqcount_end(&css_set_for_clone_seq);
> >  }
> >
> >  /*
> > @@ -2723,7 +2735,7 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
> >
> >                       get_css_set(to_cset);
> >                       to_cset->nr_tasks++;
> > -                     css_set_move_task(task, from_cset, to_cset, true);
> > +                     css_set_move_task(task, from_cset, to_cset, true, false);
>
> I'm afraid this should be also do the write locking.
> (To synchronize migration and forking.) But it's alternate formulation
> of the to_cset guard above.

I don't follow this comment whatsoever. With the patch as is, the area
is protected by the css_set lock and bumps the seq count.

>
> >                       from_cset->nr_tasks--;
> >                       /*
> >                        * If the source or destination cgroup is frozen,
> > @@ -4183,7 +4195,9 @@ static void __cgroup_kill(struct cgroup *cgrp)
> >       lockdep_assert_held(&cgroup_mutex);
> >
> >       spin_lock_irq(&css_set_lock);
> > +     raw_write_seqcount_begin(&css_set_for_clone_seq);
> >       cgrp->kill_seq++;
> > +     raw_write_seqcount_end(&css_set_for_clone_seq);
> >       spin_unlock_irq(&css_set_lock);
> >
> >       css_task_iter_start(&cgrp->self, CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED, &it);
> > @@ -6690,20 +6704,40 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
> >       struct cgroup *dst_cgrp = NULL;
> >       struct css_set *cset;
> >       struct super_block *sb;
> > +     bool need_lock;
> >
> >       if (kargs->flags & CLONE_INTO_CGROUP)
> >               cgroup_lock();
> >
> >       cgroup_threadgroup_change_begin(current);
> >
> > -     spin_lock_irq(&css_set_lock);
> > -     cset = task_css_set(current);
> > -     get_css_set(cset);
> > -     if (kargs->cgrp)
> > -             kargs->kill_seq = kargs->cgrp->kill_seq;
> > -     else
> > -             kargs->kill_seq = cset->dfl_cgrp->kill_seq;
> > -     spin_unlock_irq(&css_set_lock);
> > +     need_lock = true;
> > +     scoped_guard(rcu) {
> > +             unsigned seq = raw_read_seqcount_begin(&css_set_for_clone_seq);
> > +             cset = task_css_set(current);
> > +             if (unlikely(!cset || !get_css_set_not_zero(cset)))
> > +                     break;
> > +             if (kargs->cgrp)
> > +                     kargs->kill_seq = kargs->cgrp->kill_seq;
> > +             else
> > +                     kargs->kill_seq = cset->dfl_cgrp->kill_seq;
> > +             if (read_seqcount_retry(&css_set_for_clone_seq, seq)) {
> > +                     put_css_set(cset);
> > +                     break;
> > +             }
> > +             need_lock = false;
>
> I see that this construction of using the read_seqcount_retry() only
> once and then falling back to spinlock is quite uncommon.
> Assuming the relevant writers are properly enclosed within seqcount,
> would there be a reason to do this double approach instead of "spin"
> inside the seqcount's read section? (As usual with seqcount reader
> sides.)
>

Just retrying in a loop does not have a forward progress guarantee, and
all places which merely loop are, at least in principle, buggy on that
front.

There is a helper, read_seqbegin_or_lock(), which can be used to dedup
the code, though.
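
The dcache-style usage looks roughly like this (an untested sketch; it
assumes the seqcount is converted to a full seqlock_t, here named
css_set_clone_lock, and it elides the RCU and refcount handling):

	int seq = 0;

retry:
	read_seqbegin_or_lock(&css_set_clone_lock, &seq);
	cset = task_css_set(current);
	if (kargs->cgrp)
		kargs->kill_seq = kargs->cgrp->kill_seq;
	else
		kargs->kill_seq = cset->dfl_cgrp->kill_seq;
	if (need_seqretry(&css_set_clone_lock, seq)) {
		/* the second pass takes the lock, guaranteeing progress */
		seq = 1;
		goto retry;
	}
	done_seqretry(&css_set_clone_lock, seq);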

But I'm not going to argue; it's probably fine to just loop here.

>
> > +     }
> > +
> > +     if (unlikely(need_lock)) {
> > +             spin_lock_irq(&css_set_lock);
> > +             cset = task_css_set(current);
> > +             get_css_set(cset);
> > +             if (kargs->cgrp)
> > +                     kargs->kill_seq = kargs->cgrp->kill_seq;
> > +             else
> > +                     kargs->kill_seq = cset->dfl_cgrp->kill_seq;
> > +             spin_unlock_irq(&css_set_lock);
> > +     }
> >
> >       if (!(kargs->flags & CLONE_INTO_CGROUP)) {
> >               kargs->cset = cset;
> > @@ -6907,7 +6941,7 @@ void cgroup_post_fork(struct task_struct *child,
> >
> >               WARN_ON_ONCE(!list_empty(&child->cg_list));
> >               cset->nr_tasks++;
> > -             css_set_move_task(child, NULL, cset, false);
> > +             css_set_move_task(child, NULL, cset, false, true);
> >       } else {
> >               put_css_set(cset);
> >               cset = NULL;
> > @@ -6995,7 +7029,7 @@ static void do_cgroup_task_dead(struct task_struct *tsk)
> >
> >       WARN_ON_ONCE(list_empty(&tsk->cg_list));
> >       cset = task_css_set(tsk);
> > -     css_set_move_task(tsk, cset, NULL, false);
> > +     css_set_move_task(tsk, cset, NULL, false, false);
> >       cset->nr_tasks--;
> >       /* matches the signal->live check in css_task_iter_advance() */
> >       if (thread_group_leader(tsk) && atomic_read(&tsk->signal->live))
> > --
> > 2.48.1
> >