cgroup/rstat: convert rstat lock from spinlock to rwlock

[RFC PATCH] cgroup/rstat: convert rstat lock from spinlock to rwlock

Posted by Thomas Falcon 5 days, 6 hours ago

Implement rstat_ss_lock and rstat_base_lock as read-write locks
instead of spinlocks. In addition, update tracing to reflect new
locking implementation.

The benchmark below, meant to simulate a workload performing many
concurrent cgroup cpu.stat reads, completes in 134 seconds on an Intel
Xeon Platinum 8568Y with 144 cpus compared to 241 seconds when
using spinlocks.

Signed-off-by: Thomas Falcon <thomas.falcon@intel.com>
---
Initially discussed here: https://lore.kernel.org/cgroups/20250617102644.752201-2-bertrand.wlodarczyk@intel.com/
Benchmark: https://gist.github.com/bwlodarcz/c955b36b5667f0167dffcff23953d1da
musl-gcc -o benchmark -static -g3 -DNUM_THREADS=10 -DNUM_ITER=10000 -O2 -Wall benchmark.c -pthread
---
include/linux/cgroup-defs.h   |  2 +-
 include/trace/events/cgroup.h | 22 ++++++++--------
 kernel/cgroup/rstat.c         | 47 ++++++++++++++++++++++-------------
 3 files changed, 43 insertions(+), 28 deletions(-)

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 50a784da7a81..8609d6f6b29f 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -861,7 +861,7 @@ struct cgroup_subsys {
 	 */
 	unsigned int depends_on;
 
-	spinlock_t rstat_ss_lock;
+	rwlock_t rstat_ss_lock;
 	struct llist_head __percpu *lhead; /* lockless update list head */
 };
 
diff --git a/include/trace/events/cgroup.h b/include/trace/events/cgroup.h
index b736da06340a..1bd80a08c116 100644
--- a/include/trace/events/cgroup.h
+++ b/include/trace/events/cgroup.h
@@ -206,15 +206,16 @@ DEFINE_EVENT(cgroup_event, cgroup_notify_frozen,
 
 DECLARE_EVENT_CLASS(cgroup_rstat,
 
-	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
+	TP_PROTO(struct cgroup *cgrp, int cpu, const char *type, bool contended),
 
-	TP_ARGS(cgrp, cpu, contended),
+	TP_ARGS(cgrp, cpu, type, contended),
 
 	TP_STRUCT__entry(
 		__field(	int,		root			)
 		__field(	int,		level			)
 		__field(	u64,		id			)
 		__field(	int,		cpu			)
+		__string(	type,		type			)
 		__field(	bool,		contended		)
 	),
 
@@ -223,12 +224,13 @@ DECLARE_EVENT_CLASS(cgroup_rstat,
 		__entry->id = cgroup_id(cgrp);
 		__entry->level = cgrp->level;
 		__entry->cpu = cpu;
+		__assign_str(type);
 		__entry->contended = contended;
 	),
 
-	TP_printk("root=%d id=%llu level=%d cpu=%d lock contended:%d",
+	TP_printk("root=%d id=%llu level=%d cpu=%d lock-type=%s lock contended:%d",
 		  __entry->root, __entry->id, __entry->level,
-		  __entry->cpu, __entry->contended)
+		  __entry->cpu, __get_str(type), __entry->contended)
 );
 
 /*
@@ -238,23 +240,23 @@ DECLARE_EVENT_CLASS(cgroup_rstat,
  */
 DEFINE_EVENT(cgroup_rstat, cgroup_rstat_lock_contended,
 
-	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
+	TP_PROTO(struct cgroup *cgrp, int cpu, const char *type, bool contended),
 
-	TP_ARGS(cgrp, cpu, contended)
+	TP_ARGS(cgrp, cpu, type, contended)
 );
 
 DEFINE_EVENT(cgroup_rstat, cgroup_rstat_locked,
 
-	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
+	TP_PROTO(struct cgroup *cgrp, int cpu, const char *type, bool contended),
 
-	TP_ARGS(cgrp, cpu, contended)
+	TP_ARGS(cgrp, cpu, type, contended)
 );
 
 DEFINE_EVENT(cgroup_rstat, cgroup_rstat_unlock,
 
-	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
+	TP_PROTO(struct cgroup *cgrp, int cpu, const char *type, bool contended),
 
-	TP_ARGS(cgrp, cpu, contended)
+	TP_ARGS(cgrp, cpu, type, contended)
 );
 
 #endif /* _TRACE_CGROUP_H */
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index 150e5871e66f..e33f31914b7d 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -9,7 +9,7 @@
 
 #include <trace/events/cgroup.h>
 
-static DEFINE_SPINLOCK(rstat_base_lock);
+static DEFINE_RWLOCK(rstat_base_lock);
 static DEFINE_PER_CPU(struct llist_head, rstat_backlog_list);
 
 static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);
@@ -37,7 +37,7 @@ static struct cgroup_rstat_base_cpu *cgroup_rstat_base_cpu(
 	return per_cpu_ptr(cgrp->rstat_base_cpu, cpu);
 }
 
-static spinlock_t *ss_rstat_lock(struct cgroup_subsys *ss)
+static rwlock_t *ss_rstat_lock(struct cgroup_subsys *ss)
 {
 	if (ss)
 		return &ss->rstat_ss_lock;
@@ -356,32 +356,45 @@ __bpf_hook_end();
  * number processed last.
  */
 static inline void __css_rstat_lock(struct cgroup_subsys_state *css,
-		int cpu_in_loop)
+				    int cpu_in_loop, bool write)
 	__acquires(ss_rstat_lock(css->ss))
 {
+	const char *type = write ? "write" : "read";
 	struct cgroup *cgrp = css->cgroup;
-	spinlock_t *lock;
+	rwlock_t *lock;
 	bool contended;
 
 	lock = ss_rstat_lock(css->ss);
-	contended = !spin_trylock_irq(lock);
+
+	local_irq_disable();
+	contended = write ? !write_trylock(lock) : !read_trylock(lock);
 	if (contended) {
-		trace_cgroup_rstat_lock_contended(cgrp, cpu_in_loop, contended);
-		spin_lock_irq(lock);
+		local_irq_enable();
+		trace_cgroup_rstat_lock_contended(cgrp, cpu_in_loop, type, contended);
+		if (write)
+			write_lock_irq(lock);
+		else
+			read_lock_irq(lock);
 	}
-	trace_cgroup_rstat_locked(cgrp, cpu_in_loop, contended);
+
+	trace_cgroup_rstat_locked(cgrp, cpu_in_loop, type, contended);
 }
 
 static inline void __css_rstat_unlock(struct cgroup_subsys_state *css,
-				      int cpu_in_loop)
+				      int cpu_in_loop, bool write)
 	__releases(ss_rstat_lock(css->ss))
 {
+	const char *type = write ? "write" : "read";
 	struct cgroup *cgrp = css->cgroup;
-	spinlock_t *lock;
+	rwlock_t *lock;
 
 	lock = ss_rstat_lock(css->ss);
-	trace_cgroup_rstat_unlock(cgrp, cpu_in_loop, false);
-	spin_unlock_irq(lock);
+	trace_cgroup_rstat_unlock(cgrp, cpu_in_loop, type, false);
+
+	if (write)
+		write_unlock_irq(lock);
+	else
+		read_unlock_irq(lock);
 }
 
 /**
@@ -414,7 +427,7 @@ __bpf_kfunc void css_rstat_flush(struct cgroup_subsys_state *css)
 		struct cgroup_subsys_state *pos;
 
 		/* Reacquire for each CPU to avoid disabling IRQs too long */
-		__css_rstat_lock(css, cpu);
+		__css_rstat_lock(css, cpu, true);
 		pos = css_rstat_updated_list(css, cpu);
 		for (; pos; pos = pos->rstat_flush_next) {
 			if (is_self) {
@@ -424,7 +437,7 @@ __bpf_kfunc void css_rstat_flush(struct cgroup_subsys_state *css)
 			} else
 				pos->ss->css_rstat_flush(pos, cpu);
 		}
-		__css_rstat_unlock(css, cpu);
+		__css_rstat_unlock(css, cpu, true);
 		if (!cond_resched())
 			cpu_relax();
 	}
@@ -525,7 +538,7 @@ int __init ss_rstat_init(struct cgroup_subsys *ss)
 			return -ENOMEM;
 	}
 
-	spin_lock_init(ss_rstat_lock(ss));
+	rwlock_init(ss_rstat_lock(ss));
 	for_each_possible_cpu(cpu)
 		init_llist_head(ss_lhead_cpu(ss, cpu));
 
@@ -717,11 +730,11 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq)
 
 	if (cgroup_parent(cgrp)) {
 		css_rstat_flush(&cgrp->self);
-		__css_rstat_lock(&cgrp->self, -1);
+		__css_rstat_lock(&cgrp->self, -1, false);
 		bstat = cgrp->bstat;
 		cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime,
 			       &bstat.cputime.utime, &bstat.cputime.stime);
-		__css_rstat_unlock(&cgrp->self, -1);
+		__css_rstat_unlock(&cgrp->self, -1, false);
 	} else {
 		root_cgroup_cputime(&bstat);
 	}
-- 
2.43.0

Re: [RFC PATCH] cgroup/rstat: convert rstat lock from spinlock to rwlock

Posted by Michal Koutný 2 days, 8 hours ago

Hi Thomas.

On Tue, May 19, 2026 at 12:31:34PM -0500, Thomas Falcon <thomas.falcon@intel.com> wrote:
> @@ -414,7 +427,7 @@ __bpf_kfunc void css_rstat_flush(struct cgroup_subsys_state *css)
>  		struct cgroup_subsys_state *pos;
>  
>  		/* Reacquire for each CPU to avoid disabling IRQs too long */
> -		__css_rstat_lock(css, cpu);
> +		__css_rstat_lock(css, cpu, true);
>  		pos = css_rstat_updated_list(css, cpu);
>  		for (; pos; pos = pos->rstat_flush_next) {
>  			if (is_self) {
> @@ -424,7 +437,7 @@ __bpf_kfunc void css_rstat_flush(struct cgroup_subsys_state *css)
>  			} else
>  				pos->ss->css_rstat_flush(pos, cpu);
>  		}
> -		__css_rstat_unlock(css, cpu);
> +		__css_rstat_unlock(css, cpu, true);
>  		if (!cond_resched())
>  			cpu_relax();
>  	}


> @@ -717,11 +730,11 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq)
>  
>  	if (cgroup_parent(cgrp)) {
>  		css_rstat_flush(&cgrp->self);
> -		__css_rstat_lock(&cgrp->self, -1);
> +		__css_rstat_lock(&cgrp->self, -1, false);
>  		bstat = cgrp->bstat;
>  		cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime,
>  			       &bstat.cputime.utime, &bstat.cputime.stime);
> -		__css_rstat_unlock(&cgrp->self, -1);
> +		__css_rstat_unlock(&cgrp->self, -1, false);

I was wondering where these distinctions of readers vs writers stem from
and here I see that it's mainly the per-subsys vs rstat_base_lock.
Given that cputime_adjust() is here only modifying the local bstat
value, the read-like lock makes sense.

However, there's still the cgroup's flush right above which would take
the per-subsys locks in write-mode anyway.
Can you add some more explanation why this works?

More generally, I'm wondering where are the opportunities for replacing
per-subsys lock with an RW lock (or seqcount).

Thanks for looking into cpu.stat scalability,
Michal

Re: [RFC PATCH] cgroup/rstat: convert rstat lock from spinlock to rwlock

Posted by Tejun Heo 5 days, 6 hours ago

On Tue, May 19, 2026 at 12:31:34PM -0500, Thomas Falcon wrote:
> Implement rstat_ss_lock and rstat_base_lock as read-write locks
> instead of spinlocks. In addition, update tracing to reflect new
> locking implementation.
> 
> The benchmark below, meant to simulate a workload performing many
> concurrent cgroup cpu.stat reads, completes in 134 seconds on an Intel
> Xeon Platinum 8568Y with 144 cpus compared to 241 seconds when
> using spinlocks.

Can you try using seqlock for readers? That'd be decouple readers a lot
better than rwlocks.

Thanks.

-- 
tejun

Re: [RFC PATCH] cgroup/rstat: convert rstat lock from spinlock to rwlock

Posted by Falcon, Thomas 5 days, 6 hours ago

On Tue, 2026-05-19 at 07:49 -1000, Tejun Heo wrote:
> On Tue, May 19, 2026 at 12:31:34PM -0500, Thomas Falcon wrote:
> > Implement rstat_ss_lock and rstat_base_lock as read-write locks
> > instead of spinlocks. In addition, update tracing to reflect new
> > locking implementation.
> > 
> > The benchmark below, meant to simulate a workload performing many
> > concurrent cgroup cpu.stat reads, completes in 134 seconds on an
> > Intel
> > Xeon Platinum 8568Y with 144 cpus compared to 241 seconds when
> > using spinlocks.
> 
> Can you try using seqlock for readers? That'd be decouple readers a
> lot
> better than rwlocks.
> 
> Thanks.
> 
Hi, thanks for the quick response. I will try that.

Thanks,
Tom