include/linux/cgroup-defs.h | 2 +- include/trace/events/cgroup.h | 22 ++++++++-------- kernel/cgroup/rstat.c | 47 ++++++++++++++++++++++------------- 3 files changed, 43 insertions(+), 28 deletions(-)
Implement rstat_ss_lock and rstat_base_lock as read-write locks
instead of spinlocks. In addition, update tracing to reflect new
locking implementation.
The benchmark below, meant to simulate a workload performing many
concurrent cgroup cpu.stat reads, completes in 134 seconds on an Intel
Xeon Platinum 8568Y with 144 cpus compared to 241 seconds when
using spinlocks.
Signed-off-by: Thomas Falcon <thomas.falcon@intel.com>
---
Initially discussed here: https://lore.kernel.org/cgroups/20250617102644.752201-2-bertrand.wlodarczyk@intel.com/
Benchmark: https://gist.github.com/bwlodarcz/c955b36b5667f0167dffcff23953d1da
musl-gcc -o benchmark -static -g3 -DNUM_THREADS=10 -DNUM_ITER=10000 -O2 -Wall benchmark.c -pthread
---
include/linux/cgroup-defs.h | 2 +-
include/trace/events/cgroup.h | 22 ++++++++--------
kernel/cgroup/rstat.c | 47 ++++++++++++++++++++++-------------
3 files changed, 43 insertions(+), 28 deletions(-)
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 50a784da7a81..8609d6f6b29f 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -861,7 +861,7 @@ struct cgroup_subsys {
*/
unsigned int depends_on;
- spinlock_t rstat_ss_lock;
+ rwlock_t rstat_ss_lock;
struct llist_head __percpu *lhead; /* lockless update list head */
};
diff --git a/include/trace/events/cgroup.h b/include/trace/events/cgroup.h
index b736da06340a..1bd80a08c116 100644
--- a/include/trace/events/cgroup.h
+++ b/include/trace/events/cgroup.h
@@ -206,15 +206,16 @@ DEFINE_EVENT(cgroup_event, cgroup_notify_frozen,
DECLARE_EVENT_CLASS(cgroup_rstat,
- TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
+ TP_PROTO(struct cgroup *cgrp, int cpu, const char *type, bool contended),
- TP_ARGS(cgrp, cpu, contended),
+ TP_ARGS(cgrp, cpu, type, contended),
TP_STRUCT__entry(
__field( int, root )
__field( int, level )
__field( u64, id )
__field( int, cpu )
+ __string( type, type )
__field( bool, contended )
),
@@ -223,12 +224,13 @@ DECLARE_EVENT_CLASS(cgroup_rstat,
__entry->id = cgroup_id(cgrp);
__entry->level = cgrp->level;
__entry->cpu = cpu;
+ __assign_str(type);
__entry->contended = contended;
),
- TP_printk("root=%d id=%llu level=%d cpu=%d lock contended:%d",
+ TP_printk("root=%d id=%llu level=%d cpu=%d lock-type=%s lock contended:%d",
__entry->root, __entry->id, __entry->level,
- __entry->cpu, __entry->contended)
+ __entry->cpu, __get_str(type), __entry->contended)
);
/*
@@ -238,23 +240,23 @@ DECLARE_EVENT_CLASS(cgroup_rstat,
*/
DEFINE_EVENT(cgroup_rstat, cgroup_rstat_lock_contended,
- TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
+ TP_PROTO(struct cgroup *cgrp, int cpu, const char *type, bool contended),
- TP_ARGS(cgrp, cpu, contended)
+ TP_ARGS(cgrp, cpu, type, contended)
);
DEFINE_EVENT(cgroup_rstat, cgroup_rstat_locked,
- TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
+ TP_PROTO(struct cgroup *cgrp, int cpu, const char *type, bool contended),
- TP_ARGS(cgrp, cpu, contended)
+ TP_ARGS(cgrp, cpu, type, contended)
);
DEFINE_EVENT(cgroup_rstat, cgroup_rstat_unlock,
- TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
+ TP_PROTO(struct cgroup *cgrp, int cpu, const char *type, bool contended),
- TP_ARGS(cgrp, cpu, contended)
+ TP_ARGS(cgrp, cpu, type, contended)
);
#endif /* _TRACE_CGROUP_H */
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index 150e5871e66f..e33f31914b7d 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -9,7 +9,7 @@
#include <trace/events/cgroup.h>
-static DEFINE_SPINLOCK(rstat_base_lock);
+static DEFINE_RWLOCK(rstat_base_lock);
static DEFINE_PER_CPU(struct llist_head, rstat_backlog_list);
static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);
@@ -37,7 +37,7 @@ static struct cgroup_rstat_base_cpu *cgroup_rstat_base_cpu(
return per_cpu_ptr(cgrp->rstat_base_cpu, cpu);
}
-static spinlock_t *ss_rstat_lock(struct cgroup_subsys *ss)
+static rwlock_t *ss_rstat_lock(struct cgroup_subsys *ss)
{
if (ss)
return &ss->rstat_ss_lock;
@@ -356,32 +356,45 @@ __bpf_hook_end();
* number processed last.
*/
static inline void __css_rstat_lock(struct cgroup_subsys_state *css,
- int cpu_in_loop)
+ int cpu_in_loop, bool write)
__acquires(ss_rstat_lock(css->ss))
{
+ const char *type = write ? "write" : "read";
struct cgroup *cgrp = css->cgroup;
- spinlock_t *lock;
+ rwlock_t *lock;
bool contended;
lock = ss_rstat_lock(css->ss);
- contended = !spin_trylock_irq(lock);
+
+ local_irq_disable();
+ contended = write ? !write_trylock(lock) : !read_trylock(lock);
if (contended) {
- trace_cgroup_rstat_lock_contended(cgrp, cpu_in_loop, contended);
- spin_lock_irq(lock);
+ local_irq_enable();
+ trace_cgroup_rstat_lock_contended(cgrp, cpu_in_loop, type, contended);
+ if (write)
+ write_lock_irq(lock);
+ else
+ read_lock_irq(lock);
}
- trace_cgroup_rstat_locked(cgrp, cpu_in_loop, contended);
+
+ trace_cgroup_rstat_locked(cgrp, cpu_in_loop, type, contended);
}
static inline void __css_rstat_unlock(struct cgroup_subsys_state *css,
- int cpu_in_loop)
+ int cpu_in_loop, bool write)
__releases(ss_rstat_lock(css->ss))
{
+ const char *type = write ? "write" : "read";
struct cgroup *cgrp = css->cgroup;
- spinlock_t *lock;
+ rwlock_t *lock;
lock = ss_rstat_lock(css->ss);
- trace_cgroup_rstat_unlock(cgrp, cpu_in_loop, false);
- spin_unlock_irq(lock);
+ trace_cgroup_rstat_unlock(cgrp, cpu_in_loop, type, false);
+
+ if (write)
+ write_unlock_irq(lock);
+ else
+ read_unlock_irq(lock);
}
/**
@@ -414,7 +427,7 @@ __bpf_kfunc void css_rstat_flush(struct cgroup_subsys_state *css)
struct cgroup_subsys_state *pos;
/* Reacquire for each CPU to avoid disabling IRQs too long */
- __css_rstat_lock(css, cpu);
+ __css_rstat_lock(css, cpu, true);
pos = css_rstat_updated_list(css, cpu);
for (; pos; pos = pos->rstat_flush_next) {
if (is_self) {
@@ -424,7 +437,7 @@ __bpf_kfunc void css_rstat_flush(struct cgroup_subsys_state *css)
} else
pos->ss->css_rstat_flush(pos, cpu);
}
- __css_rstat_unlock(css, cpu);
+ __css_rstat_unlock(css, cpu, true);
if (!cond_resched())
cpu_relax();
}
@@ -525,7 +538,7 @@ int __init ss_rstat_init(struct cgroup_subsys *ss)
return -ENOMEM;
}
- spin_lock_init(ss_rstat_lock(ss));
+ rwlock_init(ss_rstat_lock(ss));
for_each_possible_cpu(cpu)
init_llist_head(ss_lhead_cpu(ss, cpu));
@@ -717,11 +730,11 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq)
if (cgroup_parent(cgrp)) {
css_rstat_flush(&cgrp->self);
- __css_rstat_lock(&cgrp->self, -1);
+ __css_rstat_lock(&cgrp->self, -1, false);
bstat = cgrp->bstat;
cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime,
&bstat.cputime.utime, &bstat.cputime.stime);
- __css_rstat_unlock(&cgrp->self, -1);
+ __css_rstat_unlock(&cgrp->self, -1, false);
} else {
root_cgroup_cputime(&bstat);
}
--
2.43.0
Hi Thomas.
On Tue, May 19, 2026 at 12:31:34PM -0500, Thomas Falcon <thomas.falcon@intel.com> wrote:
> @@ -414,7 +427,7 @@ __bpf_kfunc void css_rstat_flush(struct cgroup_subsys_state *css)
> struct cgroup_subsys_state *pos;
>
> /* Reacquire for each CPU to avoid disabling IRQs too long */
> - __css_rstat_lock(css, cpu);
> + __css_rstat_lock(css, cpu, true);
> pos = css_rstat_updated_list(css, cpu);
> for (; pos; pos = pos->rstat_flush_next) {
> if (is_self) {
> @@ -424,7 +437,7 @@ __bpf_kfunc void css_rstat_flush(struct cgroup_subsys_state *css)
> } else
> pos->ss->css_rstat_flush(pos, cpu);
> }
> - __css_rstat_unlock(css, cpu);
> + __css_rstat_unlock(css, cpu, true);
> if (!cond_resched())
> cpu_relax();
> }
> @@ -717,11 +730,11 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq)
>
> if (cgroup_parent(cgrp)) {
> css_rstat_flush(&cgrp->self);
> - __css_rstat_lock(&cgrp->self, -1);
> + __css_rstat_lock(&cgrp->self, -1, false);
> bstat = cgrp->bstat;
> cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime,
> &bstat.cputime.utime, &bstat.cputime.stime);
> - __css_rstat_unlock(&cgrp->self, -1);
> + __css_rstat_unlock(&cgrp->self, -1, false);
I was wondering where these distinctions of readers vs writers stem from
and here I see that it's mainly the per-subsys vs rstat_base_lock.
Given that cputime_adjust() is here only modifying the local bstat
value, the read-like lock makes sense.
However, there's still the cgroup's flush right above which would take
the per-subsys locks in write-mode anyway.
Can you add some more explanation why this works?
More generally, I'm wondering where are the opportunities for replacing
per-subsys lock with an RW lock (or seqcount).
Thanks for looking into cpu.stat scalability,
Michal
On Tue, May 19, 2026 at 12:31:34PM -0500, Thomas Falcon wrote: > Implement rstat_ss_lock and rstat_base_lock as read-write locks > instead of spinlocks. In addition, update tracing to reflect new > locking implementation. > > The benchmark below, meant to simulate a workload performing many > concurrent cgroup cpu.stat reads, completes in 134 seconds on an Intel > Xeon Platinum 8568Y with 144 cpus compared to 241 seconds when > using spinlocks. Can you try using seqlock for readers? That'd be decouple readers a lot better than rwlocks. Thanks. -- tejun
On Tue, 2026-05-19 at 07:49 -1000, Tejun Heo wrote: > On Tue, May 19, 2026 at 12:31:34PM -0500, Thomas Falcon wrote: > > Implement rstat_ss_lock and rstat_base_lock as read-write locks > > instead of spinlocks. In addition, update tracing to reflect new > > locking implementation. > > > > The benchmark below, meant to simulate a workload performing many > > concurrent cgroup cpu.stat reads, completes in 134 seconds on an > > Intel > > Xeon Platinum 8568Y with 144 cpus compared to 241 seconds when > > using spinlocks. > > Can you try using seqlock for readers? That'd be decouple readers a > lot > better than rwlocks. > > Thanks. > Hi, thanks for the quick response. I will try that. Thanks, Tom
© 2016 - 2026 Red Hat, Inc.