By moving the lock outside the CPU loop, we reduce the frequency
of costly lock acquisition.
This adjustment slightly improves performance in scenarios where
multiple CPUs concurrently attempt to acquire the lock for
the same css.
The cpu argument to __css_rstat_lock() and __css_rstat_unlock(),
previously used only by the trace events, is removed because it is no
longer needed.
---
Results:
QEMU vm
+-------+---------+
| main  | patched |
+-------+---------+
| 9.17s | 2.36s   |
+-------+---------+
ext4 raw image with debian:
qemu-system-x86_64 -enable-kvm -cpu host -smp 102 -m 16G -kernel linux-cgroup/arch/x86/boot/bzImage -drive file=rootfs.ext4,if=virtio,format=raw -append "rootwait root=/dev/vda console=tty1 console=ttyS0 nokaslr cgroup_enable=memory cgroup_memory=1" -net nic,model=virtio -net user -nographic
Benchmark code: https://gist.github.com/bwlodarcz/c955b36b5667f0167dffcff23953d1da
musl-gcc -o benchmark -static -g3 -DNUM_THREADS=10 -DNUM_ITER=10000 -O2 -Wall benchmark.c -pthread
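
For reference, below is a minimal sketch of what such a benchmark could
look like: NUM_THREADS threads each read cpu.stat NUM_ITER times, and
every read goes through cgroup_base_stat_cputime_show() ->
css_rstat_flush(), so concurrent readers repeatedly take the rstat lock.
The actual code is in the gist above; the cgroup path used here is an
assumption.

/*
 * Illustrative sketch only -- the real benchmark is in the gist above.
 * Each thread reads cpu.stat NUM_ITER times; the cgroup path is an
 * assumption and may need adjusting for the local cgroup hierarchy.
 */
#include <pthread.h>
#include <stdio.h>

#ifndef NUM_THREADS
#define NUM_THREADS 10
#endif
#ifndef NUM_ITER
#define NUM_ITER 10000
#endif

static const char stat_path[] = "/sys/fs/cgroup/cpu.stat";

static void *reader(void *arg)
{
	char buf[4096];
	int i;

	(void)arg;
	for (i = 0; i < NUM_ITER; i++) {
		FILE *f = fopen(stat_path, "r");

		if (!f)
			break;
		/* Consume the whole file so the stat handler runs fully. */
		while (fread(buf, 1, sizeof(buf), f) > 0)
			;
		fclose(f);
	}
	return NULL;
}

int main(void)
{
	pthread_t tid[NUM_THREADS];
	int i;

	for (i = 0; i < NUM_THREADS; i++)
		pthread_create(&tid[i], NULL, reader, NULL);
	for (i = 0; i < NUM_THREADS; i++)
		pthread_join(tid[i], NULL);
	return 0;
}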
---
include/trace/events/cgroup.h | 22 ++++++++++------------
kernel/cgroup/rstat.c | 20 +++++++++-----------
2 files changed, 19 insertions(+), 23 deletions(-)
diff --git a/include/trace/events/cgroup.h b/include/trace/events/cgroup.h
index ba9229af9a34..eb674eef8b99 100644
--- a/include/trace/events/cgroup.h
+++ b/include/trace/events/cgroup.h
@@ -206,15 +206,14 @@ DEFINE_EVENT(cgroup_event, cgroup_notify_frozen,
DECLARE_EVENT_CLASS(cgroup_rstat,
- TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
+ TP_PROTO(struct cgroup *cgrp, bool contended),
- TP_ARGS(cgrp, cpu, contended),
+ TP_ARGS(cgrp, contended),
TP_STRUCT__entry(
__field( int, root )
__field( int, level )
__field( u64, id )
- __field( int, cpu )
__field( bool, contended )
),
@@ -222,13 +221,12 @@ DECLARE_EVENT_CLASS(cgroup_rstat,
__entry->root = cgrp->root->hierarchy_id;
__entry->id = cgroup_id(cgrp);
__entry->level = cgrp->level;
- __entry->cpu = cpu;
__entry->contended = contended;
),
- TP_printk("root=%d id=%llu level=%d cpu=%d lock contended:%d",
+ TP_printk("root=%d id=%llu level=%d lock contended:%d",
__entry->root, __entry->id, __entry->level,
- __entry->cpu, __entry->contended)
+ __entry->contended)
);
/*
@@ -238,23 +236,23 @@ DECLARE_EVENT_CLASS(cgroup_rstat,
*/
DEFINE_EVENT(cgroup_rstat, cgroup_rstat_lock_contended,
- TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
+ TP_PROTO(struct cgroup *cgrp, bool contended),
- TP_ARGS(cgrp, cpu, contended)
+ TP_ARGS(cgrp, contended)
);
DEFINE_EVENT(cgroup_rstat, cgroup_rstat_locked,
- TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
+ TP_PROTO(struct cgroup *cgrp, bool contended),
- TP_ARGS(cgrp, cpu, contended)
+ TP_ARGS(cgrp, contended)
);
DEFINE_EVENT(cgroup_rstat, cgroup_rstat_unlock,
- TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
+ TP_PROTO(struct cgroup *cgrp, bool contended),
- TP_ARGS(cgrp, cpu, contended)
+ TP_ARGS(cgrp, contended)
);
#endif /* _TRACE_CGROUP_H */
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index c8a48cf83878..dd312fe1896d 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -326,8 +326,7 @@ __bpf_hook_end();
* value -1 is used when obtaining the main lock else this is the CPU
* number processed last.
*/
-static inline void __css_rstat_lock(struct cgroup_subsys_state *css,
- int cpu_in_loop)
+static inline void __css_rstat_lock(struct cgroup_subsys_state *css)
__acquires(ss_rstat_lock(css->ss))
{
struct cgroup *cgrp = css->cgroup;
@@ -337,21 +336,20 @@ static inline void __css_rstat_lock(struct cgroup_subsys_state *css,
lock = ss_rstat_lock(css->ss);
contended = !spin_trylock_irq(lock);
if (contended) {
- trace_cgroup_rstat_lock_contended(cgrp, cpu_in_loop, contended);
+ trace_cgroup_rstat_lock_contended(cgrp, contended);
spin_lock_irq(lock);
}
- trace_cgroup_rstat_locked(cgrp, cpu_in_loop, contended);
+ trace_cgroup_rstat_locked(cgrp, contended);
}
-static inline void __css_rstat_unlock(struct cgroup_subsys_state *css,
- int cpu_in_loop)
+static inline void __css_rstat_unlock(struct cgroup_subsys_state *css)
__releases(ss_rstat_lock(css->ss))
{
struct cgroup *cgrp = css->cgroup;
spinlock_t *lock;
lock = ss_rstat_lock(css->ss);
- trace_cgroup_rstat_unlock(cgrp, cpu_in_loop, false);
+ trace_cgroup_rstat_unlock(cgrp, false);
spin_unlock_irq(lock);
}
@@ -381,11 +379,11 @@ __bpf_kfunc void css_rstat_flush(struct cgroup_subsys_state *css)
return;
might_sleep();
+ __css_rstat_lock(css);
for_each_possible_cpu(cpu) {
struct cgroup_subsys_state *pos;
/* Reacquire for each CPU to avoid disabling IRQs too long */
- __css_rstat_lock(css, cpu);
pos = css_rstat_updated_list(css, cpu);
for (; pos; pos = pos->rstat_flush_next) {
if (is_self) {
@@ -395,10 +393,10 @@ __bpf_kfunc void css_rstat_flush(struct cgroup_subsys_state *css)
} else
pos->ss->css_rstat_flush(pos, cpu);
}
- __css_rstat_unlock(css, cpu);
if (!cond_resched())
cpu_relax();
}
+ __css_rstat_unlock(css);
}
int css_rstat_init(struct cgroup_subsys_state *css)
@@ -685,11 +683,11 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq)
if (cgroup_parent(cgrp)) {
css_rstat_flush(&cgrp->self);
- __css_rstat_lock(&cgrp->self, -1);
+ __css_rstat_lock(&cgrp->self);
bstat = cgrp->bstat;
cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime,
&bstat.cputime.utime, &bstat.cputime.stime);
- __css_rstat_unlock(&cgrp->self, -1);
+ __css_rstat_unlock(&cgrp->self);
} else {
root_cgroup_cputime(&bstat);
}
--
2.49.0
On Tue, Jul 22, 2025 at 02:43:19PM +0200, Bertrand Wlodarczyk <bertrand.wlodarczyk@intel.com> wrote:
> By moving the lock outside the CPU loop, we reduce the frequency
> of costly lock acquisition.

So IIUC, mere acquisition is so costly that it dwarfs the actual holding
(flushing), as there are presumably no updates collected.
As Shakeel wrote, there are reasons for not holding this lock for very
long (besides that being a general rule).

> This adjustment slightly improves performance in scenarios where
> multiple CPUs concurrently attempt to acquire the lock for
> the same css.

It means they can read cpu.stat more frequently, but will they get any
fresher or more precise data?
This is not clear to me: I understand why this benchmark improves, but
does it represent some real-world improvement?

Thanks,
Michal
On Tue, Jul 22, 2025 at 02:43:19PM +0200, Bertrand Wlodarczyk wrote:
> By moving the lock outside the CPU loop, we reduce the frequency
> of costly lock acquisition.
> This adjustment slightly improves performance in scenarios where
> multiple CPUs concurrently attempt to acquire the lock for
> the same css.

Did you see the commit 0efc297a3c497 ("cgroup/rstat: avoid disabling
irqs for O(num_cpu)") for the reasoning on why we are doing this?

[...]

> -	__css_rstat_unlock(css, cpu);
>  	if (!cond_resched())

cond_resched() with the spin lock held?

>  		cpu_relax();
>  	}
> +	__css_rstat_unlock(css);
>  }

[...]