[PATCH v2] perf/core: Fix slow perf_event_task_exit() with LBR callstacks

Namhyung Kim posted 1 patch 3 weeks, 2 days ago
kernel/events/core.c | 18 +++++++++++++++++-
1 file changed, 17 insertions(+), 1 deletion(-)
[PATCH v2] perf/core: Fix slow perf_event_task_exit() with LBR callstacks
Posted by Namhyung Kim 3 weeks, 2 days ago
I got a report that a task is stuck in perf_event_exit_task() waiting
for global_ctx_data_rwsem.  On large systems with lots of threads,
attach_global_ctx_data() can hold the lock for a long time while it
iterates over all threads in the system to allocate the context data.

This blocks the task exit path, which is problematic especially under
memory pressure.

  perf_event_open
    perf_event_alloc
      attach_perf_ctx_data
        attach_global_ctx_data
          percpu_down_write (global_ctx_data_rwsem)
            for_each_process_thread
              alloc_task_ctx_data
                                               do_exit
                                                 perf_event_exit_task
                                                   percpu_down_read (global_ctx_data_rwsem)

The exit path should not take global_ctx_data_rwsem.  Let's skip
allocation for exiting tasks and free the data carefully.

Reported-by: Rosalie Fang <rosaliefang@google.com>
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 kernel/events/core.c | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 376fb07d869b8b50..b164e884102323f5 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5424,6 +5424,17 @@ attach_task_ctx_data(struct task_struct *task, struct kmem_cache *ctx_cache,
 		if (try_cmpxchg((struct perf_ctx_data **)&task->perf_ctx_data, &old, cd)) {
 			if (old)
 				perf_free_ctx_data_rcu(old);
+			/*
+			 * The try_cmpxchg() above pairs with the try_cmpxchg()
+			 * in detach_task_ctx_data(), such that if we race
+			 * with perf_event_exit_task(), we must observe
+			 * PF_EXITING.
+			 */
+			if (task->flags & PF_EXITING) {
+				/* detach_task_ctx_data() may have freed it already */
+				if (try_cmpxchg(&task->perf_ctx_data, &cd, NULL))
+					perf_free_ctx_data_rcu(cd);
+			}
 			return 0;
 		}
 
@@ -5469,6 +5480,8 @@ attach_global_ctx_data(struct kmem_cache *ctx_cache)
 	/* Allocate everything */
 	scoped_guard (rcu) {
 		for_each_process_thread(g, p) {
+			if (p->flags & PF_EXITING)
+				continue;
 			cd = rcu_dereference(p->perf_ctx_data);
 			if (cd && !cd->global) {
 				cd->global = 1;
@@ -14562,8 +14575,11 @@ void perf_event_exit_task(struct task_struct *task)
 
 	/*
 	 * Detach the perf_ctx_data for the system-wide event.
+	 *
+	 * Done without holding global_ctx_data_rwsem; typically
+	 * attach_global_ctx_data() will skip over this task, but otherwise
+	 * attach_task_ctx_data() will observe PF_EXITING.
 	 */
-	guard(percpu_read)(&global_ctx_data_rwsem);
 	detach_task_ctx_data(task);
 }
 
-- 
2.52.0.457.g6b5491de43-goog
Re: [PATCH v2] perf/core: Fix slow perf_event_task_exit() with LBR callstacks
Posted by Namhyung Kim 3 weeks ago
Hi Peter,

On Wed, Jan 14, 2026 at 10:01:30AM -0800, Namhyung Kim wrote:
> I got a report that a task is stuck in perf_event_exit_task() waiting
> for global_ctx_data_rwsem.  On large systems with lots threads, it'd
> have performance issues when it grabs the lock to iterate all threads
> in the system to allocate the context data.
> 
> And it'd block task exit path which is problematic especially under
> memory pressure.
> 
>   perf_event_open
>     perf_event_alloc
>       attach_perf_ctx_data
>         attach_global_ctx_data
>           percpu_down_write (global_ctx_data_rwsem)
>             for_each_process_thread
>               alloc_task_ctx_data
>                                                do_exit
>                                                  perf_event_exit_task
>                                                    percpu_down_read (global_ctx_data_rwsem)
> 
> It should not hold the global_ctx_data_rwsem on the exit path.  Let's
> skip allocation for exiting tasks and free the data carefully.
> 
> Reported-by: Rosalie Fang <rosaliefang@google.com>
> Suggested-by: Peter Zijlstra <peterz@infradead.org>
> Signed-off-by: Namhyung Kim <namhyung@kernel.org>

It seems you merged v1 which has a sparse warning.

Thanks,
Namhyung

> ---
>  kernel/events/core.c | 18 +++++++++++++++++-
>  1 file changed, 17 insertions(+), 1 deletion(-)
> 
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 376fb07d869b8b50..b164e884102323f5 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -5424,6 +5424,17 @@ attach_task_ctx_data(struct task_struct *task, struct kmem_cache *ctx_cache,
>  		if (try_cmpxchg((struct perf_ctx_data **)&task->perf_ctx_data, &old, cd)) {
>  			if (old)
>  				perf_free_ctx_data_rcu(old);
> +			/*
> +			 * Above try_cmpxchg() pairs with try_cmpxchg() from
> +			 * detach_task_ctx_data() such that
> +			 * if we race with perf_event_exit_task(), we must
> +			 * observe PF_EXITING.
> +			 */
> +			if (task->flags & PF_EXITING) {
> +				/* detach_task_ctx_data() may free it already */
> +				if (try_cmpxchg(&task->perf_ctx_data, &cd, NULL))
> +					perf_free_ctx_data_rcu(cd);
> +			}
>  			return 0;
>  		}
>  
> @@ -5469,6 +5480,8 @@ attach_global_ctx_data(struct kmem_cache *ctx_cache)
>  	/* Allocate everything */
>  	scoped_guard (rcu) {
>  		for_each_process_thread(g, p) {
> +			if (p->flags & PF_EXITING)
> +				continue;
>  			cd = rcu_dereference(p->perf_ctx_data);
>  			if (cd && !cd->global) {
>  				cd->global = 1;
> @@ -14562,8 +14575,11 @@ void perf_event_exit_task(struct task_struct *task)
>  
>  	/*
>  	 * Detach the perf_ctx_data for the system-wide event.
> +	 *
> +	 * Done without holding global_ctx_data_rwsem; typically
> +	 * attach_global_ctx_data() will skip over this task, but otherwise
> +	 * attach_task_ctx_data() will observe PF_EXITING.
>  	 */
> -	guard(percpu_read)(&global_ctx_data_rwsem);
>  	detach_task_ctx_data(task);
>  }
>  
> -- 
> 2.52.0.457.g6b5491de43-goog
>