kernel/events/core.c | 42 ++++++++++-------------------------------- 1 file changed, 10 insertions(+), 32 deletions(-)
We encounter perf warnings when using cgroup events like:
```
cd /sys/fs/cgroup
mkdir test
perf stat -e cycles -a -G test
```
WARNING: CPU: 0 PID: 690 at kernel/events/core.c:849 perf_cgroup_switch+0xb2/0xc0
[ 91.393417] Call Trace:
[ 91.393772] <TASK>
[ 91.394080] __schedule+0x4ae/0x9f0
[ 91.394535] ? _raw_spin_unlock_irqrestore+0x23/0x40
[ 91.395145] ? __cond_resched+0x18/0x20
[ 91.395622] preempt_schedule_common+0x2d/0x70
[ 91.396163] __cond_resched+0x18/0x20
[ 91.396621] wait_for_completion+0x2f/0x160
[ 91.397137] ? cpu_stop_queue_work+0x9e/0x130
[ 91.397665] affine_move_task+0x18a/0x4f0
WARNING: CPU: 0 PID: 690 at kernel/events/core.c:829 ctx_sched_in+0x1cf/0x1e0
[ 91.430151] Call Trace:
[ 91.430490] <TASK>
[ 91.430793] ? ctx_sched_out+0xb7/0x1b0
[ 91.431274] perf_cgroup_switch+0x88/0xc0
[ 91.431778] __schedule+0x4ae/0x9f0
[ 91.432215] ? _raw_spin_unlock_irqrestore+0x23/0x40
[ 91.432825] ? __cond_resched+0x18/0x20
[ 91.433299] preempt_schedule_common+0x2d/0x70
[ 91.433839] __cond_resched+0x18/0x20
[ 91.434298] wait_for_completion+0x2f/0x160
[ 91.434808] ? cpu_stop_queue_work+0x9e/0x130
[ 91.435334] affine_move_task+0x18a/0x4f0
The two warnings above are incomplete because I removed other
unimportant information. The problem is caused by how perf tracks
cgroup events:
CPU0 CPU1
perf_event_open()
perf_event_alloc()
account_event()
account_event_cpu()
atomic_inc(perf_cgroup_events)
__perf_event_task_sched_out()
if (atomic_read(perf_cgroup_events))
perf_cgroup_switch()
// kernel/events/core.c:849
WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0)
if (READ_ONCE(cpuctx->cgrp) == cgrp) // false
return
perf_ctx_lock()
ctx_sched_out()
cpuctx->cgrp = cgrp
ctx_sched_in()
perf_cgroup_set_timestamp()
// kernel/events/core.c:829
WARN_ON_ONCE(!ctx->nr_cgroups)
perf_ctx_unlock()
perf_install_in_context()
add_event_to_ctx()
list_add_event()
perf_cgroup_event_enable()
ctx->nr_cgroups++
cpuctx->cgrp = X
As shown above, we wrongly use the per-CPU atomic perf_cgroup_events
to decide whether to call perf_cgroup_switch(), which should only be called
when we know this CPU has cgroup events enabled.
Commit bd2756811766 ("perf: Rewrite core context handling") changed
the code to have only one context per CPU, so we can just use cpuctx->cgrp to
check whether this CPU has cgroup events enabled.
So the per-CPU atomic perf_cgroup_events is no longer needed.
Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
---
kernel/events/core.c | 42 ++++++++++--------------------------------
1 file changed, 10 insertions(+), 32 deletions(-)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index eacc3702654d..5d97a9f26003 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -380,7 +380,6 @@ enum event_type_t {
/*
* perf_sched_events : >0 events exist
- * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
*/
static void perf_sched_delayed(struct work_struct *work);
@@ -389,7 +388,6 @@ static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed);
static DEFINE_MUTEX(perf_sched_mutex);
static atomic_t perf_sched_count;
-static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
static atomic_t nr_mmap_events __read_mostly;
@@ -844,9 +842,16 @@ static void perf_cgroup_switch(struct task_struct *task)
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct perf_cgroup *cgrp;
- cgrp = perf_cgroup_from_task(task, NULL);
+ /*
+ * cpuctx->cgrp is set when the first cgroup event enabled,
+ * and is cleared when the last cgroup event disabled.
+ */
+ if (READ_ONCE(cpuctx->cgrp) == NULL)
+ return;
WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
+
+ cgrp = perf_cgroup_from_task(task, NULL);
if (READ_ONCE(cpuctx->cgrp) == cgrp)
return;
@@ -3631,8 +3636,7 @@ void __perf_event_task_sched_out(struct task_struct *task,
* to check if we have to switch out PMU state.
* cgroup event are system-wide mode only
*/
- if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
- perf_cgroup_switch(next);
+ perf_cgroup_switch(next);
}
static bool perf_less_group_idx(const void *l, const void *r)
@@ -4974,15 +4978,6 @@ static void unaccount_pmu_sb_event(struct perf_event *event)
detach_sb_event(event);
}
-static void unaccount_event_cpu(struct perf_event *event, int cpu)
-{
- if (event->parent)
- return;
-
- if (is_cgroup_event(event))
- atomic_dec(&per_cpu(perf_cgroup_events, cpu));
-}
-
#ifdef CONFIG_NO_HZ_FULL
static DEFINE_SPINLOCK(nr_freq_lock);
#endif
@@ -5048,8 +5043,6 @@ static void unaccount_event(struct perf_event *event)
schedule_delayed_work(&perf_sched_work, HZ);
}
- unaccount_event_cpu(event, event->cpu);
-
unaccount_pmu_sb_event(event);
}
@@ -11679,15 +11672,6 @@ static void account_pmu_sb_event(struct perf_event *event)
attach_sb_event(event);
}
-static void account_event_cpu(struct perf_event *event, int cpu)
-{
- if (event->parent)
- return;
-
- if (is_cgroup_event(event))
- atomic_inc(&per_cpu(perf_cgroup_events, cpu));
-}
-
/* Freq events need the tick to stay alive (see perf_event_task_tick). */
static void account_freq_event_nohz(void)
{
@@ -11775,8 +11759,6 @@ static void account_event(struct perf_event *event)
}
enabled:
- account_event_cpu(event, event->cpu);
-
account_pmu_sb_event(event);
}
@@ -12822,13 +12804,11 @@ static void __perf_pmu_remove(struct perf_event_context *ctx,
perf_event_groups_for_cpu_pmu(event, groups, cpu, pmu) {
perf_remove_from_context(event, 0);
- unaccount_event_cpu(event, cpu);
put_pmu_ctx(event->pmu_ctx);
list_add(&event->migrate_entry, events);
for_each_sibling_event(sibling, event) {
perf_remove_from_context(sibling, 0);
- unaccount_event_cpu(sibling, cpu);
put_pmu_ctx(sibling->pmu_ctx);
list_add(&sibling->migrate_entry, events);
}
@@ -12847,7 +12827,6 @@ static void __perf_pmu_install_event(struct pmu *pmu,
if (event->state >= PERF_EVENT_STATE_OFF)
event->state = PERF_EVENT_STATE_INACTIVE;
- account_event_cpu(event, cpu);
perf_install_in_context(ctx, event, cpu);
}
@@ -13742,8 +13721,7 @@ static int __perf_cgroup_move(void *info)
struct task_struct *task = info;
preempt_disable();
- if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
- perf_cgroup_switch(task);
+ perf_cgroup_switch(task);
preempt_enable();
return 0;
--
2.37.2
On 06-Dec-22 8:20 AM, Chengming Zhou wrote: > We encounter perf warnings when using cgroup events like: > ``` > cd /sys/fs/cgroup > mkdir test > perf stat -e cycles -a -G test > ``` > > WARNING: CPU: 0 PID: 690 at kernel/events/core.c:849 perf_cgroup_switch+0xb2/0xc0 > [ 91.393417] Call Trace: > [ 91.393772] <TASK> > [ 91.394080] __schedule+0x4ae/0x9f0 > [ 91.394535] ? _raw_spin_unlock_irqrestore+0x23/0x40 > [ 91.395145] ? __cond_resched+0x18/0x20 > [ 91.395622] preempt_schedule_common+0x2d/0x70 > [ 91.396163] __cond_resched+0x18/0x20 > [ 91.396621] wait_for_completion+0x2f/0x160 > [ 91.397137] ? cpu_stop_queue_work+0x9e/0x130 > [ 91.397665] affine_move_task+0x18a/0x4f0 nit: These timestamps can be removed in commit log. > > WARNING: CPU: 0 PID: 690 at kernel/events/core.c:829 ctx_sched_in+0x1cf/0x1e0 > [ 91.430151] Call Trace: > [ 91.430490] <TASK> > [ 91.430793] ? ctx_sched_out+0xb7/0x1b0 > [ 91.431274] perf_cgroup_switch+0x88/0xc0 > [ 91.431778] __schedule+0x4ae/0x9f0 > [ 91.432215] ? _raw_spin_unlock_irqrestore+0x23/0x40 > [ 91.432825] ? __cond_resched+0x18/0x20 > [ 91.433299] preempt_schedule_common+0x2d/0x70 > [ 91.433839] __cond_resched+0x18/0x20 > [ 91.434298] wait_for_completion+0x2f/0x160 > [ 91.434808] ? cpu_stop_queue_work+0x9e/0x130 > [ 91.435334] affine_move_task+0x18a/0x4f0 > > The above two warnings are not complete here since I remove other > unimportant information. 
The problem is caused by the perf cgroup > events tracking: > > CPU0 CPU1 > perf_event_open() > perf_event_alloc() > account_event() > account_event_cpu() > atomic_inc(perf_cgroup_events) > __perf_event_task_sched_out() > if (atomic_read(perf_cgroup_events)) > perf_cgroup_switch() > // kernel/events/core.c:849 > WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0) > if (READ_ONCE(cpuctx->cgrp) == cgrp) // false > return > perf_ctx_lock() > ctx_sched_out() > cpuctx->cgrp = cgrp > ctx_sched_in() > perf_cgroup_set_timestamp() > // kernel/events/core.c:829 > WARN_ON_ONCE(!ctx->nr_cgroups) > perf_ctx_unlock() > perf_install_in_context() > add_event_to_ctx() > list_add_event() > perf_cgroup_event_enable() > ctx->nr_cgroups++ > cpuctx->cgrp = X IIUC, since it's a cgroup event, perf_install_in_context() will do: cpu_function_call(cpu, __perf_install_in_context, event). And thus, callchain starting with add_event_to_ctx() will be executed on CPU1, not on CPU0. > We can see from above that we wrongly use percpu atomic perf_cgroup_events > to check if we need to perf_cgroup_switch(), which should only be used > when we know this CPU has cgroup events enabled. > > The commit bd2756811766 ("perf: Rewrite core context handling") change > to have only one context per-CPU, so we can just use cpuctx->cgrp to > check if this CPU has cgroup events enabled. > > So percpu atomic perf_cgroup_events is not needed. > > Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com> Fixes: bd2756811766 ("perf: Rewrite core context handling") Otherwise looks good. Tested-by: Ravi Bangoria <ravi.bangoria@amd.com> Thanks, Ravi
On 2022/12/7 18:41, Ravi Bangoria wrote: > On 06-Dec-22 8:20 AM, Chengming Zhou wrote: >> We encounter perf warnings when using cgroup events like: >> ``` >> cd /sys/fs/cgroup >> mkdir test >> perf stat -e cycles -a -G test >> ``` >> >> WARNING: CPU: 0 PID: 690 at kernel/events/core.c:849 perf_cgroup_switch+0xb2/0xc0 >> [ 91.393417] Call Trace: >> [ 91.393772] <TASK> >> [ 91.394080] __schedule+0x4ae/0x9f0 >> [ 91.394535] ? _raw_spin_unlock_irqrestore+0x23/0x40 >> [ 91.395145] ? __cond_resched+0x18/0x20 >> [ 91.395622] preempt_schedule_common+0x2d/0x70 >> [ 91.396163] __cond_resched+0x18/0x20 >> [ 91.396621] wait_for_completion+0x2f/0x160 >> [ 91.397137] ? cpu_stop_queue_work+0x9e/0x130 >> [ 91.397665] affine_move_task+0x18a/0x4f0 > > nit: These timestamps can be removed in commit log. Ok, will remove. > >> >> WARNING: CPU: 0 PID: 690 at kernel/events/core.c:829 ctx_sched_in+0x1cf/0x1e0 >> [ 91.430151] Call Trace: >> [ 91.430490] <TASK> >> [ 91.430793] ? ctx_sched_out+0xb7/0x1b0 >> [ 91.431274] perf_cgroup_switch+0x88/0xc0 >> [ 91.431778] __schedule+0x4ae/0x9f0 >> [ 91.432215] ? _raw_spin_unlock_irqrestore+0x23/0x40 >> [ 91.432825] ? __cond_resched+0x18/0x20 >> [ 91.433299] preempt_schedule_common+0x2d/0x70 >> [ 91.433839] __cond_resched+0x18/0x20 >> [ 91.434298] wait_for_completion+0x2f/0x160 >> [ 91.434808] ? cpu_stop_queue_work+0x9e/0x130 >> [ 91.435334] affine_move_task+0x18a/0x4f0 >> >> The above two warnings are not complete here since I remove other >> unimportant information. 
The problem is caused by the perf cgroup >> events tracking: >> >> CPU0 CPU1 >> perf_event_open() >> perf_event_alloc() >> account_event() >> account_event_cpu() >> atomic_inc(perf_cgroup_events) >> __perf_event_task_sched_out() >> if (atomic_read(perf_cgroup_events)) >> perf_cgroup_switch() >> // kernel/events/core.c:849 >> WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0) >> if (READ_ONCE(cpuctx->cgrp) == cgrp) // false >> return >> perf_ctx_lock() >> ctx_sched_out() >> cpuctx->cgrp = cgrp >> ctx_sched_in() >> perf_cgroup_set_timestamp() >> // kernel/events/core.c:829 >> WARN_ON_ONCE(!ctx->nr_cgroups) >> perf_ctx_unlock() >> perf_install_in_context() >> add_event_to_ctx() >> list_add_event() >> perf_cgroup_event_enable() >> ctx->nr_cgroups++ >> cpuctx->cgrp = X > > IIUC, since it's a cgroup event, perf_install_in_context() will do: > cpu_function_call(cpu, __perf_install_in_context, event). And thus, > callchain starting with add_event_to_ctx() will be executed on CPU1, > not on CPU0. Right, will fix it next version. > >> We can see from above that we wrongly use percpu atomic perf_cgroup_events >> to check if we need to perf_cgroup_switch(), which should only be used >> when we know this CPU has cgroup events enabled. >> >> The commit bd2756811766 ("perf: Rewrite core context handling") change >> to have only one context per-CPU, so we can just use cpuctx->cgrp to >> check if this CPU has cgroup events enabled. >> >> So percpu atomic perf_cgroup_events is not needed. >> >> Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com> > > Fixes: bd2756811766 ("perf: Rewrite core context handling") > > Otherwise looks good. > Tested-by: Ravi Bangoria <ravi.bangoria@amd.com> Ok, will add Fixes tag next version. Thanks! > > Thanks, > Ravi
© 2016 - 2025 Red Hat, Inc.