Previously it was only safe to call perf_pmu_unregister() if there
were no active events of that pmu around -- which was impossible to
guarantee since it races all sorts against perf_init_event().
Rework the whole thing by:
- keeping track of all events for a given pmu
- 'hiding' the pmu from perf_init_event()
- waiting for the appropriate (s)rcu grace periods such that all
prior references to the PMU will be completed
- detaching all still existing events of that pmu (see first point)
and moving them to a new REVOKED state.
- actually freeing the pmu data.
Where notably the new REVOKED state must inhibit all event actions
from reaching code that wants to use event->pmu.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
include/linux/perf_event.h | 15 +-
kernel/events/core.c | 266 ++++++++++++++++++++++++++++++++++++++++-----
2 files changed, 249 insertions(+), 32 deletions(-)
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -325,6 +325,9 @@ struct perf_output_handle;
struct pmu {
struct list_head entry;
+ spinlock_t events_lock;
+ struct list_head events;
+
struct module *module;
struct device *dev;
struct device *parent;
@@ -632,9 +635,10 @@ struct perf_addr_filter_range {
* enum perf_event_state - the states of an event:
*/
enum perf_event_state {
- PERF_EVENT_STATE_DEAD = -4,
- PERF_EVENT_STATE_EXIT = -3,
- PERF_EVENT_STATE_ERROR = -2,
+ PERF_EVENT_STATE_DEAD = -5,
+ PERF_EVENT_STATE_REVOKED = -4, /* pmu gone, must not touch */
+ PERF_EVENT_STATE_EXIT = -3, /* task died, still inherit */
+ PERF_EVENT_STATE_ERROR = -2, /* scheduling error, can enable */
PERF_EVENT_STATE_OFF = -1,
PERF_EVENT_STATE_INACTIVE = 0,
PERF_EVENT_STATE_ACTIVE = 1,
@@ -875,6 +879,7 @@ struct perf_event {
void *security;
#endif
struct list_head sb_list;
+ struct list_head pmu_list;
/*
* Certain events gets forwarded to another pmu internally by over-
@@ -1126,7 +1131,7 @@ extern void perf_aux_output_flag(struct
extern void perf_event_itrace_started(struct perf_event *event);
extern int perf_pmu_register(struct pmu *pmu, const char *name, int type);
-extern void perf_pmu_unregister(struct pmu *pmu);
+extern int perf_pmu_unregister(struct pmu *pmu);
extern void __perf_event_task_sched_in(struct task_struct *prev,
struct task_struct *task);
@@ -1737,7 +1742,7 @@ static inline bool needs_branch_stack(st
static inline bool has_aux(struct perf_event *event)
{
- return event->pmu->setup_aux;
+ return event->pmu && event->pmu->setup_aux;
}
static inline bool has_aux_action(struct perf_event *event)
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2418,7 +2418,9 @@ ctx_time_update_event(struct perf_event_
#define DETACH_GROUP 0x01UL
#define DETACH_CHILD 0x02UL
-#define DETACH_DEAD 0x04UL
+#define DETACH_EXIT 0x04UL
+#define DETACH_REVOKE 0x08UL
+#define DETACH_DEAD 0x10UL
/*
* Cross CPU call to remove a performance event
@@ -2433,6 +2435,7 @@ __perf_remove_from_context(struct perf_e
void *info)
{
struct perf_event_pmu_context *pmu_ctx = event->pmu_ctx;
+ enum perf_event_state state = PERF_EVENT_STATE_OFF;
unsigned long flags = (unsigned long)info;
ctx_time_update(cpuctx, ctx);
@@ -2441,16 +2444,22 @@ __perf_remove_from_context(struct perf_e
* Ensure event_sched_out() switches to OFF, at the very least
* this avoids raising perf_pending_task() at this time.
*/
- if (flags & DETACH_DEAD)
+ if (flags & DETACH_EXIT)
+ state = PERF_EVENT_STATE_EXIT;
+ if (flags & DETACH_REVOKE)
+ state = PERF_EVENT_STATE_REVOKED;
+ if (flags & DETACH_DEAD) {
event->pending_disable = 1;
+ state = PERF_EVENT_STATE_DEAD;
+ }
event_sched_out(event, ctx);
if (flags & DETACH_GROUP)
perf_group_detach(event);
if (flags & DETACH_CHILD)
perf_child_detach(event);
list_del_event(event, ctx);
- if (flags & DETACH_DEAD)
- event->state = PERF_EVENT_STATE_DEAD;
+
+ event->state = min(event->state, state);
if (!pmu_ctx->nr_events) {
pmu_ctx->rotate_necessary = 0;
@@ -4510,7 +4519,8 @@ static void perf_event_enable_on_exec(st
static void perf_remove_from_owner(struct perf_event *event);
static void perf_event_exit_event(struct perf_event *event,
- struct perf_event_context *ctx);
+ struct perf_event_context *ctx,
+ bool revoke);
/*
* Removes all events from the current task that have been marked
@@ -4537,7 +4547,7 @@ static void perf_event_remove_on_exec(st
modified = true;
- perf_event_exit_event(event, ctx);
+ perf_event_exit_event(event, ctx, false);
}
raw_spin_lock_irqsave(&ctx->lock, flags);
@@ -5137,6 +5147,7 @@ static bool is_sb_event(struct perf_even
attr->context_switch || attr->text_poke ||
attr->bpf_event)
return true;
+
return false;
}
@@ -5338,6 +5349,8 @@ static void perf_pending_task_sync(struc
/* vs perf_event_alloc() error */
static void __free_event(struct perf_event *event)
{
+ struct pmu *pmu = event->pmu;
+
if (event->attach_state & PERF_ATTACH_CALLCHAIN)
put_callchain_buffers();
@@ -5364,6 +5377,7 @@ static void __free_event(struct perf_eve
* put_pmu_ctx() needs an event->ctx reference, because of
* epc->ctx.
*/
+ WARN_ON_ONCE(!pmu);
WARN_ON_ONCE(!event->ctx);
WARN_ON_ONCE(event->pmu_ctx->ctx != event->ctx);
put_pmu_ctx(event->pmu_ctx);
@@ -5376,8 +5390,13 @@ static void __free_event(struct perf_eve
if (event->ctx)
put_ctx(event->ctx);
- if (event->pmu)
- module_put(event->pmu->module);
+ if (pmu) {
+ module_put(pmu->module);
+ scoped_guard (spinlock, &pmu->events_lock) {
+ list_del(&event->pmu_list);
+ wake_up_var(pmu);
+ }
+ }
call_rcu(&event->rcu_head, free_event_rcu);
}
@@ -5525,7 +5544,11 @@ int perf_event_release_kernel(struct per
* Thus this guarantees that we will in fact observe and kill _ALL_
* child events.
*/
- perf_remove_from_context(event, DETACH_GROUP|DETACH_DEAD);
+ if (event->state > PERF_EVENT_STATE_REVOKED) {
+ perf_remove_from_context(event, DETACH_GROUP|DETACH_DEAD);
+ } else {
+ event->state = PERF_EVENT_STATE_DEAD;
+ }
perf_event_ctx_unlock(event, ctx);
@@ -5578,7 +5601,7 @@ int perf_event_release_kernel(struct per
mutex_unlock(&ctx->mutex);
if (child)
- free_event(child);
+ put_event(child);
put_ctx(ctx);
goto again;
@@ -5813,7 +5836,7 @@ __perf_read(struct perf_event *event, ch
* error state (i.e. because it was pinned but it couldn't be
* scheduled on to the CPU at some point).
*/
- if (event->state == PERF_EVENT_STATE_ERROR)
+ if (event->state <= PERF_EVENT_STATE_ERROR)
return 0;
if (count < event->read_size)
@@ -5852,8 +5875,14 @@ static __poll_t perf_poll(struct file *f
struct perf_buffer *rb;
__poll_t events = EPOLLHUP;
+ if (event->state <= PERF_EVENT_STATE_REVOKED)
+ return EPOLLERR;
+
poll_wait(file, &event->waitq, wait);
+ if (event->state <= PERF_EVENT_STATE_REVOKED)
+ return EPOLLERR;
+
if (is_event_hup(event))
return events;
@@ -6027,6 +6056,9 @@ static long _perf_ioctl(struct perf_even
void (*func)(struct perf_event *);
u32 flags = arg;
+ if (event->state <= PERF_EVENT_STATE_REVOKED)
+ return -ENODEV;
+
switch (cmd) {
case PERF_EVENT_IOC_ENABLE:
func = _perf_event_enable;
@@ -6412,7 +6444,7 @@ static void perf_mmap_open(struct vm_are
if (vma->vm_pgoff)
atomic_inc(&event->rb->aux_mmap_count);
- if (event->pmu->event_mapped)
+ if (event->pmu && event->pmu->event_mapped)
event->pmu->event_mapped(event, vma->vm_mm);
}
@@ -6435,7 +6467,8 @@ static void perf_mmap_close(struct vm_ar
unsigned long size = perf_data_size(rb);
bool detach_rest = false;
- if (event->pmu->event_unmapped)
+ /* FIXIES vs perf_pmu_unregister() */
+ if (event->pmu && event->pmu->event_unmapped)
event->pmu->event_unmapped(event, vma->vm_mm);
/*
@@ -6659,6 +6692,16 @@ static int perf_mmap(struct file *file,
mutex_lock(&event->mmap_mutex);
ret = -EINVAL;
+ /*
+ * This relies on __pmu_detach_event() taking mmap_mutex after marking
+ * the event REVOKED. Either we observe the state, or __pmu_detach_event()
+ * will detach the rb created here.
+ */
+ if (event->state <= PERF_EVENT_STATE_REVOKED) {
+ ret = -ENODEV;
+ goto unlock;
+ }
+
if (vma->vm_pgoff == 0) {
nr_pages -= 1;
@@ -6837,7 +6880,7 @@ static int perf_mmap(struct file *file,
if (!ret)
ret = map_range(rb, vma);
- if (!ret && event->pmu->event_mapped)
+ if (!ret && event->pmu && event->pmu->event_mapped)
event->pmu->event_mapped(event, vma->vm_mm);
return ret;
@@ -6849,6 +6892,9 @@ static int perf_fasync(int fd, struct fi
struct perf_event *event = filp->private_data;
int retval;
+ if (event->state <= PERF_EVENT_STATE_REVOKED)
+ return -ENODEV;
+
inode_lock(inode);
retval = fasync_helper(fd, filp, on, &event->fasync);
inode_unlock(inode);
@@ -10813,6 +10859,9 @@ static int __perf_event_set_bpf_prog(str
{
bool is_kprobe, is_uprobe, is_tracepoint, is_syscall_tp;
+ if (event->state <= PERF_EVENT_STATE_REVOKED)
+ return -ENODEV;
+
if (!perf_event_is_tracing(event))
return perf_event_set_bpf_handler(event, prog, bpf_cookie);
@@ -11997,6 +12046,9 @@ int perf_pmu_register(struct pmu *_pmu,
if (!pmu->event_idx)
pmu->event_idx = perf_event_idx_default;
+ INIT_LIST_HEAD(&pmu->events);
+ spin_lock_init(&pmu->events_lock);
+
/*
* Now that the PMU is complete, make it visible to perf_try_init_event().
*/
@@ -12010,21 +12062,143 @@ int perf_pmu_register(struct pmu *_pmu,
}
EXPORT_SYMBOL_GPL(perf_pmu_register);
-void perf_pmu_unregister(struct pmu *pmu)
+static void __pmu_detach_event(struct pmu *pmu, struct perf_event *event,
+ struct perf_event_context *ctx)
+{
+ /*
+ * De-schedule the event and mark it REVOKED.
+ */
+ perf_event_exit_event(event, ctx, true);
+
+ /*
+ * All _free_event() bits that rely on event->pmu:
+ *
+ * Notably, perf_mmap() relies on the ordering here.
+ */
+ scoped_guard (mutex, &event->mmap_mutex) {
+ WARN_ON_ONCE(pmu->event_unmapped);
+ /*
+ * Mostly an empty lock sequence, such that perf_mmap(), which
+ * relies on mmap_mutex, is sure to observe the state change.
+ */
+ }
+
+ perf_event_free_bpf_prog(event);
+ perf_free_addr_filters(event);
+
+ if (event->destroy) {
+ event->destroy(event);
+ event->destroy = NULL;
+ }
+
+ if (event->pmu_ctx) {
+ put_pmu_ctx(event->pmu_ctx);
+ event->pmu_ctx = NULL;
+ }
+
+ exclusive_event_destroy(event);
+ module_put(pmu->module);
+
+ event->pmu = NULL; /* force fault instead of UAF */
+}
+
+static void pmu_detach_event(struct pmu *pmu, struct perf_event *event)
+{
+ struct perf_event_context *ctx;
+
+ ctx = perf_event_ctx_lock(event);
+ __pmu_detach_event(pmu, event, ctx);
+ perf_event_ctx_unlock(event, ctx);
+
+ scoped_guard (spinlock, &pmu->events_lock)
+ list_del(&event->pmu_list);
+}
+
+static struct perf_event *pmu_get_event(struct pmu *pmu)
+{
+ struct perf_event *event;
+
+ guard(spinlock)(&pmu->events_lock);
+ list_for_each_entry(event, &pmu->events, pmu_list) {
+ if (atomic_long_inc_not_zero(&event->refcount))
+ return event;
+ }
+
+ return NULL;
+}
+
+static bool pmu_empty(struct pmu *pmu)
+{
+ guard(spinlock)(&pmu->events_lock);
+ return list_empty(&pmu->events);
+}
+
+static void pmu_detach_events(struct pmu *pmu)
+{
+ struct perf_event *event;
+
+ for (;;) {
+ event = pmu_get_event(pmu);
+ if (!event)
+ break;
+
+ pmu_detach_event(pmu, event);
+ put_event(event);
+ }
+
+ /*
+ * wait for pending _free_event()s
+ */
+ wait_var_event(pmu, pmu_empty(pmu));
+}
+
+int perf_pmu_unregister(struct pmu *pmu)
{
scoped_guard (mutex, &pmus_lock) {
+ if (!idr_cmpxchg(&pmu_idr, pmu->type, pmu, NULL))
+ return -EINVAL;
+
list_del_rcu(&pmu->entry);
- idr_remove(&pmu_idr, pmu->type);
}
/*
* We dereference the pmu list under both SRCU and regular RCU, so
* synchronize against both of those.
+ *
+ * Notably, the entirety of event creation, from perf_init_event()
+ * (which will now fail, because of the above) until
+ * perf_install_in_context() should be under SRCU such that
+ * this synchronizes against event creation. This avoids trying to
+ * detach events that are not fully formed.
*/
synchronize_srcu(&pmus_srcu);
synchronize_rcu();
+ if (pmu->event_unmapped && !pmu_empty(pmu)) {
+ /*
+ * Can't force remove events when pmu::event_unmapped()
+ * is used in perf_mmap_close().
+ */
+ guard(mutex)(&pmus_lock);
+ idr_cmpxchg(&pmu_idr, pmu->type, NULL, pmu);
+ list_add_rcu(&pmu->entry, &pmus);
+ return -EBUSY;
+ }
+
+ scoped_guard (mutex, &pmus_lock)
+ idr_remove(&pmu_idr, pmu->type);
+
+ /*
+ * PMU is removed from the pmus list, so no new events will
+ * be created, now take care of the existing ones.
+ */
+ pmu_detach_events(pmu);
+
+ /*
+ * PMU is unused, make it go away.
+ */
perf_pmu_free(pmu);
+ return 0;
}
EXPORT_SYMBOL_GPL(perf_pmu_unregister);
@@ -12107,7 +12281,7 @@ static struct pmu *perf_init_event(struc
struct pmu *pmu;
int type, ret;
- guard(srcu)(&pmus_srcu);
+ guard(srcu)(&pmus_srcu); /* pmu idr/list access */
/*
* Save original type before calling pmu->event_init() since certain
@@ -12331,6 +12505,7 @@ perf_event_alloc(struct perf_event_attr
INIT_LIST_HEAD(&event->active_entry);
INIT_LIST_HEAD(&event->addr_filters.list);
INIT_HLIST_NODE(&event->hlist_entry);
+ INIT_LIST_HEAD(&event->pmu_list);
init_waitqueue_head(&event->waitq);
@@ -12498,6 +12673,13 @@ perf_event_alloc(struct perf_event_attr
/* symmetric to unaccount_event() in _free_event() */
account_event(event);
+ /*
+ * Event creation should be under SRCU, see perf_pmu_unregister().
+ */
+ lockdep_assert_held(&pmus_srcu);
+ scoped_guard (spinlock, &pmu->events_lock)
+ list_add(&event->pmu_list, &pmu->events);
+
return_ptr(event);
}
@@ -12697,6 +12879,9 @@ perf_event_set_output(struct perf_event
goto unlock;
if (output_event) {
+ if (output_event->state <= PERF_EVENT_STATE_REVOKED)
+ goto unlock;
+
/* get the rb we want to redirect to */
rb = ring_buffer_get(output_event);
if (!rb)
@@ -12878,6 +13063,11 @@ SYSCALL_DEFINE5(perf_event_open,
if (event_fd < 0)
return event_fd;
+ /*
+ * Event creation should be under SRCU, see perf_pmu_unregister().
+ */
+ guard(srcu)(&pmus_srcu);
+
CLASS(fd, group)(group_fd); // group_fd == -1 => empty
if (group_fd != -1) {
if (!is_perf_file(group)) {
@@ -12885,6 +13075,10 @@ SYSCALL_DEFINE5(perf_event_open,
goto err_fd;
}
group_leader = fd_file(group)->private_data;
+ if (group_leader->state <= PERF_EVENT_STATE_REVOKED) {
+ err = -ENODEV;
+ goto err_fd;
+ }
if (flags & PERF_FLAG_FD_OUTPUT)
output_event = group_leader;
if (flags & PERF_FLAG_FD_NO_GROUP)
@@ -13218,6 +13412,11 @@ perf_event_create_kernel_counter(struct
if (attr->aux_output || attr->aux_action)
return ERR_PTR(-EINVAL);
+ /*
+ * Event creation should be under SRCU, see perf_pmu_unregister().
+ */
+ guard(srcu)(&pmus_srcu);
+
event = perf_event_alloc(attr, cpu, task, NULL, NULL,
overflow_handler, context, -1);
if (IS_ERR(event)) {
@@ -13429,10 +13628,11 @@ static void sync_child_event(struct perf
}
static void
-perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx)
+perf_event_exit_event(struct perf_event *event,
+ struct perf_event_context *ctx, bool revoke)
{
struct perf_event *parent_event = event->parent;
- unsigned long detach_flags = 0;
+ unsigned long detach_flags = DETACH_EXIT;
if (parent_event) {
/*
@@ -13447,16 +13647,14 @@ perf_event_exit_event(struct perf_event
* Do destroy all inherited groups, we don't care about those
* and being thorough is better.
*/
- detach_flags = DETACH_GROUP | DETACH_CHILD;
+ detach_flags |= DETACH_GROUP | DETACH_CHILD;
mutex_lock(&parent_event->child_mutex);
}
- perf_remove_from_context(event, detach_flags);
+ if (revoke)
+ detach_flags |= DETACH_GROUP | DETACH_REVOKE;
- raw_spin_lock_irq(&ctx->lock);
- if (event->state > PERF_EVENT_STATE_EXIT)
- perf_event_set_state(event, PERF_EVENT_STATE_EXIT);
- raw_spin_unlock_irq(&ctx->lock);
+ perf_remove_from_context(event, detach_flags);
/*
* Child events can be freed.
@@ -13467,7 +13665,13 @@ perf_event_exit_event(struct perf_event
* Kick perf_poll() for is_event_hup();
*/
perf_event_wakeup(parent_event);
- free_event(event);
+ /*
+ * pmu_detach_event() will have an extra refcount.
+ */
+ if (revoke)
+ put_event(event);
+ else
+ free_event(event);
put_event(parent_event);
return;
}
@@ -13532,7 +13736,7 @@ static void perf_event_exit_task_context
perf_event_task(child, child_ctx, 0);
list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
- perf_event_exit_event(child_event, child_ctx);
+ perf_event_exit_event(child_event, child_ctx, false);
mutex_unlock(&child_ctx->mutex);
@@ -13722,6 +13926,14 @@ inherit_event(struct perf_event *parent_
if (parent_event->parent)
parent_event = parent_event->parent;
+ if (parent_event->state <= PERF_EVENT_STATE_REVOKED)
+ return NULL;
+
+ /*
+ * Event creation should be under SRCU, see perf_pmu_unregister().
+ */
+ guard(srcu)(&pmus_srcu);
+
child_event = perf_event_alloc(&parent_event->attr,
parent_event->cpu,
child,
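
A side note on the API change, not part of the patch: since perf_pmu_unregister()
now returns an error, a module-backed PMU's teardown has to be prepared for it to
fail. A minimal, purely hypothetical caller sketch -- 'tiny_pmu' and
'tiny_pmu_teardown' are made-up names, and the pmu is assumed to have been
registered with perf_pmu_register() earlier:

static int tiny_pmu_teardown(void)
{
	int ret = perf_pmu_unregister(&tiny_pmu);

	if (ret == -EBUSY) {
		/*
		 * Events with ->event_unmapped() are still around; the pmu
		 * has been put back on the list, so its data must not be
		 * freed and the teardown can only be retried later.
		 */
		return ret;
	}
	if (ret)	/* -EINVAL: pmu was not (or no longer) registered */
		return ret;

	/* 0: remaining events were revoked and the pmu data was freed */
	return 0;
}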
On 05-Feb-25 3:51 PM, Peter Zijlstra wrote:
> Previously it was only safe to call perf_pmu_unregister() if there
> were no active events of that pmu around -- which was impossible to
> guarantee since it races all sorts against perf_init_event().
>
> Rework the whole thing by:
>
> - keeping track of all events for a given pmu
>
> - 'hiding' the pmu from perf_init_event()
>
> - waiting for the appropriate (s)rcu grace periods such that all
> prior references to the PMU will be completed
>
> - detaching all still existing events of that pmu (see first point)
> and moving them to a new REVOKED state.
>
> - actually freeing the pmu data.
>
> Where notably the new REVOKED state must inhibit all event actions
> from reaching code that wants to use event->pmu.
>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Another race between perf_event_init_task() failure path and
perf_pmu_unregister():
  CPU 1                                             CPU 2

  perf_event_init_task()
    perf_event_free_task()
      perf_free_event(event1)
      /* event1->refcount is 1 */
                                                    perf_pmu_unregister()
                                                      pmu_detach_events()
                                                        pmu_get_event(pmu) /* Picks event1 */
                                                          atomic_long_inc_not_zero(&event1->refcount) /* event1 */
      /* event1->refcount became 2 (by CPU 2) */
      free_event(event1)
        WARN()
Thanks,
Ravi
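
For readers following the refcount flow: the WARN() above is free_event()'s
strict last-reference check. Roughly, paraphrasing current upstream (for
reference only, not part of the patch), the two teardown helpers are:

static void free_event(struct perf_event *event)
{
	if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1,
		 "unexpected event refcount: %ld; ptr=%p",
		 atomic_long_read(&event->refcount), event)) {
		/* leak to avoid use-after-free */
		return;
	}

	_free_event(event);
}

static void put_event(struct perf_event *event)
{
	if (!atomic_long_dec_and_test(&event->refcount))
		return;

	_free_event(event);
}

So once pmu_get_event() has taken its extra reference on CPU 2, any concurrent
path that still uses free_event() instead of put_event() trips that WARN, which
is exactly what the interleave above shows.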
On Mon, Feb 10, 2025 at 12:29:21PM +0530, Ravi Bangoria wrote:

> Another race between perf_event_init_task() failure path and
> perf_pmu_unregister():
>
>   CPU 1                                             CPU 2
>
>   perf_event_init_task()
>     perf_event_free_task()
>       perf_free_event(event1)
>       /* event1->refcount is 1 */
>                                                     perf_pmu_unregister()
>                                                       pmu_detach_events()
>                                                         pmu_get_event(pmu) /* Picks event1 */
>                                                           atomic_long_inc_not_zero(&event1->refcount) /* event1 */
>       /* event1->refcount became 2 (by CPU 2) */
>       free_event(event1)
>         WARN()

I've unified perf_event_free_task() and perf_event_exit_task(). This
makes both use perf_event_exit_event() and as such should no longer
have this free_event().
> @@ -13467,7 +13665,13 @@ perf_event_exit_event(struct perf_event
> * Kick perf_poll() for is_event_hup();
> */
> perf_event_wakeup(parent_event);
> - free_event(event);
> + /*
> + * pmu_detach_event() will have an extra refcount.
> + */
> + if (revoke)
> + put_event(event);
> + else
> + free_event(event);
> put_event(parent_event);
> return;
> }
There is a race between do_exit() and perf_pmu_unregister():
Assume: a Parent process with 'event1' and a Child process with an
inherited 'event2'.
   CPU 1                                                   CPU 2

     Child process exiting ...
 1   do_exit()
 2     perf_event_exit_task()
 3       perf_event_exit_task_context()
 4         mutex_lock(&child_ctx->mutex);
 5         list_for_each_entry_safe(&child_ctx->event_list)
 6           /* picks event2. event2->refcount is 1 */
 7                                                         perf_pmu_unregister()
 8                                                           pmu_detach_events()
 9                                                             pmu_get_event(pmu) /* Picks event2 */
10                                                               atomic_long_inc_not_zero(&event->refcount) /* event2 */
11                                                               /* event2->refcount becomes 2 */
12                                                             pmu_detach_event() /* event2 */
13                                                               perf_event_ctx_lock(); /* event2 */
14           /* event2->refcount became 2 (by CPU 2) */          .
15           perf_event_exit_event() /* event2 */                . /* CPU 1 is holding ctx->mutex of
16             if (parent_event) { /* true */                    .    child process. Waiting ... */
17               if (revoke) /* false */                         .
18               else                                            .
19                 free_event(); /* event2 */                    .
20                   WARN()                                      .
                     ^^^^^^
This race manifests as:
unexpected event refcount: 2; ptr=00000000c6ca2049
WARNING: CPU: 57 PID: 9729 at kernel/events/core.c:5439 free_event+0x48/0x60
RIP: 0010:free_event+0x48/0x60
Call Trace:
? free_event+0x48/0x60
perf_event_exit_event.isra.0+0x60/0xf0
perf_event_exit_task+0x1e1/0x280
do_exit+0x327/0xb00
The race continues... now, with the stale child2->parent pointer:
   CPU 1                                                   CPU 2

15           perf_event_exit_event() /* event2 */                . /* CPU 1 is holding ctx->mutex of
16             if (parent_event) { /* true */                    .    child process. Waiting ... */
17               if (revoke) /* false */                         .
18               else                                            .
19                 free_event(); /* event2 */                    .
20                   WARN()                                      .
21             put_event(parent_event); /* event1 */             .
22             /* event1->refcount becomes 1 */                  .
23           }                                                   .
24  +-     mutex_unlock(&child_ctx->mutex);                      . /* Acquired child's ctx->mutex */
25                                                         __pmu_detach_event() /* event2 */
26                                                           perf_event_exit_event() /* event2 */
27                                                             if (parent_event) { /* true, BUT FALSE POSITIVE. */
28                                                               if (revoke) /* true */
29                                                                 put_event(); /* event2 */
30                                                                 /* event2->refcount becomes 1 */
31                                                               put_event(parent_event); /* event1 */
32                                                               /* event1->refcount becomes 0 */
                                                                 ^^^^^^^^^^^^^^^^^^^^^^^^^^
This can manifest as some random bug. I guess, perf_event_exit_event()
should also check for (event->attach_state & PERF_ATTACH_CHILD) when
event->parent != NULL.
Thanks,
Ravi
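
A note on why the parent pointer goes stale in the first place:
perf_child_detach() (paraphrased from current upstream, for reference) clears
PERF_ATTACH_CHILD and unlinks the child from the parent's child_list, but
leaves event->parent set:

static void perf_child_detach(struct perf_event *event)
{
	struct perf_event *parent_event = event->parent;

	if (!(event->attach_state & PERF_ATTACH_CHILD))
		return;

	event->attach_state &= ~PERF_ATTACH_CHILD;

	if (WARN_ON_ONCE(!parent_event))
		return;

	lockdep_assert_held(&parent_event->child_mutex);

	sync_child_event(event);
	list_del_init(&event->child_list);
	/* event->parent stays set here */
}

That is why testing event->attach_state & PERF_ATTACH_CHILD, as suggested
above, can tell a still-attached child apart from one that has already been
detached, while event->parent alone cannot.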
On Mon, Feb 10, 2025 at 12:12:40PM +0530, Ravi Bangoria wrote:
> > @@ -13467,7 +13665,13 @@ perf_event_exit_event(struct perf_event
> > * Kick perf_poll() for is_event_hup();
> > */
> > perf_event_wakeup(parent_event);
> > - free_event(event);
> > + /*
> > + * pmu_detach_event() will have an extra refcount.
> > + */
> > + if (revoke)
> > + put_event(event);
> > + else
> > + free_event(event);
> > put_event(parent_event);
> > return;
> > }
>
> There is a race between do_exit() and perf_pmu_unregister():
>
> Assume: a Parent process with 'event1' and a Child process with an
> inherited 'event2'.
>
> CPU 1 CPU 2
>
> Child process exiting ...
> 1 do_exit()
> 2 perf_event_exit_task()
> 3 perf_event_exit_task_context()
> 4 mutex_lock(&child_ctx->mutex);
> 5 list_for_each_entry_safe(&child_ctx->event_list)
> 6 /* picks event2. event2->refcount is 1 */
> 7 perf_pmu_unregister()
> 8 pmu_detach_events()
> 9 pmu_get_event(pmu) /* Picks event2 */
> 10 atomic_long_inc_not_zero(&event->refcount) /* event2 */
> 11 /* event2->refcount becomes 2 */
> 12 pmu_detach_event() /* event2 */
> 13 perf_event_ctx_lock(); /* event2 */
> 14 /* event2->refcount became 2 (by CPU 2) */ .
> 15 perf_event_exit_event() /* event2 */ . /* CPU 1 is holding ctx->mutex of
> 16 if (parent_event) { /* true */ . child process. Waiting ... */
> 17 if (revoke) /* false */ .
> 18 else .
> 19 free_event(); /* event2 */ .
> 20 WARN() .
>
> ^^^^^^
>
> This race manifests as:
>
> unexpected event refcount: 2; ptr=00000000c6ca2049
> WARNING: CPU: 57 PID: 9729 at kernel/events/core.c:5439 free_event+0x48/0x60
> RIP: 0010:free_event+0x48/0x60
> Call Trace:
> ? free_event+0x48/0x60
> perf_event_exit_event.isra.0+0x60/0xf0
> perf_event_exit_task+0x1e1/0x280
> do_exit+0x327/0xb00
>
> The race continues... now, with the stale child2->parent pointer:
>
> CPU 1 CPU 2
>
> 15 perf_event_exit_event() /* event2 */ . /* CPU 1 is holding ctx->mutex of
> 16 if (parent_event) { /* true */ . child process. Waiting ... */
> 17 if (revoke) /* false */ .
> 18 else .
> 19 free_event(); /* event2 */ .
> 20 WARN() .
> 21 put_event(parent_event); /* event1 */ .
> 22 /* event1->refcount becomes 1 */ .
> 23 } .
> 24 +- mutex_unlock(&child_ctx->mutex); . /* Acquired child's ctx->mutex */
> 25 __pmu_detach_event() /* event2 */
> 26 perf_event_exit_event() /* event2 */
> 27 if (parent_event) { /* true, BUT FALSE POSITIVE. */
> 28 if (revoke) /* true */
> 29 put_event(); /* event2 */
> 30 /* event2->refcount becomes 1 */
> 31 put_event(parent_event); /* event1 */
> 32 /* event1->refcount becomes 0 */
>
> ^^^^^^^^^^^^^^^^^^^^^^^^^^
>
> This can manifest as some random bug. I guess, perf_event_exit_event()
> should also check for (event->attach_state & PERF_ATTACH_CHILD) when
> event->parent != NULL.
Does this work?
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2303,6 +2303,7 @@ static void perf_child_detach(struct per
sync_child_event(event);
list_del_init(&event->child_list);
+ event->parent = NULL;
}
static bool is_orphaned_event(struct perf_event *event)
> Does this work?
>
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -2303,6 +2303,7 @@ static void perf_child_detach(struct per
>
> sync_child_event(event);
> list_del_init(&event->child_list);
> + event->parent = NULL;
> }
>
> static bool is_orphaned_event(struct perf_event *event)
Apparently not, it ends up with:
------------[ cut here ]------------
WARNING: CPU: 145 PID: 5459 at kernel/events/core.c:281 event_function+0xd2/0xf0
Modules linked in: [...]
CPU: 145 UID: 1002 PID: 5459 Comm: perf_fuzzer Kdump: loaded Not tainted 6.14.0-rc1-pmu-unregister+ #244
Hardware name: AMD Corporation RUBY/RUBY, BIOS RRR1009C 07/21/2023
RIP: 0010:event_function+0xd2/0xf0
Code: [...]
RSP: 0018:ffa000000647fac8 EFLAGS: 00010046
RAX: 0000000000000000 RBX: ff1100100d4b0ee0 RCX: 0000000000000000
RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000
RBP: ffa000000647faf0 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000000 R12: ffa000000647fbb8
R13: 0000000000000000 R14: ff11000104cc9e00 R15: ff110001216fcec0
FS: 000070acabcae740(0000) GS:ff1100100d480000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: ffffffffff600000 CR3: 000000011d920005 CR4: 0000000000f71ff0
PKRU: 55555554
Call Trace:
<TASK>
? show_regs+0x6c/0x80
? __warn+0x8d/0x150
? event_function+0xd2/0xf0
? report_bug+0x182/0x1b0
? handle_bug+0x6e/0xb0
? exc_invalid_op+0x18/0x80
? asm_exc_invalid_op+0x1b/0x20
? event_function+0xd2/0xf0
? event_function+0x3d/0xf0
remote_function+0x4f/0x70
? __pfx_remote_function+0x10/0x10
generic_exec_single+0x7f/0x160
smp_call_function_single+0x110/0x160
? __pfx_remote_function+0x10/0x10
? ktime_get_ts64+0x49/0x150
event_function_call+0x98/0x1d0
? __pfx___perf_event_disable+0x10/0x10
? __pfx___perf_event_disable+0x10/0x10
? __pfx_event_function+0x10/0x10
? __pfx__perf_event_disable+0x10/0x10
_perf_event_disable+0x41/0x70
perf_event_for_each_child+0x40/0x90
? __pfx__perf_event_disable+0x10/0x10
_perf_ioctl+0xac/0xb00
? srso_alias_return_thunk+0x5/0xfbef5
? place_entity+0xa7/0x170
? srso_alias_return_thunk+0x5/0xfbef5
? __x2apic_send_IPI_dest+0x32/0x50
? srso_alias_return_thunk+0x5/0xfbef5
? x2apic_send_IPI+0x26/0x40
? srso_alias_return_thunk+0x5/0xfbef5
? native_send_call_func_single_ipi+0x13/0x20
? srso_alias_return_thunk+0x5/0xfbef5
? __smp_call_single_queue+0xf7/0x180
? srso_alias_return_thunk+0x5/0xfbef5
? generic_exec_single+0x38/0x160
? srso_alias_return_thunk+0x5/0xfbef5
? smp_call_function_single_async+0x22/0x50
? srso_alias_return_thunk+0x5/0xfbef5
? srso_alias_return_thunk+0x5/0xfbef5
? mutex_lock+0x12/0x50
perf_ioctl+0x45/0x80
__x64_sys_ioctl+0xa7/0xe0
x64_sys_call+0x131e/0x2650
do_syscall_64+0x7e/0x170
? srso_alias_return_thunk+0x5/0xfbef5
? rcu_core+0x1d1/0x3a0
? srso_alias_return_thunk+0x5/0xfbef5
? rcu_core_si+0xe/0x20
? srso_alias_return_thunk+0x5/0xfbef5
? handle_softirqs+0xe7/0x330
? srso_alias_return_thunk+0x5/0xfbef5
? srso_alias_return_thunk+0x5/0xfbef5
? irqentry_exit_to_user_mode+0x43/0x250
? srso_alias_return_thunk+0x5/0xfbef5
? irqentry_exit+0x43/0x50
? srso_alias_return_thunk+0x5/0xfbef5
? sysvec_apic_timer_interrupt+0x4f/0xc0
entry_SYSCALL_64_after_hwframe+0x76/0x7e
RIP: 0033:0x70acabb24ded
Code: [...]
RSP: 002b:00007fffb3dd4a40 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
RAX: ffffffffffffffda RBX: 00007fffb3dd6f28 RCX: 000070acabb24ded
RDX: 0000000066f26a61 RSI: 0000000000002401 RDI: 0000000000000013
RBP: 00007fffb3dd4a90 R08: 000070acabc03084 R09: 000070acabc030a0
R10: 0000000000000008 R11: 0000000000000246 R12: 0000000000000001
R13: 0000000000000000 R14: 00005e38a9ec5b60 R15: 000070acabd00000
</TASK>
---[ end trace 0000000000000000 ]---
------------[ cut here ]------------
WARNING: CPU: 145 PID: 5459 at kernel/events/core.c:286 event_function+0xd6/0xf0
Modules linked in: [...]
CPU: 145 UID: 1002 PID: 5459 Comm: perf_fuzzer Kdump: loaded Tainted: G W 6.14.0-rc1-pmu-unregister+ #244
Tainted: [W]=WARN
Hardware name: AMD Corporation RUBY/RUBY, BIOS RRR1009C 07/21/2023
RIP: 0010:event_function+0xd6/0xf0
Code: [...]
RSP: 0018:ffa000000647fac8 EFLAGS: 00010086
RAX: 0000000000000000 RBX: ff1100100d4b0ee0 RCX: 0000000000000000
RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000
RBP: ffa000000647faf0 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000000 R12: ffa000000647fbb8
R13: 0000000000000000 R14: ff11000104cc9e00 R15: ff110001216fcec0
FS: 000070acabcae740(0000) GS:ff1100100d480000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: ffffffffff600000 CR3: 000000011d920005 CR4: 0000000000f71ff0
PKRU: 55555554
Call Trace:
<TASK>
? show_regs+0x6c/0x80
? __warn+0x8d/0x150
? event_function+0xd6/0xf0
? report_bug+0x182/0x1b0
? handle_bug+0x6e/0xb0
? exc_invalid_op+0x18/0x80
? asm_exc_invalid_op+0x1b/0x20
? event_function+0xd6/0xf0
? event_function+0x3d/0xf0
remote_function+0x4f/0x70
? __pfx_remote_function+0x10/0x10
generic_exec_single+0x7f/0x160
smp_call_function_single+0x110/0x160
? __pfx_remote_function+0x10/0x10
? ktime_get_ts64+0x49/0x150
event_function_call+0x98/0x1d0
? __pfx___perf_event_disable+0x10/0x10
? __pfx___perf_event_disable+0x10/0x10
? __pfx_event_function+0x10/0x10
? __pfx__perf_event_disable+0x10/0x10
_perf_event_disable+0x41/0x70
perf_event_for_each_child+0x40/0x90
? __pfx__perf_event_disable+0x10/0x10
_perf_ioctl+0xac/0xb00
? srso_alias_return_thunk+0x5/0xfbef5
? place_entity+0xa7/0x170
? srso_alias_return_thunk+0x5/0xfbef5
? __x2apic_send_IPI_dest+0x32/0x50
? srso_alias_return_thunk+0x5/0xfbef5
? x2apic_send_IPI+0x26/0x40
? srso_alias_return_thunk+0x5/0xfbef5
? native_send_call_func_single_ipi+0x13/0x20
? srso_alias_return_thunk+0x5/0xfbef5
? __smp_call_single_queue+0xf7/0x180
? srso_alias_return_thunk+0x5/0xfbef5
? generic_exec_single+0x38/0x160
? srso_alias_return_thunk+0x5/0xfbef5
? smp_call_function_single_async+0x22/0x50
? srso_alias_return_thunk+0x5/0xfbef5
? srso_alias_return_thunk+0x5/0xfbef5
? mutex_lock+0x12/0x50
perf_ioctl+0x45/0x80
__x64_sys_ioctl+0xa7/0xe0
x64_sys_call+0x131e/0x2650
do_syscall_64+0x7e/0x170
? srso_alias_return_thunk+0x5/0xfbef5
? rcu_core+0x1d1/0x3a0
? srso_alias_return_thunk+0x5/0xfbef5
? rcu_core_si+0xe/0x20
? srso_alias_return_thunk+0x5/0xfbef5
? handle_softirqs+0xe7/0x330
? srso_alias_return_thunk+0x5/0xfbef5
? srso_alias_return_thunk+0x5/0xfbef5
? irqentry_exit_to_user_mode+0x43/0x250
? srso_alias_return_thunk+0x5/0xfbef5
? irqentry_exit+0x43/0x50
? srso_alias_return_thunk+0x5/0xfbef5
? sysvec_apic_timer_interrupt+0x4f/0xc0
entry_SYSCALL_64_after_hwframe+0x76/0x7e
RIP: 0033:0x70acabb24ded
Code: [...]
RSP: 002b:00007fffb3dd4a40 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
RAX: ffffffffffffffda RBX: 00007fffb3dd6f28 RCX: 000070acabb24ded
RDX: 0000000066f26a61 RSI: 0000000000002401 RDI: 0000000000000013
RBP: 00007fffb3dd4a90 R08: 000070acabc03084 R09: 000070acabc030a0
R10: 0000000000000008 R11: 0000000000000246 R12: 0000000000000001
R13: 0000000000000000 R14: 00005e38a9ec5b60 R15: 000070acabd00000
</TASK>
---[ end trace 0000000000000000 ]---
watchdog: BUG: soft lockup - CPU#88 stuck for 26s! [perf_fuzzer:5740]
Modules linked in: [...]
CPU: 88 UID: 1002 PID: 5740 Comm: perf_fuzzer Kdump: loaded Tainted: G W 6.14.0-rc1-pmu-unregister+ #244
Tainted: [W]=WARN
Hardware name: AMD Corporation RUBY/RUBY, BIOS RRR1009C 07/21/2023
RIP: 0010:smp_call_function_single+0x11f/0x160
Code: [...]
RSP: 0000:ffa0000004b57ba0 EFLAGS: 00000202
RAX: 0000000000000000 RBX: ff11000153ad8000 RCX: 0000000000000000
RDX: 0000000000000011 RSI: 0000000000000000 RDI: 0000000000000000
RBP: ffa0000004b57be8 R08: 0000000000000000 R09: ffa0000004b57c20
R10: 0000000000000001 R11: ffffffff81621ef0 R12: ff11000104cc9e00
R13: ffa0000004b57c08 R14: ff1100013732af40 R15: ff1100100b830ee0
FS: 0000000000000000(0000) GS:ff1100100b800000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00005e38a9ea8930 CR3: 0000000003440004 CR4: 0000000000f71ef0
PKRU: 55555554
Call Trace:
<IRQ>
? show_regs+0x6c/0x80
? watchdog_timer_fn+0x22b/0x2d0
? __pfx_watchdog_timer_fn+0x10/0x10
? __hrtimer_run_queues+0x112/0x2a0
? clockevents_program_event+0xc1/0x150
? hrtimer_interrupt+0xfd/0x260
? __sysvec_apic_timer_interrupt+0x56/0x130
? sysvec_apic_timer_interrupt+0x93/0xc0
</IRQ>
<TASK>
? asm_sysvec_apic_timer_interrupt+0x1b/0x20
? __pfx_remote_function+0x10/0x10
? smp_call_function_single+0x11f/0x160
? __pfx_remote_function+0x10/0x10
? event_function_call+0x115/0x1d0
? srso_alias_return_thunk+0x5/0xfbef5
event_function_call+0x98/0x1d0
? __pfx___perf_remove_from_context+0x10/0x10
? __pfx___perf_remove_from_context+0x10/0x10
? __pfx_event_function+0x10/0x10
perf_remove_from_context+0x46/0xa0
perf_event_release_kernel+0x1f3/0x210
perf_release+0x12/0x20
__fput+0xed/0x2d0
____fput+0x15/0x20
task_work_run+0x60/0xa0
do_exit+0x317/0xb00
? srso_alias_return_thunk+0x5/0xfbef5
do_group_exit+0x34/0x90
get_signal+0x934/0x940
arch_do_signal_or_restart+0x2f/0x250
? srso_alias_return_thunk+0x5/0xfbef5
? event_function+0xa4/0xf0
? __pfx_remote_function+0x10/0x10
? srso_alias_return_thunk+0x5/0xfbef5
? remote_function+0x4f/0x70
? srso_alias_return_thunk+0x5/0xfbef5
irqentry_exit_to_user_mode+0x1e0/0x250
irqentry_exit+0x43/0x50
sysvec_reschedule_ipi+0x5d/0x100
asm_sysvec_reschedule_ipi+0x1b/0x20
RIP: 0033:0x5e38a9eb87dd
Code: Unable to access opcode bytes at 0x5e38a9eb87b3.
RSP: 002b:00007fffb3dd4ae8 EFLAGS: 00000202
RAX: 0000000000001553 RBX: 00007fffb3dd6f28 RCX: 00000000000020da
RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000001
RBP: 00007fffb3dd4b00 R08: 0000000000000001 R09: 000070acabcae740
R10: 000070acabd010b8 R11: 0000000000000246 R12: 0000000000000001
R13: 0000000000000000 R14: 00005e38a9ec5b60 R15: 000070acabd00000
</TASK>
Something like below instead? I haven't tested it thoroughly though.
---
diff --git a/kernel/events/core.c b/kernel/events/core.c
index d2b87a425e75..4e131b1c37ad 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -13645,20 +13645,25 @@ perf_event_exit_event(struct perf_event *event,
unsigned long detach_flags = DETACH_EXIT;
if (parent_event) {
- /*
- * Do not destroy the 'original' grouping; because of the
- * context switch optimization the original events could've
- * ended up in a random child task.
- *
- * If we were to destroy the original group, all group related
- * operations would cease to function properly after this
- * random child dies.
- *
- * Do destroy all inherited groups, we don't care about those
- * and being thorough is better.
- */
- detach_flags |= DETACH_GROUP | DETACH_CHILD;
mutex_lock(&parent_event->child_mutex);
+ if (event->attach_state & PERF_ATTACH_CHILD) {
+ /*
+ * Do not destroy the 'original' grouping; because of the
+ * context switch optimization the original events could've
+ * ended up in a random child task.
+ *
+ * If we were to destroy the original group, all group related
+ * operations would cease to function properly after this
+ * random child dies.
+ *
+ * Do destroy all inherited groups, we don't care about those
+ * and being thorough is better.
+ */
+ detach_flags |= DETACH_GROUP | DETACH_CHILD;
+ } else {
+ mutex_unlock(&parent_event->child_mutex);
+ parent_event = NULL;
+ }
}
if (revoke)
---
Thanks,
Ravi
On Thu, Feb 13, 2025 at 01:22:55PM +0530, Ravi Bangoria wrote:
> Apparently not, it ends up with:
>
> ------------[ cut here ]------------
> WARNING: CPU: 145 PID: 5459 at kernel/events/core.c:281 event_function+0xd2/0xf0
> WARNING: CPU: 145 PID: 5459 at kernel/events/core.c:286 event_function+0xd6/0xf0
> remote_function+0x4f/0x70
> generic_exec_single+0x7f/0x160
> smp_call_function_single+0x110/0x160
> event_function_call+0x98/0x1d0
> _perf_event_disable+0x41/0x70
> perf_event_for_each_child+0x40/0x90
> _perf_ioctl+0xac/0xb00
> perf_ioctl+0x45/0x80
Took me a long while trying to blame this on the 'event->parent =
NULL;', but AFAICT this is a new, unrelated issue.
What I think happens is this perf_ioctl(DISABLE) vs pmu_detach_events()
race, where the crux is that perf_ioctl() path does not take
event2->mutex which allows the following interleave:
              event1 <---> ctx1
                 |  ^
      child_list |  | parent
                 v  |
              event2 <---> ctx2
perf_ioctl()
  perf_event_ctx_lock(event1)
    get_ctx(ctx1)
    mutex_lock(ctx1->mutex)
  _perf_ioctl()
    perf_event_for_each_child()
      mutex_lock(event1->child_mutex)
      _perf_event_disable(event1)
      _perf_event_disable(event2)
        raw_spin_lock_irq(ctx2->lock)
        raw_spin_unlock_irq()
        event_function_call(event2, __perf_event_disable)
          task_function_call()
            <IPI __perf_event_disable>

                                  pmu_detach_events()
                                    event2 = pmu_get_event() <-- inc(event2->refcount);
                                    pmu_detach_event(event2)
                                      perf_event_ctx_lock(event2)
                                        get_ctx(ctx2)
                                        mutex_lock(ctx2->lock)
                                      __pmu_detach_event()
                                        perf_event_exit_event()
                                          mutex_lock(event1->child_mutex)
                                          perf_remove_from_context(event2, EXIT|GROUP|CHILD|REVOKE)
                                            lockdep_assert_held(ctx2->mutex)
                                            raw_spin_lock_irq(ctx2->lock)
                                            raw_spin_unlock_irq(ctx2->lock)
                                            event_function_call(event2, __perf_remove_from_context)
                                              task_function_call(event_function)
                                                <IPI __perf_remove_from_context>
                                                  remote_function()
                                                    event_function()
                                                      perf_ctx_lock(cpuctx, ctx2)
                                                      raw_spin_lock(ctx2->lock)
                                                      __perf_remove_from_context(event2)
                                                        event_sched_out()
                                                        perf_group_detach()
                                                        perf_child_detach()
                                                        list_del_event()
                                                        event->state = REVOKED;
                                                        cpc->task_epc = NULL; // event2 is last
                                                        ctx->is_active = 0;       <--.
                                                        cpuctx->task_ctx = NULL;     |
                                                                                     |
                                                                                     |
            <IPI __perf_event_disable>                                               |
              remote_function()                                                      |
                event_function()                                                     |
                  perf_ctx_lock(cpuctx, ctx2)                                        |
                  raw_spin_lock(ctx2->lock)                                          |
                                                                                     |
                  WARN_ON_ONCE(!ctx2->is_active)  -----------------------------------'
                  WARN_ON_ONCE(cpuctx->task_ctx != ctx2)
Still trying to work out how best to avoid this.
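
For reference, the two warnings in the splat are the sanity checks in
event_function() (kernel/events/core.c:281 and :286 in that build). Roughly,
paraphrasing upstream and trimming the uninteresting parts:

static int event_function(void *info)
{
	struct event_function_struct *efs = info;
	struct perf_event *event = efs->event;
	struct perf_event_context *ctx = event->ctx;
	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
	struct perf_event_context *task_ctx = cpuctx->task_ctx;

	...
	if (ctx->task) {
		if (ctx->task != current)
			goto unlock;		/* -ESRCH */

		/* the remote ctx is expected to still be active ... */
		WARN_ON_ONCE(!ctx->is_active);		/* core.c:281 */
		/* ... and cpuctx->task_ctx is expected to match it */
		WARN_ON_ONCE(task_ctx != ctx);		/* core.c:286 */
	}
	...
}

In the interleave above, __perf_remove_from_context() clears ctx->is_active
and cpuctx->task_ctx just before the already-queued __perf_event_disable IPI
finally runs, so both checks fire.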
Hi Peter,

>> Apparently not, it ends up with:
>>
>> ------------[ cut here ]------------
>> WARNING: CPU: 145 PID: 5459 at kernel/events/core.c:281 event_function+0xd2/0xf0
>> WARNING: CPU: 145 PID: 5459 at kernel/events/core.c:286 event_function+0xd6/0xf0
>
>> remote_function+0x4f/0x70
>> generic_exec_single+0x7f/0x160
>> smp_call_function_single+0x110/0x160
>> event_function_call+0x98/0x1d0
>> _perf_event_disable+0x41/0x70
>> perf_event_for_each_child+0x40/0x90
>> _perf_ioctl+0xac/0xb00
>> perf_ioctl+0x45/0x80
>
> Took me a long while trying to blame this on the 'event->parent =
> NULL;', but AFAICT this is a new, unrelated issue.
>
> What I think happens is this perf_ioctl(DISABLE) vs pmu_detach_events()
> race, where the crux is that perf_ioctl() path does not take
> event2->mutex which allows the following interleave:

This one was only with perf_fuzzer, so pmu_detach_events() code path was
not invoked.

Thanks,
Ravi
On 17-Feb-25 1:54 PM, Ravi Bangoria wrote:
> Hi Peter,
>
>>> Apparently not, it ends up with:
>>>
>>> ------------[ cut here ]------------
>>> WARNING: CPU: 145 PID: 5459 at kernel/events/core.c:281 event_function+0xd2/0xf0
>>> WARNING: CPU: 145 PID: 5459 at kernel/events/core.c:286 event_function+0xd6/0xf0
>>
>>> remote_function+0x4f/0x70
>>> generic_exec_single+0x7f/0x160
>>> smp_call_function_single+0x110/0x160
>>> event_function_call+0x98/0x1d0
>>> _perf_event_disable+0x41/0x70
>>> perf_event_for_each_child+0x40/0x90
>>> _perf_ioctl+0xac/0xb00
>>> perf_ioctl+0x45/0x80
>>
>> Took me a long while trying to blame this on the 'event->parent =
>> NULL;', but AFAICT this is a new, unrelated issue.
>>
>> What I think happens is this perf_ioctl(DISABLE) vs pmu_detach_events()
>> race, where the crux is that perf_ioctl() path does not take
>> event2->mutex which allows the following interleave:
>
> This one was only with perf_fuzzer, so pmu_detach_events() code path was
> not invoked.

I think the issue is, unaccount_event() gets called for the child event
after the child is detached. Since event->parent is NULL, unaccount_event()
abruptly corrupts the perf_sched_work. I haven't verified it. Will do it
tomorrow.

Thanks,
Ravi
Hi Peter,
>>>> Apparently not, it ends up with:
>>>>
>>>> ------------[ cut here ]------------
>>>> WARNING: CPU: 145 PID: 5459 at kernel/events/core.c:281 event_function+0xd2/0xf0
>>>> WARNING: CPU: 145 PID: 5459 at kernel/events/core.c:286 event_function+0xd6/0xf0
>>>
>>>> remote_function+0x4f/0x70
>>>> generic_exec_single+0x7f/0x160
>>>> smp_call_function_single+0x110/0x160
>>>> event_function_call+0x98/0x1d0
>>>> _perf_event_disable+0x41/0x70
>>>> perf_event_for_each_child+0x40/0x90
>>>> _perf_ioctl+0xac/0xb00
>>>> perf_ioctl+0x45/0x80
>>>
>>> Took me a long while trying to blame this on the 'event->parent =
>>> NULL;', but AFAICT this is a new, unrelated issue.
>>>
>>> What I think happens is this perf_ioctl(DISABLE) vs pmu_detach_events()
>>> race, where the crux is that perf_ioctl() path does not take
>>> event2->mutex which allows the following interleave:
>>
>> This one was only with perf_fuzzer, so pmu_detach_events() code path was
>> not invoked.
>
> I think the issue is, unaccount_event() gets called for the child event
> after the child is detached. Since event->parent is NULL, unaccount_event()
> abruptly corrupts the perf_sched_work.
A different manifestation of the same underlying issue (perf_sched_work
is getting corrupted because of event->parent = NULL):
WARNING: CPU: 0 PID: 5799 at kernel/events/core.c:286 event_function+0xd6/0xf0
CPU: 0 UID: 1002 PID: 5799 Comm: perf_fuzzer Kdump: loaded Not tainted 6.14.0-rc1-pmu-unregister+ #263
RIP: 0010:event_function+0xd6/0xf0
RSP: 0018:ffa0000006a87828 EFLAGS: 00010086
RAX: 0000000000000007 RBX: ff11001008830ee0 RCX: 0000000000000000
RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000
RBP: ffa0000006a87850 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000000 R12: ffa0000006a87918
R13: 0000000000000000 R14: ff1100010e347c00 R15: ff110001226c9a40
FS: 0000000000000000(0000) GS:ff11001008800000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000595fed7d3e12 CR3: 000000016954c005 CR4: 0000000000f71ef0
PKRU: 55555554
Call Trace:
<TASK>
? event_function+0xd6/0xf0
? event_function+0x3d/0xf0
remote_function+0x4c/0x70
generic_exec_single+0x7c/0x160
smp_call_function_single+0x110/0x160
event_function_call+0x98/0x1d0
perf_remove_from_context+0x46/0xa0
perf_event_release_kernel+0x1f3/0x210
perf_release+0x12/0x20
With the above trace as reference, something like below is what is
happening. (Code paths are based on my understanding of traces, I've
not verified each and every bit :P)
Assume:
task1: event1
task2 (child of task1): event2 (child of event1)
task3: event3
CPU 1                                      CPU 2                                              CPU 3                                            CPU 4

                                           /* task3 sched in */
                                           perf_event_task_sched_in()
                                             if (static_branch_unlikely(&perf_sched_events)) /* true */
                                               __perf_event_task_sched_in()
                                               ...
task2:
perf_event_exit_event() /* event2 */
  event->parent = NULL /* by perf_child_detach() */
  free_event()
    _free_event()
      unaccount_event()
        if (event->parent) /* false */
          return; /* won't get executed */
        perf_sched_work <== OFF ------.
                                      |
                                      |    /* task3 sched out */
                                      |    perf_event_task_sched_out()
                                      '-->   if (static_branch_unlikely(&perf_sched_events)) /* false */
                                      |      /* __perf_event_task_sched_out() won't get called */
                                      |
                                      |                                                       /* task3 sched in */
                                      |                                                       perf_event_task_sched_in()
                                      '----------------------------------------------------->  if (static_branch_unlikely(&perf_sched_events)) /* false */
                                                                                                /* __perf_event_task_sched_in() won't get called
                                                                                                   So cpuctx->task_ctx will remain NULL; */
                                                                                                                                                close(event3)
                                                                                                                                                  perf_release()
                                                                                                                                                    perf_event_release_kernel()
                                                                                                                                                      perf_remove_from_context()
                                                                                                                                                        event_function_call()
                                                                                                                                                        /* IPI to CPU 3 ---> */
                                                                                              /* ---> IPI from CPU 4 */
                                                                                              event_function()
                                                                                                if (ctx->task) {
                                                                                                  /* task_ctx is NULL whereas ctx is !NULL */
                                                                                                  WARN_ON_ONCE(task_ctx != ctx);
                                                                                                               ^^^^^^^^^^^^
IIUC, event->parent is the only way to detect whether the event is a
child or not, so it makes sense to preserve the ->parent pointer even
after the child is detached.
Thanks,
Ravi
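
For completeness, the reason clearing event->parent has that effect: child
events are never accounted, so unaccount_event() relies on ->parent to skip
them; for a parent it may arm the delayed work that flips the
perf_sched_events static key off. Roughly, paraphrasing upstream and keeping
only the relevant bits:

static void unaccount_event(struct perf_event *event)
{
	bool dec = false;

	/* children never contributed to the static keys */
	if (event->parent)
		return;

	if (event->attach_state & PERF_ATTACH_TASK)
		dec = true;
	/* ... the attr->mmap/comm/... accounting is elided here ... */

	if (dec) {
		if (!atomic_add_unless(&perf_sched_count, -1, 1))
			schedule_delayed_work(&perf_sched_work, HZ);
	}
	...
}

static void perf_sched_delayed(struct work_struct *work)
{
	mutex_lock(&perf_sched_mutex);
	if (atomic_dec_and_test(&perf_sched_count))
		static_branch_disable(&perf_sched_events);
	mutex_unlock(&perf_sched_mutex);
}

With event->parent forcibly cleared by perf_child_detach(), the child's
_free_event() no longer takes the early return, perf_sched_count is dropped
one time too many, and perf_sched_events can go off while the parent still
needs it -- which is the missed sched_in/sched_out in the diagram above.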
>>>>> Apparently not, it ends up with:
>>>>>
>>>>> ------------[ cut here ]------------
>>>>> WARNING: CPU: 145 PID: 5459 at kernel/events/core.c:281 event_function+0xd2/0xf0
>>>>> WARNING: CPU: 145 PID: 5459 at kernel/events/core.c:286 event_function+0xd6/0xf0
>>>>
>>>>> remote_function+0x4f/0x70
>>>>> generic_exec_single+0x7f/0x160
>>>>> smp_call_function_single+0x110/0x160
>>>>> event_function_call+0x98/0x1d0
>>>>> _perf_event_disable+0x41/0x70
>>>>> perf_event_for_each_child+0x40/0x90
>>>>> _perf_ioctl+0xac/0xb00
>>>>> perf_ioctl+0x45/0x80
>>>>
>>>> Took me a long while trying to blame this on the 'event->parent =
>>>> NULL;', but AFAICT this is a new, unrelated issue.
>>>>
>>>> What I think happens is this perf_ioctl(DISABLE) vs pmu_detach_events()
>>>> race, where the crux is that perf_ioctl() path does not take
>>>> event2->mutex which allows the following interleave:
>>>
>>> This one was only with perf_fuzzer, so pmu_detach_events() code path was
>>> not invoked.
>>
>> I think the issue is, unaccount_event() gets called for the child event
>> after the child is detached. Since event->parent is NULL, unaccount_event()
>> abruptly corrupts the perf_sched_work.
>
> A different manifestation of the same underlying issue (perf_sched_work
> is getting corrupted because of event->parent = NULL):

Duh! I meant perf_sched_events gets corrupted by invoking perf_sched_work.

Thanks,
Ravi
On Thu, Feb 13, 2025 at 01:22:55PM +0530, Ravi Bangoria wrote:
> > Does this work?
> >
> > --- a/kernel/events/core.c
> > +++ b/kernel/events/core.c
> > @@ -2303,6 +2303,7 @@ static void perf_child_detach(struct per
> >
> > sync_child_event(event);
> > list_del_init(&event->child_list);
> > + event->parent = NULL;
> > }
> >
> > static bool is_orphaned_event(struct perf_event *event)
>
> Apparently not, it ends up with:
>
> ------------[ cut here ]------------
> WARNING: CPU: 145 PID: 5459 at kernel/events/core.c:281 event_function+0xd2/0xf0
Durr, do you have an updated test case?
> Something like below instead? I haven't tested it thoroughly though.
>
> ---
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index d2b87a425e75..4e131b1c37ad 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -13645,20 +13645,25 @@ perf_event_exit_event(struct perf_event *event,
> unsigned long detach_flags = DETACH_EXIT;
>
> if (parent_event) {
> - /*
> - * Do not destroy the 'original' grouping; because of the
> - * context switch optimization the original events could've
> - * ended up in a random child task.
> - *
> - * If we were to destroy the original group, all group related
> - * operations would cease to function properly after this
> - * random child dies.
> - *
> - * Do destroy all inherited groups, we don't care about those
> - * and being thorough is better.
> - */
> - detach_flags |= DETACH_GROUP | DETACH_CHILD;
> mutex_lock(&parent_event->child_mutex);
> + if (event->attach_state & PERF_ATTACH_CHILD) {
> + /*
> + * Do not destroy the 'original' grouping; because of the
> + * context switch optimization the original events could've
> + * ended up in a random child task.
> + *
> + * If we were to destroy the original group, all group related
> + * operations would cease to function properly after this
> + * random child dies.
> + *
> + * Do destroy all inherited groups, we don't care about those
> + * and being thorough is better.
> + */
> + detach_flags |= DETACH_GROUP | DETACH_CHILD;
> + } else {
> + mutex_unlock(&parent_event->child_mutex);
> + parent_event = NULL;
> + }
> }
Yeah, that might do, but not really nice. But perhaps it's the best we
can do. I'll give it some thought.
On 13-Feb-25 6:38 PM, Peter Zijlstra wrote:
> On Thu, Feb 13, 2025 at 01:22:55PM +0530, Ravi Bangoria wrote:
>>> Does this work?
>>>
>>> --- a/kernel/events/core.c
>>> +++ b/kernel/events/core.c
>>> @@ -2303,6 +2303,7 @@ static void perf_child_detach(struct per
>>>
>>> sync_child_event(event);
>>> list_del_init(&event->child_list);
>>> + event->parent = NULL;
>>> }
>>>
>>> static bool is_orphaned_event(struct perf_event *event)
>>
>> Apparently not, it ends up with:
>>
>> ------------[ cut here ]------------
>> WARNING: CPU: 145 PID: 5459 at kernel/events/core.c:281 event_function+0xd2/0xf0
>
> Durr, do you have an updated test case?

This one triggered with just perf_fuzzer run (without my tinypmu test).
But in general, to test the whole series, I ran perf_fuzzer simultaneously
with my tinypmu tests:

term1~$ echo -1 | sudo tee /proc/sys/kernel/perf_event_paranoid; while true; do ./perf_fuzzer; done
term2~$ cd ~/tinypmu; make clean; make; while true; do sudo bash tinypmu-u.sh; done

Thanks,
Ravi
Hi Peter,
> @@ -1737,7 +1742,7 @@ static inline bool needs_branch_stack(st
>
> static inline bool has_aux(struct perf_event *event)
> {
> - return event->pmu->setup_aux;
> + return event->pmu && event->pmu->setup_aux;
> }
this ...
> @@ -6412,7 +6444,7 @@ static void perf_mmap_open(struct vm_are
> if (vma->vm_pgoff)
> atomic_inc(&event->rb->aux_mmap_count);
>
> - if (event->pmu->event_mapped)
> + if (event->pmu && event->pmu->event_mapped)
> event->pmu->event_mapped(event, vma->vm_mm);
> }
>
> @@ -6435,7 +6467,8 @@ static void perf_mmap_close(struct vm_ar
> unsigned long size = perf_data_size(rb);
> bool detach_rest = false;
>
> - if (event->pmu->event_unmapped)
> + /* FIXIES vs perf_pmu_unregister() */
> + if (event->pmu && event->pmu->event_unmapped)
> event->pmu->event_unmapped(event, vma->vm_mm);
these two ...
> @@ -6837,7 +6880,7 @@ static int perf_mmap(struct file *file,
> if (!ret)
> ret = map_range(rb, vma);
>
> - if (!ret && event->pmu->event_mapped)
> + if (!ret && event->pmu && event->pmu->event_mapped)
> event->pmu->event_mapped(event, vma->vm_mm);
... and this one.
There is no serialization at any of these places, i.e. event->pmu can become
NULL in between the two checks.
CPU0: event->pmu = NULL
|
V
CPU1: if (event->pmu && event->pmu->blahblah())
Thanks,
Ravi
On Mon, Feb 10, 2025 at 12:09:18PM +0530, Ravi Bangoria wrote:
> Hi Peter,
>
> > @@ -1737,7 +1742,7 @@ static inline bool needs_branch_stack(st
> >
> > static inline bool has_aux(struct perf_event *event)
> > {
> > - return event->pmu->setup_aux;
> > + return event->pmu && event->pmu->setup_aux;
> > }
>
> this ...
Bah, I'll try and trace that.
> > @@ -6412,7 +6444,7 @@ static void perf_mmap_open(struct vm_are
> > if (vma->vm_pgoff)
> > atomic_inc(&event->rb->aux_mmap_count);
> >
> > - if (event->pmu->event_mapped)
> > + if (event->pmu && event->pmu->event_mapped)
> > event->pmu->event_mapped(event, vma->vm_mm);
> > }
> >
> > @@ -6435,7 +6467,8 @@ static void perf_mmap_close(struct vm_ar
> > unsigned long size = perf_data_size(rb);
> > bool detach_rest = false;
> >
> > - if (event->pmu->event_unmapped)
> > + /* FIXIES vs perf_pmu_unregister() */
> > + if (event->pmu && event->pmu->event_unmapped)
> > event->pmu->event_unmapped(event, vma->vm_mm);
>
> these two ...
>
> > @@ -6837,7 +6880,7 @@ static int perf_mmap(struct file *file,
> > if (!ret)
> > ret = map_range(rb, vma);
> >
> > - if (!ret && event->pmu->event_mapped)
> > + if (!ret && event->pmu && event->pmu->event_mapped)
> > event->pmu->event_mapped(event, vma->vm_mm);
>
> ... and this one.
These are relatively simple to fix, something like so. This relies on
the fact that if there are mapped functions the whole unregister thing
won't happen.
---
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6432,9 +6432,22 @@ void ring_buffer_put(struct perf_buffer
call_rcu(&rb->rcu_head, rb_free_rcu);
}
+typedef void (*mapped_f)(struct perf_event *event, struct mm_struct *mm);
+
+#define get_mapped(event, func) \
+({ struct pmu *pmu; \
+ mapped_f f = NULL; \
+ guard(rcu)(); \
+ pmu = READ_ONCE(event->pmu); \
+ if (pmu) \
+ f = pmu->func; \
+ f; \
+})
+
static void perf_mmap_open(struct vm_area_struct *vma)
{
struct perf_event *event = vma->vm_file->private_data;
+ mapped_f mapped = get_mapped(event, event_mapped);
atomic_inc(&event->mmap_count);
atomic_inc(&event->rb->mmap_count);
@@ -6442,8 +6455,8 @@ static void perf_mmap_open(struct vm_are
if (vma->vm_pgoff)
atomic_inc(&event->rb->aux_mmap_count);
- if (event->pmu && event->pmu->event_mapped)
- event->pmu->event_mapped(event, vma->vm_mm);
+ if (mapped)
+ mapped(event, vma->vm_mm);
}
static void perf_pmu_output_stop(struct perf_event *event);
@@ -6459,6 +6472,7 @@ static void perf_pmu_output_stop(struct
static void perf_mmap_close(struct vm_area_struct *vma)
{
struct perf_event *event = vma->vm_file->private_data;
+ mapped_f unmapped = get_mapped(event, event_unmapped);
struct perf_buffer *rb = ring_buffer_get(event);
struct user_struct *mmap_user = rb->mmap_user;
int mmap_locked = rb->mmap_locked;
@@ -6466,8 +6480,8 @@ static void perf_mmap_close(struct vm_ar
bool detach_rest = false;
/* FIXIES vs perf_pmu_unregister() */
- if (event->pmu && event->pmu->event_unmapped)
- event->pmu->event_unmapped(event, vma->vm_mm);
+ if (unmapped)
+ unmapped(event, vma->vm_mm);
/*
* The AUX buffer is strictly a sub-buffer, serialize using aux_mutex
@@ -6660,6 +6674,7 @@ static int perf_mmap(struct file *file,
unsigned long nr_pages;
long user_extra = 0, extra = 0;
int ret, flags = 0;
+ mapped_f mapped;
/*
* Don't allow mmap() of inherited per-task counters. This would
@@ -6878,8 +6893,9 @@ static int perf_mmap(struct file *file,
if (!ret)
ret = map_range(rb, vma);
- if (!ret && event->pmu && event->pmu->event_mapped)
- event->pmu->event_mapped(event, vma->vm_mm);
+ mapped = get_mapped(event, event_mapped);
+ if (mapped)
+ mapped(event, vma->vm_mm);
return ret;
}