Previously it was only safe to call perf_pmu_unregister() if there
were no active events of that pmu around -- which was impossible to
guarantee since it races all sorts against perf_init_event().
Rework the whole thing by:
- keeping track of all events for a given pmu
- 'hiding' the pmu from perf_init_event()
- waiting for the appropriate (s)rcu grace periods such that all
prior references to the PMU will be completed
- detaching all still existing events of that pmu (see first point)
and moving them to a new REVOKED state.
- actually freeing the pmu data.
Where notably the new REVOKED state must inhibit all event actions
from reaching code that wants to use event->pmu.
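For illustration, the user-facing entry points grow guards of this shape
(this particular one is from the _perf_ioctl() hunk below):

	if (event->state <= PERF_EVENT_STATE_REVOKED)
		return -ENODEV;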
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
include/linux/perf_event.h | 13 +-
kernel/events/core.c | 222 ++++++++++++++++++++++++++++++++++++++++-----
2 files changed, 210 insertions(+), 25 deletions(-)
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -318,6 +318,9 @@ struct perf_output_handle;
struct pmu {
struct list_head entry;
+ spinlock_t events_lock;
+ struct list_head events;
+
struct module *module;
struct device *dev;
struct device *parent;
@@ -611,9 +614,10 @@ struct perf_addr_filter_range {
* enum perf_event_state - the states of an event:
*/
enum perf_event_state {
- PERF_EVENT_STATE_DEAD = -4,
- PERF_EVENT_STATE_EXIT = -3,
- PERF_EVENT_STATE_ERROR = -2,
+ PERF_EVENT_STATE_DEAD = -5,
+ PERF_EVENT_STATE_REVOKED = -4, /* pmu gone, must not touch */
+ PERF_EVENT_STATE_EXIT = -3, /* task died, still inherit */
+ PERF_EVENT_STATE_ERROR = -2, /* scheduling error, can enable */
PERF_EVENT_STATE_OFF = -1,
PERF_EVENT_STATE_INACTIVE = 0,
PERF_EVENT_STATE_ACTIVE = 1,
@@ -854,6 +858,7 @@ struct perf_event {
void *security;
#endif
struct list_head sb_list;
+ struct list_head pmu_list;
/*
* Certain events gets forwarded to another pmu internally by over-
@@ -1105,7 +1110,7 @@ extern void perf_aux_output_flag(struct
extern void perf_event_itrace_started(struct perf_event *event);
extern int perf_pmu_register(struct pmu *pmu, const char *name, int type);
-extern void perf_pmu_unregister(struct pmu *pmu);
+extern int perf_pmu_unregister(struct pmu *pmu);
extern void __perf_event_task_sched_in(struct task_struct *prev,
struct task_struct *task);
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2412,7 +2412,9 @@ ctx_time_update_event(struct perf_event_
#define DETACH_GROUP 0x01UL
#define DETACH_CHILD 0x02UL
-#define DETACH_DEAD 0x04UL
+#define DETACH_EXIT 0x04UL
+#define DETACH_REVOKE 0x08UL
+#define DETACH_DEAD 0x10UL
/*
* Cross CPU call to remove a performance event
@@ -2427,6 +2429,7 @@ __perf_remove_from_context(struct perf_e
void *info)
{
struct perf_event_pmu_context *pmu_ctx = event->pmu_ctx;
+ enum perf_event_state state = PERF_EVENT_STATE_OFF;
unsigned long flags = (unsigned long)info;
ctx_time_update(cpuctx, ctx);
@@ -2435,16 +2438,22 @@ __perf_remove_from_context(struct perf_e
* Ensure event_sched_out() switches to OFF, at the very least
* this avoids raising perf_pending_task() at this time.
*/
- if (flags & DETACH_DEAD)
+ if (flags & DETACH_EXIT)
+ state = PERF_EVENT_STATE_EXIT;
+ if (flags & DETACH_REVOKE)
+ state = PERF_EVENT_STATE_REVOKED;
+ if (flags & DETACH_DEAD) {
event->pending_disable = 1;
+ state = PERF_EVENT_STATE_DEAD;
+ }
event_sched_out(event, ctx);
if (flags & DETACH_GROUP)
perf_group_detach(event);
if (flags & DETACH_CHILD)
perf_child_detach(event);
list_del_event(event, ctx);
- if (flags & DETACH_DEAD)
- event->state = PERF_EVENT_STATE_DEAD;
+
+ event->state = state;
if (!pmu_ctx->nr_events) {
pmu_ctx->rotate_necessary = 0;
@@ -4511,7 +4520,8 @@ static void perf_event_enable_on_exec(st
static void perf_remove_from_owner(struct perf_event *event);
static void perf_event_exit_event(struct perf_event *event,
- struct perf_event_context *ctx);
+ struct perf_event_context *ctx,
+ bool revoke);
/*
* Removes all events from the current task that have been marked
@@ -4538,7 +4548,7 @@ static void perf_event_remove_on_exec(st
modified = true;
- perf_event_exit_event(event, ctx);
+ perf_event_exit_event(event, ctx, false);
}
raw_spin_lock_irqsave(&ctx->lock, flags);
@@ -5138,6 +5148,7 @@ static bool is_sb_event(struct perf_even
attr->context_switch || attr->text_poke ||
attr->bpf_event)
return true;
+
return false;
}
@@ -5339,6 +5350,8 @@ static void perf_pending_task_sync(struc
/* vs perf_event_alloc() error */
static void __free_event(struct perf_event *event)
{
+ struct pmu *pmu = event->pmu;
+
if (event->attach_state & PERF_ATTACH_CALLCHAIN)
put_callchain_buffers();
@@ -5365,6 +5378,7 @@ static void __free_event(struct perf_eve
* put_pmu_ctx() needs an event->ctx reference, because of
* epc->ctx.
*/
+ WARN_ON_ONCE(!pmu);
WARN_ON_ONCE(!event->ctx);
WARN_ON_ONCE(event->pmu_ctx->ctx != event->ctx);
put_pmu_ctx(event->pmu_ctx);
@@ -5377,8 +5391,13 @@ static void __free_event(struct perf_eve
if (event->ctx)
put_ctx(event->ctx);
- if (event->pmu)
- module_put(event->pmu->module);
+ if (pmu) {
+ module_put(pmu->module);
+ scoped_guard (spinlock, &pmu->events_lock) {
+ list_del(&event->pmu_list);
+ wake_up_var(pmu);
+ }
+ }
call_rcu(&event->rcu_head, free_event_rcu);
}
@@ -5397,6 +5416,7 @@ static void _free_event(struct perf_even
security_perf_event_free(event);
if (event->rb) {
+ WARN_ON_ONCE(!event->pmu);
/*
* Can happen when we close an event with re-directed output.
*
@@ -5527,7 +5547,11 @@ int perf_event_release_kernel(struct per
* Thus this guarantees that we will in fact observe and kill _ALL_
* child events.
*/
- perf_remove_from_context(event, DETACH_GROUP|DETACH_DEAD);
+ if (event->state > PERF_EVENT_STATE_REVOKED) {
+ perf_remove_from_context(event, DETACH_GROUP|DETACH_DEAD);
+ } else {
+ event->state = PERF_EVENT_STATE_DEAD;
+ }
perf_event_ctx_unlock(event, ctx);
@@ -5838,7 +5862,7 @@ __perf_read(struct perf_event *event, ch
* error state (i.e. because it was pinned but it couldn't be
* scheduled on to the CPU at some point).
*/
- if (event->state == PERF_EVENT_STATE_ERROR)
+ if (event->state <= PERF_EVENT_STATE_ERROR)
return 0;
if (count < event->read_size)
@@ -5877,8 +5901,14 @@ static __poll_t perf_poll(struct file *f
struct perf_buffer *rb;
__poll_t events = EPOLLHUP;
+ if (event->state <= PERF_EVENT_STATE_REVOKED)
+ return EPOLLERR;
+
poll_wait(file, &event->waitq, wait);
+ if (event->state <= PERF_EVENT_STATE_REVOKED)
+ return EPOLLERR;
+
if (is_event_hup(event))
return events;
@@ -6058,6 +6088,9 @@ static long _perf_ioctl(struct perf_even
void (*func)(struct perf_event *);
u32 flags = arg;
+ if (event->state <= PERF_EVENT_STATE_REVOKED)
+ return -ENODEV;
+
switch (cmd) {
case PERF_EVENT_IOC_ENABLE:
func = _perf_event_enable;
@@ -6507,6 +6540,7 @@ static void perf_mmap_close(struct vm_ar
unsigned long size = perf_data_size(rb);
bool detach_rest = false;
+ /* FIXIES vs perf_pmu_unregister() */
if (event->pmu->event_unmapped)
event->pmu->event_unmapped(event, vma->vm_mm);
@@ -6657,6 +6691,16 @@ static int perf_mmap(struct file *file,
mutex_lock(&event->mmap_mutex);
ret = -EINVAL;
+ /*
+ * This relies on __pmu_detach_event() taking mmap_mutex after marking
+ * the event REVOKED. Either we observe the state, or __pmu_detach_event()
+ * will detach the rb created here.
+ */
+ if (event->state <= PERF_EVENT_STATE_REVOKED) {
+ ret = -ENODEV;
+ goto unlock;
+ }
+
if (vma->vm_pgoff == 0) {
nr_pages -= 1;
@@ -6840,6 +6884,9 @@ static int perf_fasync(int fd, struct fi
struct perf_event *event = filp->private_data;
int retval;
+ if (event->state <= PERF_EVENT_STATE_REVOKED)
+ return -ENODEV;
+
inode_lock(inode);
retval = fasync_helper(fd, filp, on, &event->fasync);
inode_unlock(inode);
@@ -11892,6 +11939,9 @@ int perf_pmu_register(struct pmu *_pmu,
if (!pmu->event_idx)
pmu->event_idx = perf_event_idx_default;
+ INIT_LIST_HEAD(&pmu->events);
+ spin_lock_init(&pmu->events_lock);
+
/*
* Now that the PMU is complete, make it visible to perf_try_init_event().
*/
@@ -11905,11 +11955,100 @@ int perf_pmu_register(struct pmu *_pmu,
}
EXPORT_SYMBOL_GPL(perf_pmu_register);
-void perf_pmu_unregister(struct pmu *pmu)
+static void __pmu_detach_event(struct pmu *pmu, struct perf_event *event,
+ struct perf_event_context *ctx)
+{
+ /*
+ * De-schedule the event and mark it REVOKED.
+ */
+ perf_event_exit_event(event, ctx, true);
+
+ /*
+ * All _free_event() bits that rely on event->pmu:
+ *
+ * Notably, perf_mmap() relies on the ordering here.
+ */
+ scoped_guard (mutex, &event->mmap_mutex) {
+ WARN_ON_ONCE(pmu->event_unmapped);
+ ring_buffer_attach(event, NULL);
+ }
+
+ perf_event_free_bpf_prog(event);
+ perf_free_addr_filters(event);
+
+ if (event->destroy) {
+ event->destroy(event);
+ event->destroy = NULL;
+ }
+
+ if (event->pmu_ctx) {
+ put_pmu_ctx(event->pmu_ctx);
+ event->pmu_ctx = NULL;
+ }
+
+ exclusive_event_destroy(event);
+ module_put(pmu->module);
+
+ event->pmu = NULL; /* force fault instead of UAF */
+}
+
+static void pmu_detach_event(struct pmu *pmu, struct perf_event *event)
+{
+ struct perf_event_context *ctx;
+
+ ctx = perf_event_ctx_lock(event);
+ __pmu_detach_event(pmu, event, ctx);
+ perf_event_ctx_unlock(event, ctx);
+
+ scoped_guard (spinlock, &pmu->events_lock)
+ list_del(&event->pmu_list);
+}
+
+static struct perf_event *pmu_get_event(struct pmu *pmu)
+{
+ struct perf_event *event;
+
+ guard(spinlock)(&pmu->events_lock);
+ list_for_each_entry(event, &pmu->events, pmu_list) {
+ if (atomic_long_inc_not_zero(&event->refcount))
+ return event;
+ }
+
+ return NULL;
+}
+
+static bool pmu_empty(struct pmu *pmu)
+{
+ guard(spinlock)(&pmu->events_lock);
+ return list_empty(&pmu->events);
+}
+
+static void pmu_detach_events(struct pmu *pmu)
+{
+ struct perf_event *event;
+
+ for (;;) {
+ event = pmu_get_event(pmu);
+ if (!event)
+ break;
+
+ pmu_detach_event(pmu, event);
+ put_event(event);
+ }
+
+ /*
+ * wait for pending _free_event()s
+ */
+ wait_var_event(pmu, pmu_empty(pmu));
+}
+
+int perf_pmu_unregister(struct pmu *pmu)
{
scoped_guard (mutex, &pmus_lock) {
+ if (!idr_cmpxchg(&pmu_idr, pmu->type, pmu, NULL))
+ return -EINVAL;
+
list_del_rcu(&pmu->entry);
- idr_remove(&pmu_idr, pmu->type);
}
/*
@@ -11919,7 +12058,31 @@ void perf_pmu_unregister(struct pmu *pmu
synchronize_srcu(&pmus_srcu);
synchronize_rcu();
+ if (pmu->event_unmapped && !pmu_empty(pmu)) {
+ /*
+ * Can't force remove events when pmu::event_unmapped()
+ * is used in perf_mmap_close().
+ */
+ guard(mutex)(&pmus_lock);
+ idr_cmpxchg(&pmu_idr, pmu->type, NULL, pmu);
+ list_add_rcu(&pmu->entry, &pmus);
+ return -EBUSY;
+ }
+
+ scoped_guard (mutex, &pmus_lock)
+ idr_remove(&pmu_idr, pmu->type);
+
+ /*
+ * PMU is removed from the pmus list, so no new events will
+ * be created, now take care of the existing ones.
+ */
+ pmu_detach_events(pmu);
+
+ /*
+ * PMU is unused, make it go away.
+ */
perf_pmu_free(pmu);
+ return 0;
}
EXPORT_SYMBOL_GPL(perf_pmu_unregister);
@@ -12226,6 +12389,7 @@ perf_event_alloc(struct perf_event_attr
INIT_LIST_HEAD(&event->active_entry);
INIT_LIST_HEAD(&event->addr_filters.list);
INIT_HLIST_NODE(&event->hlist_entry);
+ INIT_LIST_HEAD(&event->pmu_list);
init_waitqueue_head(&event->waitq);
@@ -12294,6 +12458,13 @@ perf_event_alloc(struct perf_event_attr
perf_event__state_init(event);
+ /*
+ * Hold SRCU critical section around perf_init_event(), until returning
+ * the fully formed event put on pmu->events_list. This ensures that
+ * perf_pmu_unregister() will see any in-progress event creation that
+ * races.
+ */
+ guard(srcu)(&pmus_srcu);
pmu = NULL;
hwc = &event->hw;
@@ -12383,6 +12554,9 @@ perf_event_alloc(struct perf_event_attr
/* symmetric to unaccount_event() in _free_event() */
account_event(event);
+ scoped_guard (spinlock, &pmu->events_lock)
+ list_add(&event->pmu_list, &pmu->events);
+
return_ptr(event);
}
@@ -12769,6 +12943,10 @@ SYSCALL_DEFINE5(perf_event_open,
if (err)
goto err_fd;
group_leader = fd_file(group)->private_data;
+ if (group_leader->state <= PERF_EVENT_STATE_REVOKED) {
+ err = -ENODEV;
+ goto err_group_fd;
+ }
if (flags & PERF_FLAG_FD_OUTPUT)
output_event = group_leader;
if (flags & PERF_FLAG_FD_NO_GROUP)
@@ -13316,10 +13494,11 @@ static void sync_child_event(struct perf
}
static void
-perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx)
+perf_event_exit_event(struct perf_event *event,
+ struct perf_event_context *ctx, bool revoke)
{
struct perf_event *parent_event = event->parent;
- unsigned long detach_flags = 0;
+ unsigned long detach_flags = DETACH_EXIT;
if (parent_event) {
/*
@@ -13334,16 +13513,14 @@ perf_event_exit_event(struct perf_event
* Do destroy all inherited groups, we don't care about those
* and being thorough is better.
*/
- detach_flags = DETACH_GROUP | DETACH_CHILD;
+ detach_flags |= DETACH_GROUP | DETACH_CHILD;
mutex_lock(&parent_event->child_mutex);
}
- perf_remove_from_context(event, detach_flags);
+ if (revoke)
+ detach_flags |= DETACH_GROUP | DETACH_REVOKE;
- raw_spin_lock_irq(&ctx->lock);
- if (event->state > PERF_EVENT_STATE_EXIT)
- perf_event_set_state(event, PERF_EVENT_STATE_EXIT);
- raw_spin_unlock_irq(&ctx->lock);
+ perf_remove_from_context(event, detach_flags);
/*
* Child events can be freed.
@@ -13419,7 +13596,7 @@ static void perf_event_exit_task_context
perf_event_task(child, child_ctx, 0);
list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
- perf_event_exit_event(child_event, child_ctx);
+ perf_event_exit_event(child_event, child_ctx, false);
mutex_unlock(&child_ctx->mutex);
@@ -13609,6 +13786,9 @@ inherit_event(struct perf_event *parent_
if (parent_event->parent)
parent_event = parent_event->parent;
+ if (parent_event->state <= PERF_EVENT_STATE_REVOKED)
+ return NULL;
+
child_event = perf_event_alloc(&parent_event->attr,
parent_event->cpu,
child,
Hi Peter,

> @@ -12294,6 +12458,13 @@ perf_event_alloc
>
> 	perf_event__state_init(event);
>
> +	/*
> +	 * Hold SRCU critical section around perf_init_event(), until returning
> +	 * the fully formed event put on pmu->events_list. This ensures that
> +	 * perf_pmu_unregister() will see any in-progress event creation that
> +	 * races.
> +	 */
> +	guard(srcu)(&pmus_srcu);

Minor nit. This can go down a bit, just right before perf_init_event() ?

Thanks,
Ravi
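For reference, the placement being suggested would look roughly like this
(a sketch against the hunk above, not a tested change; intervening code
elided):

	perf_event__state_init(event);

	/* ... initialization that does not need the pmu ... */

	/*
	 * Enter the SRCU read section only here, right before the pmu
	 * lookup; being a guard, it still covers perf_init_event() and the
	 * later list_add() onto pmu->events.
	 */
	guard(srcu)(&pmus_srcu);
	pmu = perf_init_event(event);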
Hi Peter,
> @@ -6507,6 +6540,7 @@ static void perf_mmap_close(struct vm_ar
> unsigned long size = perf_data_size(rb);
> bool detach_rest = false;
>
> + /* FIXIES vs perf_pmu_unregister() */
> if (event->pmu->event_unmapped)
> event->pmu->event_unmapped(event, vma->vm_mm);
I assume you are already aware of the race between __pmu_detach_event()
and perf_mmap_close() since you have put that comment. In any case, below
sequence of operations triggers a splat when perf_mmap_close() tries to
access event->rb, event->pmu etc. which were already freed by
__pmu_detach_event().
Sequence:
  Kernel                          Userspace
  ------                          ---------
  perf_pmu_register()
                                  fd = perf_event_open()
                                  p = mmap(fd)
  perf_pmu_unregister()
                                  munmap(p)
                                  close(fd)
Splat:
BUG: kernel NULL pointer dereference, address: 0000000000000088
#PF: supervisor read access in kernel mode
#PF: error_code(0x0000) - not-present page
PGD 105f90067 P4D 105f90067 PUD 11a94a067 PMD 0
Oops: Oops: 0000 [#1] PREEMPT SMP NOPTI
CPU: 49 UID: 0 PID: 3456 Comm: perf-event-mmap Tainted: G OE 6.12.0-vanilla-dirty #273
Tainted: [O]=OOT_MODULE, [E]=UNSIGNED_MODULE
Hardware name: Dell Inc. PowerEdge R6515/07PXPY, BIOS 2.7.3 03/31/2022
RIP: 0010:perf_mmap_close+0x69/0x316
Code: [...]
RSP: 0018:ffffc90003773970 EFLAGS: 00010246
RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000
RDX: ffff888125c2d400 RSI: ffff88bf7c8a1900 RDI: ffff888125c2d400
RBP: ffff888103ccaf40 R08: 0000000000000000 R09: 0000000000000000
R10: 3030303030303030 R11: 3030303030303030 R12: ffff88811f58d080
R13: ffffc90003773a70 R14: ffffc90003773a28 R15: 00007fcc1df1d000
FS: 00007fcc1e72e6c0(0000) GS:ffff88bf7c880000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000088 CR3: 000000010396c003 CR4: 0000000000f70ef0
PKRU: 55555554
Call Trace:
<TASK>
? __die_body.cold+0x19/0x27
? page_fault_oops+0x15a/0x2f0
? exc_page_fault+0x7e/0x180
? asm_exc_page_fault+0x26/0x30
? perf_mmap_close+0x69/0x316
remove_vma+0x2f/0x70
vms_complete_munmap_vmas+0xdc/0x190
do_vmi_align_munmap+0x1d7/0x250
do_vmi_munmap+0xd0/0x180
__vm_munmap+0xa2/0x170
? hrtimer_start_range_ns+0x26f/0x3b0
__x64_sys_munmap+0x1b/0x30
do_syscall_64+0x82/0x160
? srso_alias_return_thunk+0x5/0xfbef5
? tty_insert_flip_string_and_push_buffer+0x8d/0xc0
? srso_alias_return_thunk+0x5/0xfbef5
? remove_wait_queue+0x24/0x60
? srso_alias_return_thunk+0x5/0xfbef5
? n_tty_write+0x36f/0x520
? srso_alias_return_thunk+0x5/0xfbef5
? __wake_up+0x44/0x60
? srso_alias_return_thunk+0x5/0xfbef5
? file_tty_write.isra.0+0x20c/0x2c0
? srso_alias_return_thunk+0x5/0xfbef5
? vfs_write+0x290/0x450
? srso_alias_return_thunk+0x5/0xfbef5
? srso_alias_return_thunk+0x5/0xfbef5
? syscall_exit_to_user_mode+0x10/0x210
? srso_alias_return_thunk+0x5/0xfbef5
? do_syscall_64+0x8e/0x160
? __rseq_handle_notify_resume+0xa6/0x4e0
? __pfx_hrtimer_wakeup+0x10/0x10
? srso_alias_return_thunk+0x5/0xfbef5
? syscall_exit_to_user_mode+0x1d5/0x210
? srso_alias_return_thunk+0x5/0xfbef5
? do_syscall_64+0x8e/0x160
? exc_page_fault+0x7e/0x180
entry_SYSCALL_64_after_hwframe+0x76/0x7e
RIP: 0033:0x7fcc1e849eab
Code: [...]
RSP: 002b:00007fcc1e72de08 EFLAGS: 00000202 ORIG_RAX: 000000000000000b
RAX: ffffffffffffffda RBX: 00007fcc1e72ecdc RCX: 00007fcc1e849eab
RDX: 0000000000011000 RSI: 0000000000011000 RDI: 00007fcc1df1d000
RBP: 00007fcc1e72dec0 R08: 0000000000000000 R09: 00000000ffffffff
R10: 0000000000000000 R11: 0000000000000202 R12: 00007fcc1e72e6c0
R13: ffffffffffffff88 R14: 0000000000000000 R15: 00007ffd1a4ce000
</TASK>
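For completeness, the userspace side of that sequence is just the usual
mmap dance; a minimal sketch (error handling omitted; repro() and 'type'
are illustrative, with 'type' being the test pmu's dynamic type as read
from sysfs):

	#include <linux/perf_event.h>
	#include <sys/syscall.h>
	#include <sys/mman.h>
	#include <string.h>
	#include <unistd.h>

	static int repro(int type)
	{
		struct perf_event_attr attr;
		size_t len = (1 + 8) * 4096;	/* user page + 8 data pages */
		void *p;
		int fd;

		memset(&attr, 0, sizeof(attr));
		attr.size = sizeof(attr);
		attr.type = type;		/* the pmu about to be unregistered */

		fd = syscall(__NR_perf_event_open, &attr, -1, 0, -1, 0);
		p  = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

		/* ... perf_pmu_unregister() runs here ... */

		munmap(p, len);		/* perf_mmap_close() trips over the freed bits */
		close(fd);
		return 0;
	}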
Oh sorry, I seem to have missed this email :/

On Mon, Nov 25, 2024 at 09:40:28AM +0530, Ravi Bangoria wrote:
> Hi Peter,
>
> > @@ -6507,6 +6540,7 @@ static void perf_mmap_close(struct vm_ar
> > 		unsigned long size = perf_data_size(rb);
> > 		bool detach_rest = false;
> >
> > +	/* FIXIES vs perf_pmu_unregister() */
> > 	if (event->pmu->event_unmapped)
> > 		event->pmu->event_unmapped(event, vma->vm_mm);
>
> I assume you are already aware of the race between __pmu_detach_event()
> and perf_mmap_close() since you have put that comment.

That comment was mostly about how we can't fix up the whole
->event_unmapped() thing and have to abort pmu_unregister for it.

> In any case, below sequence of operations triggers a splat when
> perf_mmap_close() tries to access event->rb, event->pmu etc. which
> were already freed by __pmu_detach_event().
>
> Sequence:
>
>   Kernel                          Userspace
>   ------                          ---------
>   perf_pmu_register()
>                                   fd = perf_event_open()
>                                   p = mmap(fd)
>   perf_pmu_unregister()
>                                   munmap(p)
>                                   close(fd)

Right, let me go have a look. Thanks!
On Tue, Dec 17, 2024 at 10:12:16AM +0100, Peter Zijlstra wrote:
>
>
> Oh sorry, I seem to have missed this email :/
>
> On Mon, Nov 25, 2024 at 09:40:28AM +0530, Ravi Bangoria wrote:
> > Hi Peter,
> >
> > > @@ -6507,6 +6540,7 @@ static void perf_mmap_close(struct vm_ar
> > > unsigned long size = perf_data_size(rb);
> > > bool detach_rest = false;
> > >
> > > + /* FIXIES vs perf_pmu_unregister() */
> > > if (event->pmu->event_unmapped)
> > > event->pmu->event_unmapped(event, vma->vm_mm);
> >
> > I assume you are already aware of the race between __pmu_detach_event()
> > and perf_mmap_close() since you have put that comment.
>
> That comment was mostly about how we can't fix up the whole
> ->event_unmapped() thing and have to abort pmu_unregister for it.
>
> > In any case, below sequence of operations triggers a splat when
> > perf_mmap_close() tries to access event->rb, event->pmu etc. which
> > were already freed by __pmu_detach_event().
> >
> > Sequence:
> >
> > Kernel Userspace
> > ------ ---------
> > perf_pmu_register()
> > fd = perf_event_open()
> > p = mmap(fd)
> > perf_pmu_unregister()
> > munmap(p)
> > close(fd)
>
> Right, let me go have a look. Thanks!
Bah, that's a right mess indeed, however did I miss all that.
The easiest solution is probably to leave the RB around on detach, but
now I need to remember why I did that in the first place :/
Oh.. I think I mostly did that to serialize against perf_mmap(), which
should reject creating further maps. But we can do that without actually
detaching the RB -- we only need to acquire and release mmap_mutex.
Ah, there's that perf_event_stop() inside of ring_buffer_attach(), that
must not happen after detach, obviously. So that must be dealt with.
Hmm, also if we leave ->rb around, then we need to deal with
perf_event_set_output(), someone could try and redirect their things
into our buffer -- which isn't technically broken, but still weird.
Something like the below.
How did you test; perf-fuzzer or something?
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1742,7 +1742,7 @@ static inline bool needs_branch_stack(st
static inline bool has_aux(struct perf_event *event)
{
- return event->pmu->setup_aux;
+ return event->pmu && event->pmu->setup_aux;
}
static inline bool has_aux_action(struct perf_event *event)
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5409,7 +5409,6 @@ static void _free_event(struct perf_even
security_perf_event_free(event);
if (event->rb) {
- WARN_ON_ONCE(!event->pmu);
/*
* Can happen when we close an event with re-directed output.
*
@@ -12023,7 +12022,10 @@ static void __pmu_detach_event(struct pm
*/
scoped_guard (mutex, &event->mmap_mutex) {
WARN_ON_ONCE(pmu->event_unmapped);
- ring_buffer_attach(event, NULL);
+ /*
+ * Mostly an empy lock sequence, such that perf_mmap(), which
+ * relies on mmap_mutex, is sure to observe the state change.
+ */
}
perf_event_free_bpf_prog(event);
@@ -12823,6 +12825,9 @@ perf_event_set_output(struct perf_event
goto unlock;
if (output_event) {
+ if (output_event->state <= PERF_EVENT_STATE_REVOKED)
+ goto unlock;
+
/* get the rb we want to redirect to */
rb = ring_buffer_get(output_event);
if (!rb)
Hi Peter,
>>> In any case, below sequence of operations triggers a splat when
>>> perf_mmap_close() tries to access event->rb, event->pmu etc. which
>>> were already freed by __pmu_detach_event().
>>>
>>> Sequence:
>>>
>>> Kernel Userspace
>>> ------ ---------
>>> perf_pmu_register()
>>> fd = perf_event_open()
>>> p = mmap(fd)
>>> perf_pmu_unregister()
>>> munmap(p)
>>> close(fd)
>>
>> Right, let me go have a look. Thanks!
>
> Bah, that's a right mess indeed, however did I miss all that.
>
> The easiest solution is probably to leave the RB around on detach, but
> now I need to remember why I did that in the first place :/
>
> Oh.. I think I mostly that to serialize against perf_mmap(), which
> should reject creating further maps. But we can do that without actually
> detaching the RB -- we only need to acquire and release mmap_mutex.
>
> Ah, there's that perf_event_stop() inside of ring_buffer_attach(), that
> must not happen after detach, obviously. So that must be dealt with.
>
> Hmm, also if we leave ->rb around, then we need to deal with
> perf_event_set_output(), someone could try and redirect their things
> into our buffer -- which isn't technically broken, but still weird.
>
> Something like the below.
>
> How did you test; perf-fuzzer or something?
Prepared a simple test that does pmu register(), unregister() and
"perf record" in parallel. It's quite dirty, I'll clean it up and
share it here.
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -1742,7 +1742,7 @@ static inline bool needs_branch_stack(st
>
> static inline bool has_aux(struct perf_event *event)
> {
> - return event->pmu->setup_aux;
> + return event->pmu && event->pmu->setup_aux;
> }
>
> static inline bool has_aux_action(struct perf_event *event)
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -5409,7 +5409,6 @@ static void _free_event(struct perf_even
> security_perf_event_free(event);
>
> if (event->rb) {
> - WARN_ON_ONCE(!event->pmu);
> /*
> * Can happen when we close an event with re-directed output.
> *
> @@ -12023,7 +12022,10 @@ static void __pmu_detach_event(struct pm
> */
> scoped_guard (mutex, &event->mmap_mutex) {
> WARN_ON_ONCE(pmu->event_unmapped);
> - ring_buffer_attach(event, NULL);
> + /*
> + * Mostly an empy lock sequence, such that perf_mmap(), which
> + * relies on mmap_mutex, is sure to observe the state change.
> + */
> }
>
> perf_event_free_bpf_prog(event);
> @@ -12823,6 +12825,9 @@ perf_event_set_output(struct perf_event
> goto unlock;
>
> if (output_event) {
> + if (output_event->state <= PERF_EVENT_STATE_REVOKED)
> + goto unlock;
> +
> /* get the rb we want to redirect to */
> rb = ring_buffer_get(output_event);
> if (!rb)
I needed this additional diff on top of your change. With this, it survives
my test. perf_mmap_close() change seems correct. Not sure about perf_mmap().
I'll inspect the code further.
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6540,7 +6540,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)
bool detach_rest = false;
/* FIXIES vs perf_pmu_unregister() */
- if (event->pmu->event_unmapped)
+ if (event->pmu && event->pmu->event_unmapped)
event->pmu->event_unmapped(event, vma->vm_mm);
/*
@@ -6873,7 +6873,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP);
vma->vm_ops = &perf_mmap_vmops;
- if (!ret && event->pmu->event_mapped)
+ if (!ret && event->pmu && event->pmu->event_mapped)
event->pmu->event_mapped(event, vma->vm_mm);
return ret;
Thanks,
Ravi
Hi Peter,
Sorry for the delay. Was on vacation.
>>>> In any case, below sequence of operations triggers a splat when
>>>> perf_mmap_close() tries to access event->rb, event->pmu etc. which
>>>> were already freed by __pmu_detach_event().
>>>>
>>>> Sequence:
>>>>
>>>> Kernel Userspace
>>>> ------ ---------
>>>> perf_pmu_register()
>>>> fd = perf_event_open()
>>>> p = mmap(fd)
>>>> perf_pmu_unregister()
>>>> munmap(p)
>>>> close(fd)
>>>
>>> Right, let me go have a look. Thanks!
>>
>> Bah, that's a right mess indeed, however did I miss all that.
>>
>> The easiest solution is probably to leave the RB around on detach, but
>> now I need to remember why I did that in the first place :/
>>
>> Oh.. I think I mostly that to serialize against perf_mmap(), which
>> should reject creating further maps. But we can do that without actually
>> detaching the RB -- we only need to acquire and release mmap_mutex.
>>
>> Ah, there's that perf_event_stop() inside of ring_buffer_attach(), that
>> must not happen after detach, obviously. So that must be dealt with.
>>
>> Hmm, also if we leave ->rb around, then we need to deal with
>> perf_event_set_output(), someone could try and redirect their things
>> into our buffer -- which isn't technically broken, but still weird.
>>
>> Something like the below.
>>
>> How did you test; perf-fuzzer or something?
>
> Prepared a simple test that does pmu register(), unregister() and
> "perf record" in parallel. It's quite dirty, I'll clean it up and
> share it here.
>
>> --- a/include/linux/perf_event.h
>> +++ b/include/linux/perf_event.h
>> @@ -1742,7 +1742,7 @@ static inline bool needs_branch_stack(st
>>
>> static inline bool has_aux(struct perf_event *event)
>> {
>> - return event->pmu->setup_aux;
>> + return event->pmu && event->pmu->setup_aux;
>> }
>>
>> static inline bool has_aux_action(struct perf_event *event)
>> --- a/kernel/events/core.c
>> +++ b/kernel/events/core.c
>> @@ -5409,7 +5409,6 @@ static void _free_event(struct perf_even
>> security_perf_event_free(event);
>>
>> if (event->rb) {
>> - WARN_ON_ONCE(!event->pmu);
>> /*
>> * Can happen when we close an event with re-directed output.
>> *
>> @@ -12023,7 +12022,10 @@ static void __pmu_detach_event(struct pm
>> */
>> scoped_guard (mutex, &event->mmap_mutex) {
>> WARN_ON_ONCE(pmu->event_unmapped);
>> - ring_buffer_attach(event, NULL);
>> + /*
>> + * Mostly an empy lock sequence, such that perf_mmap(), which
>> + * relies on mmap_mutex, is sure to observe the state change.
>> + */
>> }
>>
>> perf_event_free_bpf_prog(event);
>> @@ -12823,6 +12825,9 @@ perf_event_set_output(struct perf_event
>> goto unlock;
>>
>> if (output_event) {
>> + if (output_event->state <= PERF_EVENT_STATE_REVOKED)
>> + goto unlock;
>> +
>> /* get the rb we want to redirect to */
>> rb = ring_buffer_get(output_event);
>> if (!rb)
>
> I needed this additional diff on top of your change. With this, it survives
> my test. perf_mmap_close() change seems correct. Not sure about perf_mmap().
> I'll inspect the code further.
>
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -6540,7 +6540,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)
> bool detach_rest = false;
>
> /* FIXIES vs perf_pmu_unregister() */
> - if (event->pmu->event_unmapped)
> + if (event->pmu && event->pmu->event_unmapped)
> event->pmu->event_unmapped(event, vma->vm_mm);
>
> /*
> @@ -6873,7 +6873,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
> vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP);
> vma->vm_ops = &perf_mmap_vmops;
>
> - if (!ret && event->pmu->event_mapped)
> + if (!ret && event->pmu && event->pmu->event_mapped)
> event->pmu->event_mapped(event, vma->vm_mm);
>
> return ret;
Both of these are incorrect. They just reduce the race window; they don't
actually solve the race. Anyway, I could spot a few other races:
1) A race between event creation and perf_pmu_unregister(). Any event
creation code path (perf_event_open(), perf_event_create_kernel_counter()
and inherit_event()) allocates the event with perf_event_alloc(), which
adds it to the pmu->events list. However, the event is still immature;
for example, event->ctx is still NULL. In the meantime, perf_pmu_unregister()
finds this event and tries to detach it.
  perf_event_open()                               perf_pmu_unregister()
    event = perf_event_alloc()                      pmu_detach_event(event)
      list_add(&event->pmu_list, &pmu->events);       perf_event_ctx_lock(event)
    /*                                                  perf_event_ctx_lock_nested(ctx)
     * event->ctx is NULL.                                ctx = READ_ONCE(event->ctx); /* event->ctx is NULL */
     */                                                   if (!refcount_inc_not_zero(&ctx->refcount)) { /* Crash */
    perf_install_in_context(ctx, event);
2) A race with perf_event_release_kernel(). perf_event_release_kernel()
prepares a separate "free_list" of all child events under ctx->mutex
and event->child_mutex. However, the "free_list" uses the same
"event->child_list" for entries. OTOH, perf_pmu_unregister() ultimately
calls __perf_remove_from_context() with DETACH_CHILD, which checks if
the event being removed is a child event, and if so, it will try to
detach the child from its parent using list_del_init(&event->child_list);
i.e. two code paths doing list_del on the same list entry.
  perf_event_release_kernel()                          perf_pmu_unregister()
    /* Move children events to free_list */              ...
    list_for_each_entry_safe(child, tmp, &free_list, child_list) {
                                                          perf_remove_from_context() /* with DETACH_CHILD */
      ...                                                   __perf_remove_from_context()
      list_del(&child->child_list);                           perf_child_detach()
                                                                list_del_init(&event->child_list);
3) A WARN(), not a race. perf_pmu_unregister() increments event->refcount
before detaching the event. If perf_pmu_unregister() picks up a child
event, perf_event_exit_event() called through perf_pmu_unregister()
will try to free it. Since event->refcount would be 2, free_event()
will trigger a WARN().
  perf_pmu_unregister()
    event = pmu_get_event()		/* event->refcount => 2 */
    ...
    perf_event_exit_event()
      if (parent_event) {		/* true, because `event` is a child */
        free_event(event);
          if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1,
                   "unexpected event refcount: %ld; ptr=%p\n",
                   atomic_long_read(&event->refcount), event))
4) A race with perf_event_set_bpf_prog(). perf_event_set_bpf_prog() might
be in the process of setting event->prog, whereas perf_pmu_unregister(),
which internally calls perf_event_free_bpf_prog(), will clear the
event->prog pointer.
  perf_pmu_unregister()                 perf_event_set_bpf_prog()
    ...                                   perf_event_set_bpf_handler()
    perf_event_free_bpf_prog()              event->prog = prog;
      event->prog = NULL;
I've yet to inspect other code paths, so there might be more races.
Thinking out loud, a plausible brute-force solution is to introduce an
"event specific lock" and acquire it right at the beginning of all code
paths and release it at the end. Such an event->lock shouldn't create any
contention, since an event would mostly be going through only one code
path at any point in time.
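Something like the following, say (sketch only; 'unreg_lock' is a
hypothetical new perf_event member, and the body stands for whatever the
entry point currently does):

	static long _perf_ioctl(struct perf_event *event, unsigned int cmd,
				unsigned long arg)
	{
		/*
		 * Hypothetical per-event lock, held by every entry point
		 * that may dereference event->pmu, and by
		 * __pmu_detach_event() while it revokes the event.
		 */
		guard(mutex)(&event->unreg_lock);

		if (event->state <= PERF_EVENT_STATE_REVOKED)
			return -ENODEV;

		/* ... existing ioctl body ... */
		return 0;
	}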
Thanks,
Ravi
On Fri, Jan 03, 2025 at 09:54:09AM +0530, Ravi Bangoria wrote:
> Hi Peter,
>
> Sorry for the delay. Was on vacation.
Yeah, me too :-)
> Both of these are incorrect. They just reduce the race window, doesn't
> actually solve the race. Anyway, I could spot few other races:
>
> 1) A race between event creation and perf_pmu_unregister(). Any event
> create code path (perf_event_open(), perf_event_create_kernel_counter()
> and inherit_event()) allocates event with perf_event_alloc() which adds
> an event to the pmu->events list. However, the event is still immature,
> for ex, event->ctx is still NULL. In the mean time, perf_pmu_unregister()
> finds this event and tries to detach it.
>
> perf_event_open() perf_pmu_unregister()
> event = perf_event_alloc() pmu_detach_event(event)
> list_add(&event->pmu_list, &pmu->events); perf_event_ctx_lock(event)
> /* perf_event_ctx_lock_nested(ctx)
> * event->ctx is NULL. ctx = READ_ONCE(event->ctx); /* event->ctx is NULL */
> */ if (!refcount_inc_not_zero(&ctx->refcount)) { /* Crash */
> perf_install_in_context(ctx, event);
Ah, that puts the lie to the guard(srcu) comment there, doesn't it :/
So the intent was for that SRCU section to cover the creation, so that
perf_pmu_unregister() can take out the pmu to avoid creating more
events, then srcu-sync to wait on all in-progress creation and then go
detach everything.
I suppose the simplest thing here is to grow that SRCU section.
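Roughly this shape, I suppose (sketch of the idea only, not a patch; the
perf_event_open() internals are heavily abbreviated):

	scoped_guard (srcu, &pmus_srcu) {
		event = perf_event_alloc(...);	/* puts the event on pmu->events */
		/* ... find_get_context(), set event->ctx ... */
		perf_install_in_context(ctx, event, event->cpu);
	}

That way, once perf_pmu_unregister()'s synchronize_srcu() returns,
anything still on pmu->events is fully formed.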
> 2) A race with perf_event_release_kernel(). perf_event_release_kernel()
> prepares a separate "free_list" of all children events under ctx->mutex
> and event->child_mutex. However, the "free_list" uses the same
> "event->child_list" for entries. OTOH, perf_pmu_unregister() ultimately
> calls __perf_remove_from_context() with DETACH_CHILD, which checks if
> the event being removed is a child event, and if so, it will try to
> detach the child from parent using list_del_init(&event->child_list);
> i.e. two code path doing list_del on the same list entry.
>
> perf_event_release_kernel() perf_pmu_unregister()
> /* Move children events to free_list */ ...
> list_for_each_entry_safe(child, tmp, &free_list, child_list) { perf_remove_from_context() /* with DETACH_CHILD */
> ... __perf_remove_from_context()
> list_del(&child->child_list); perf_child_detach()
> list_del_init(&event->child_list);
Bah, I had figured it was taken care of, because perf_event_exit_event()
has a similar race. I'll try and figure out what to do there.
> 3) A WARN(), not a race. perf_pmu_unregister() increments event->refcount
> before detaching the event. If perf_pmu_unregister() picks up a child
> event, perf_event_exit_event() called through perf_pmu_unregister()
> will try to free it. Since event->refcount would be 2, free_event()
> will trigger a WARN().
>
> perf_pmu_unregister()
> event = pmu_get_event() /* event->refcount => 2 */
> ...
> perf_event_exit_event()
> if (parent_event) { /* true, because `event` is a child */
> free_event(event);
> if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1,
> "unexpected event refcount: %ld; ptr=%p\n",
> atomic_long_read(&event->refcount), event))
I'll make that something like:
if (revoke)
put_event(event);
else
free_event(event);
or so.
> 4) A race with perf_event_set_bpf_prog(). perf_event_set_bpf_prog() might
> be in process of setting event->prog, where as perf_pmu_unregister(),
> which internally calls perf_event_free_bpf_prog(), will clear the
> event->prog pointer.
>
> perf_pmu_unregister() perf_event_set_bpf_prog()
> ... perf_event_set_bpf_handler()
> perf_event_free_bpf_prog() event->prog = prog;
> event->prog = NULL;
>
> I've yet to inspect other code paths, so there might be more races.
Weird, that should be serialized by perf_event_ctx_lock(), both
__pmu_detach_event() and _perf_ioctl() are called under that.
Thanks for going over this!
On Fri, Jan 17, 2025 at 01:03:16AM +0100, Peter Zijlstra wrote:
> > 2) A race with perf_event_release_kernel(). perf_event_release_kernel()
> > prepares a separate "free_list" of all children events under ctx->mutex
> > and event->child_mutex. However, the "free_list" uses the same
> > "event->child_list" for entries. OTOH, perf_pmu_unregister() ultimately
> > calls __perf_remove_from_context() with DETACH_CHILD, which checks if
> > the event being removed is a child event, and if so, it will try to
> > detach the child from parent using list_del_init(&event->child_list);
> > i.e. two code path doing list_del on the same list entry.
> >
> > perf_event_release_kernel() perf_pmu_unregister()
> > /* Move children events to free_list */ ...
> > list_for_each_entry_safe(child, tmp, &free_list, child_list) { perf_remove_from_context() /* with DETACH_CHILD */
> > ... __perf_remove_from_context()
> > list_del(&child->child_list); perf_child_detach()
> > list_del_init(&event->child_list);
>
> Bah, I had figured it was taken care of, because perf_event_exit_event()
> has a similar race. I'll try and figure out what to do there.
So, the problem appears to be that perf_event_release_kernel() does not
use DETACH_CHILD, doing so will clear PERF_ATTACH_CHILD, at which point
the above is fully serialized by parent->child_mutex.
Then the next problem is that since pmu_detach_events() can hold an
extra ref on things, the free_event() from free_list will WARN, like
before.
Easily fixed by making that put_event(), except that messes up the whole
wait_var_event() scheme -- since __free_event() does the final
put_ctx().
This in turn can be fixed by pushing that wake_up_var() nonsense into
put_ctx() itself.
Which then gives me something like so.
But also, I think we can get rid of that free_list entirely.
Anyway, let me go break this up into individual patches and go test
this -- after lunch!
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1229,8 +1229,14 @@ static void put_ctx(struct perf_event_co
if (refcount_dec_and_test(&ctx->refcount)) {
if (ctx->parent_ctx)
put_ctx(ctx->parent_ctx);
- if (ctx->task && ctx->task != TASK_TOMBSTONE)
- put_task_struct(ctx->task);
+ if (ctx->task) {
+ if (ctx->task == TASK_TOMBSTONE) {
+ smp_mb();
+ wake_up_var(&ctx->refcount);
+ } else {
+ put_task_struct(ctx->task);
+ }
+ }
call_rcu(&ctx->rcu_head, free_ctx);
}
}
@@ -5550,8 +5556,6 @@ int perf_event_release_kernel(struct per
again:
mutex_lock(&event->child_mutex);
list_for_each_entry(child, &event->child_list, child_list) {
- void *var = NULL;
-
/*
* Cannot change, child events are not migrated, see the
* comment with perf_event_ctx_lock_nested().
@@ -5584,46 +5588,32 @@ int perf_event_release_kernel(struct per
tmp = list_first_entry_or_null(&event->child_list,
struct perf_event, child_list);
if (tmp == child) {
- perf_remove_from_context(child, DETACH_GROUP);
- list_move(&child->child_list, &free_list);
+ perf_remove_from_context(child, DETACH_GROUP | DETACH_CHILD);
+ /*
+ * Can't risk calling into free_event() here, since
+ * event->destroy() might invert with the currently
+ * held locks, see 82d94856fa22 ("perf/core: Fix lock
+ * inversion between perf,trace,cpuhp")
+ */
+ list_add(&child->child_list, &free_list);
/*
* This matches the refcount bump in inherit_event();
* this can't be the last reference.
*/
put_event(event);
- } else {
- var = &ctx->refcount;
}
mutex_unlock(&event->child_mutex);
mutex_unlock(&ctx->mutex);
put_ctx(ctx);
- if (var) {
- /*
- * If perf_event_free_task() has deleted all events from the
- * ctx while the child_mutex got released above, make sure to
- * notify about the preceding put_ctx().
- */
- smp_mb(); /* pairs with wait_var_event() */
- wake_up_var(var);
- }
goto again;
}
mutex_unlock(&event->child_mutex);
list_for_each_entry_safe(child, tmp, &free_list, child_list) {
- void *var = &child->ctx->refcount;
-
list_del(&child->child_list);
- free_event(child);
-
- /*
- * Wake any perf_event_free_task() waiting for this event to be
- * freed.
- */
- smp_mb(); /* pairs with wait_var_event() */
- wake_up_var(var);
+ put_event(child);
}
no_ctx:
On Fri, Jan 17, 2025 at 02:04:23PM +0100, Peter Zijlstra wrote:

> Anyway, let me go break this up into individual patches and go test
> this -- after lunch!

OK, so aside from a few dumb mistakes, the result seems to hold up with
your tinypmu testcase. I left it running for about 30 minutes.

I pushed out the latest patches to queue/perf/pmu-unregister
Hi Peter,

On 18-Jan-25 2:34 AM, Peter Zijlstra wrote:
> On Fri, Jan 17, 2025 at 02:04:23PM +0100, Peter Zijlstra wrote:
>
>> Anyway, let me go break this up into individual patches and go test
>> this -- after lunch!
>
> OK, so aside from a few dumb mistakes, the result seems to hold up with
> your tinypmu testcase. I left it running for about 30 minutes.
>
> I pushed out the latest patches to queue/perf/pmu-unregister

I'll spend some time to go through the changes.

I ran fuzzer over the weekend with latest queue/perf/pmu-unregister and
I saw this kernel crash:

BUG: kernel NULL pointer dereference, address: 00000000000000d0
#PF: supervisor read access in kernel mode
#PF: error_code(0x0000) - not-present page
PGD 12a922067 P4D 0
Oops: Oops: 0000 [#1] PREEMPT SMP NOPTI
CPU: 8 UID: 1002 PID: 8505 Comm: perf_fuzzer Kdump: loaded Tainted: G W O 6.13.0-rc1-pmu-unregister+ #171
Tainted: [W]=WARN, [O]=OOT_MODULE
Hardware name: AMD Corporation RUBY/RUBY, BIOS RRR1009C 07/21/2023
RIP: 0010:perf_mmap_to_page+0x6/0xc0
Code: ...
RSP: 0018:ffa0000003aff910 EFLAGS: 00010206
RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000008
RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000
RBP: ffa0000003aff980 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000
R13: 00007b2b02ca8000 R14: ff1100014f9cc5c0 R15: 0000000000000009
FS:  00007b2b02e03740(0000) GS:ff11001009000000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00000000000000d0 CR3: 000000014f99c001 CR4: 0000000000f71ef0
PKRU: 55555554
Call Trace:
 <TASK>
 ? show_regs+0x6c/0x80
 ? __die+0x24/0x80
 ? page_fault_oops+0x155/0x570
 ? do_user_addr_fault+0x4b2/0x870
 ? srso_alias_return_thunk+0x5/0xfbef5
 ? get_page_from_freelist+0x3c7/0x1680
 ? exc_page_fault+0x82/0x1b0
 ? asm_exc_page_fault+0x27/0x30
 ? perf_mmap_to_page+0x6/0xc0
 ? perf_mmap+0x237/0x710
 __mmap_region+0x6d5/0xb90
 mmap_region+0x8d/0xc0
 do_mmap+0x349/0x630
 vm_mmap_pgoff+0xf4/0x1c0
 ksys_mmap_pgoff+0x177/0x240
 __x64_sys_mmap+0x33/0x70
 x64_sys_call+0x24b9/0x2650
 do_syscall_64+0x7e/0x170
 ? srso_alias_return_thunk+0x5/0xfbef5
 ? srso_alias_return_thunk+0x5/0xfbef5
 ? sched_tick+0x119/0x320
 ? srso_alias_return_thunk+0x5/0xfbef5
 ? sysvec_irq_work+0x4f/0xc0
 ? srso_alias_return_thunk+0x5/0xfbef5
 ? rcu_report_qs_rnp+0xd1/0x140
 ? srso_alias_return_thunk+0x5/0xfbef5
 ? rcu_core+0x1c2/0x380
 ? srso_alias_return_thunk+0x5/0xfbef5
 ? rcu_core_si+0xe/0x20
 ? srso_alias_return_thunk+0x5/0xfbef5
 ? handle_softirqs+0xe7/0x330
 ? srso_alias_return_thunk+0x5/0xfbef5
 ? srso_alias_return_thunk+0x5/0xfbef5
 ? irqentry_exit_to_user_mode+0x43/0x250
 ? srso_alias_return_thunk+0x5/0xfbef5
 ? irqentry_exit+0x43/0x50
 ? srso_alias_return_thunk+0x5/0xfbef5
 ? sysvec_apic_timer_interrupt+0x4f/0xc0
 entry_SYSCALL_64_after_hwframe+0x76/0x7e
RIP: 0033:0x7b2b02b2531c
Code: ...
RSP: 002b:00007ffc3b177880 EFLAGS: 00000246 ORIG_RAX: 0000000000000009
RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007b2b02b2531c
RDX: 0000000000000003 RSI: 0000000000009000 RDI: 0000000000000000
RBP: 00007ffc3b177890 R08: 0000000000000003 R09: 0000000000000000
R10: 0000000000000001 R11: 0000000000000246 R12: 0000000000000001
R13: 0000000000000000 R14: 0000636a69c7bb60 R15: 00007b2b02e55000
 </TASK>
Modules linked in: ...
CR2: 00000000000000d0
---[ end trace 0000000000000000 ]---

Thanks,
Ravi
Hi Peter,
>> 4) A race with perf_event_set_bpf_prog(). perf_event_set_bpf_prog() might
>> be in process of setting event->prog, where as perf_pmu_unregister(),
>> which internally calls perf_event_free_bpf_prog(), will clear the
>> event->prog pointer.
>>
>> perf_pmu_unregister() perf_event_set_bpf_prog()
>> ... perf_event_set_bpf_handler()
>> perf_event_free_bpf_prog() event->prog = prog;
>> event->prog = NULL;
>>
>> I've yet to inspect other code paths, so there might be more races.
>
> Weird, that should be serialized by perf_event_ctx_lock(), both
> __pmu_detach_event() and _perf_ioctl() are called under that.
There are multiple code paths leading to perf_event_set_bpf_prog(). The
one starting from _perf_ioctl() is serialized. However, this is not:
  __sys_bpf()
    link_create()
      bpf_perf_link_attach()
        perf_event_set_bpf_prog()
          perf_event_set_bpf_handler()
            event->prog = prog;
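One way to close that hole, sketched only (mirroring how the ioctl path
is serialized; __perf_event_set_bpf_prog() is a hypothetical rename of
the current function body):

	int perf_event_set_bpf_prog(struct perf_event *event,
				    struct bpf_prog *prog, u64 bpf_cookie)
	{
		struct perf_event_context *ctx;
		int ret;

		ctx = perf_event_ctx_lock(event);
		/* now serialized against __pmu_detach_event() */
		ret = __perf_event_set_bpf_prog(event, prog, bpf_cookie);
		perf_event_ctx_unlock(event, ctx);

		return ret;
	}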
Thanks,
Ravi
On Fri, Jan 17, 2025 at 10:50:26AM +0530, Ravi Bangoria wrote:
> Hi Peter,
>
> >> 4) A race with perf_event_set_bpf_prog(). perf_event_set_bpf_prog() might
> >> be in process of setting event->prog, where as perf_pmu_unregister(),
> >> which internally calls perf_event_free_bpf_prog(), will clear the
> >> event->prog pointer.
> >>
> >>   perf_pmu_unregister()                 perf_event_set_bpf_prog()
> >>     ...                                   perf_event_set_bpf_handler()
> >>     perf_event_free_bpf_prog()              event->prog = prog;
> >>       event->prog = NULL;
> >>
> >> I've yet to inspect other code paths, so there might be more races.
> >
> > Weird, that should be serialized by perf_event_ctx_lock(), both
> > __pmu_detach_event() and _perf_ioctl() are called under that.
>
> There are multiple code paths leading to perf_event_set_bpf_prog(). The
> one starting from _perf_ioctl() is serialized. However, this is not:
>
>   __sys_bpf()
>     link_create()
>       bpf_perf_link_attach()
>         perf_event_set_bpf_prog()
>           perf_event_set_bpf_handler()
>             event->prog = prog;

Urgh yeah, that's broken. Damn bpf stuff :-/
>> How did you test; perf-fuzzer or something?
>
> Prepared a simple test that does pmu register(), unregister() and
> "perf record" in parallel. It's quite dirty, I'll clean it up and
> share it here.
Attaching the testsuite here.
It contains:
- Kernel module that allows a user to register and unregister a pmu
- Shell script that runs "perf record" on the test pmu
- README that explains how to run the test
Thanks,
Ravi

#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
# Author: Ravi Bangoria <ravi.bangoria@amd.com>
# Must not be run directly. Run it via tinypmu-u.sh
if [ "$EUID" -ne 0 ]; then
echo "Please run as root"
exit
fi
while true; do
echo 1 > /dev/tinypmu_register
done
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
# Author: Ravi Bangoria <ravi.bangoria@amd.com>
# Must not be run directly. Run it via tinypmu-u.sh
if [ "$EUID" -ne 0 ]; then
echo "Please run as root"
exit
fi
while true; do
echo 1 > /dev/tinypmu_unregister
done
# SPDX-License-Identifier: GPL-2.0
# Author: Ravi Bangoria <ravi.bangoria@amd.com>
obj-m := tinypmu-k.o
KDIR := /lib/modules/$(shell uname -r)/build
PWD := $(shell pwd)
default:
	$(MAKE) -C $(KDIR) M=$(PWD) modules

clean:
	rm -rf *.o *.mod.c *.mod *.ko *.order *.symvers \.*.o \.*.ko \.*.cmd \.tmp_versions
A simple testsuite to stress test pmu register / unregister code:
https://lore.kernel.org/r/20241104133909.669111662@infradead.org
Build:
$ make
Clean:
$ make clean
Test perf pmu register and unregister:
term1~$ while true; do sudo bash tinypmu-u.sh; done
Test event creation / mmap / unmap / event deletion etc. along with
pmu register / unregister (run this in parallel with the above command):
term2~$ sudo bash tinypmu-u-events.sh
// SPDX-License-Identifier: GPL-2.0
/*
* Provide an interface /dev/tinypmu_register and /dev/tinypmu_unregister to
* stress test perf_pmu_register() and perf_pmu_unregister() functions.
*
* Author: Ravi Bangoria <ravi.bangoria@amd.com>
*/
#define pr_fmt(fmt) "tinypmu: " fmt
#include <linux/module.h>
#include <linux/perf_event.h>
#include <linux/fs.h>
#include <linux/miscdevice.h>
#define NR_TINYPMUS 1
static int tinypmu_event_init(struct perf_event *event)
{
return 0;
}
static void tinypmu_del(struct perf_event *event, int flags)
{
}
static int tinypmu_add(struct perf_event *event, int flags)
{
return 0;
}
static void tinypmu_start(struct perf_event *event, int flags)
{
}
static void tinypmu_stop(struct perf_event *event, int flags)
{
}
static void tinypmu_read(struct perf_event *event)
{
}
PMU_FORMAT_ATTR(event, "config:0-20");
static struct attribute *tinypmu_events_attr[] = {
&format_attr_event.attr,
NULL,
};
static struct attribute_group tinypmu_events_group = {
.name = "format",
.attrs = tinypmu_events_attr,
};
static const struct attribute_group *tinypmu_attr_groups[] = {
&tinypmu_events_group,
NULL,
};
static struct pmu *alloc_tinypmu(void)
{
struct pmu *tinypmu = kzalloc(sizeof(struct pmu), GFP_KERNEL);
if (!tinypmu)
return NULL;
tinypmu->task_ctx_nr = perf_invalid_context;
tinypmu->event_init = tinypmu_event_init;
tinypmu->add = tinypmu_add;
tinypmu->del = tinypmu_del;
tinypmu->start = tinypmu_start;
tinypmu->stop = tinypmu_stop;
tinypmu->read = tinypmu_read;
tinypmu->attr_groups = tinypmu_attr_groups;
return tinypmu;
}
static DEFINE_MUTEX(lock);
static struct pmu *tinypmus[NR_TINYPMUS];
static char pmu_name[NR_TINYPMUS][11];
static void register_pmu(unsigned int idx)
{
struct pmu *temp;
int ret;
if (idx >= NR_TINYPMUS)
return;
temp = alloc_tinypmu();
	if (!temp)
		return;	/* 'lock' is not held yet, nothing to unlock */
mutex_lock(&lock);
if (tinypmus[idx]) {
mutex_unlock(&lock);
kfree(temp);
return;
}
ret = perf_pmu_register(temp, pmu_name[idx], -1);
if (!ret) {
tinypmus[idx] = temp;
mutex_unlock(&lock);
return;
}
mutex_unlock(&lock);
kfree(temp);
return;
}
static void unregister_pmu(unsigned int idx)
{
struct pmu *temp;
if (idx >= NR_TINYPMUS)
return;
mutex_lock(&lock);
if (!tinypmus[idx]) {
mutex_unlock(&lock);
return;
}
	/*
	 * Must call perf_pmu_unregister() inside the critical section. If
	 * the critical section were shrunk by caching tinypmus[idx] in
	 * temp, clearing tinypmus[idx] and doing the unregister after
	 * mutex_unlock(), register_pmu() would assume no pmu exists with
	 * the "tinypmu<idx>" name (since tinypmus[idx] is NULL) and try to
	 * register a new pmu although this function has not yet
	 * unregistered the pmu with the same name.
	 */
perf_pmu_unregister(tinypmus[idx]);
temp = tinypmus[idx];
tinypmus[idx] = NULL;
mutex_unlock(&lock);
kfree(temp);
}
static ssize_t register_write(struct file *f, const char *data, size_t size,
loff_t *ppos)
{
unsigned int idx;
get_random_bytes(&idx, sizeof(idx));
idx %= NR_TINYPMUS;
register_pmu(idx);
return 1;
}
static ssize_t unregister_write(struct file *f, const char *data, size_t size,
loff_t *ppos)
{
unsigned int idx;
get_random_bytes(&idx, sizeof(idx));
idx %= NR_TINYPMUS;
unregister_pmu(idx);
return 1;
}
static const struct file_operations register_fops = {
.owner = THIS_MODULE,
.write = register_write,
};
static const struct file_operations unregister_fops = {
.owner = THIS_MODULE,
.write = unregister_write,
};
static struct miscdevice register_dev = {
MISC_DYNAMIC_MINOR,
"tinypmu_register",
&register_fops,
};
static struct miscdevice unregister_dev = {
MISC_DYNAMIC_MINOR,
"tinypmu_unregister",
&unregister_fops,
};
static int __init hello_init(void)
{
int ret;
int i;
	ret = misc_register(&register_dev);
if (ret) {
pr_err("Failed to register register_dev\n");
return ret;
}
ret = misc_register(&unregister_dev);
if (ret) {
pr_err("Failed to register unregister_dev\n");
		misc_deregister(&register_dev);
return ret;
}
for (i = 0; i < NR_TINYPMUS; i++)
sprintf(pmu_name[i], "tinypmu%d", i);
for (i = 0; i < NR_TINYPMUS; i++)
register_pmu(i);
return 0;
}
module_init(hello_init);
static void __exit hello_exit(void)
{
int i;
for (i = 0; i < NR_TINYPMUS; i++)
unregister_pmu(i);
	misc_deregister(&register_dev);
misc_deregister(&unregister_dev);
}
module_exit(hello_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Ravi Bangoria");
MODULE_DESCRIPTION("PMU register/unregister stress test");
MODULE_VERSION("dev");
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
# Author: Ravi Bangoria <ravi.bangoria@amd.com>
if [ "$EUID" -ne 0 ]; then
echo "Please run as root"
exit
fi
sudo insmod tinypmu-k.ko
cleanup() {
for i in "$@"; do
echo -n "${i} "
kill ${i}
done
wait
rmmod tinypmu_k
rm -rf /dev/tinypmu_register
rm -rf /dev/tinypmu_unregister
echo ""
exit
}
bash _tinypmu-u-register.sh &
reg_pid=$!
bash _tinypmu-u-unregister.sh &
unreg_pid=$!
# register Ctrl+C cleanup if aborted inbetween
#trap "cleanup '${reg_pid}' '${unreg_pid}' ${event_pids[@]}" 2
echo ${reg_pid} ${unreg_pid}
sleep 10
cleanup ${reg_pid} ${unreg_pid}
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
# Author: Ravi Bangoria <ravi.bangoria@amd.com>
pmu_nr=${1}
if [ "$EUID" -ne 0 ]; then
echo "Please run as root"
exit
fi
while true; do
perf record -a -e tinypmu${pmu_nr}// -- sleep 1 >> /dev/null 2>&1 &
done
On 2024-11-04 8:39 a.m., Peter Zijlstra wrote:
> Previously it was only safe to call perf_pmu_unregister() if there
> were no active events of that pmu around -- which was impossible to
> guarantee since it races all sorts against perf_init_event().
>
> Rework the whole thing by:
>
> - keeping track of all events for a given pmu
>
> - 'hiding' the pmu from perf_init_event()
>
> - waiting for the appropriate (s)rcu grace periods such that all
> prior references to the PMU will be completed
>
> - detaching all still existing events of that pmu (see first point)
> and moving them to a new REVOKED state.
>
> - actually freeing the pmu data.
>
> Where notably the new REVOKED state must inhibit all event actions
> from reaching code that wants to use event->pmu.
>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
> include/linux/perf_event.h | 13 +-
> kernel/events/core.c | 222 ++++++++++++++++++++++++++++++++++++++++-----
> 2 files changed, 210 insertions(+), 25 deletions(-)
>
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -318,6 +318,9 @@ struct perf_output_handle;
> struct pmu {
> struct list_head entry;
>
> + spinlock_t events_lock;
> + struct list_head events;
> +
> struct module *module;
> struct device *dev;
> struct device *parent;
> @@ -611,9 +614,10 @@ struct perf_addr_filter_range {
> * enum perf_event_state - the states of an event:
> */
> enum perf_event_state {
> - PERF_EVENT_STATE_DEAD = -4,
> - PERF_EVENT_STATE_EXIT = -3,
> - PERF_EVENT_STATE_ERROR = -2,
> + PERF_EVENT_STATE_DEAD = -5,
> + PERF_EVENT_STATE_REVOKED = -4, /* pmu gone, must not touch */
> + PERF_EVENT_STATE_EXIT = -3, /* task died, still inherit */
> + PERF_EVENT_STATE_ERROR = -2, /* scheduling error, can enable */
> PERF_EVENT_STATE_OFF = -1,
> PERF_EVENT_STATE_INACTIVE = 0,
> PERF_EVENT_STATE_ACTIVE = 1,
> @@ -854,6 +858,7 @@ struct perf_event {
> void *security;
> #endif
> struct list_head sb_list;
> + struct list_head pmu_list;
>
> /*
> * Certain events gets forwarded to another pmu internally by over-
> @@ -1105,7 +1110,7 @@ extern void perf_aux_output_flag(struct
> extern void perf_event_itrace_started(struct perf_event *event);
>
> extern int perf_pmu_register(struct pmu *pmu, const char *name, int type);
> -extern void perf_pmu_unregister(struct pmu *pmu);
> +extern int perf_pmu_unregister(struct pmu *pmu);
>
> extern void __perf_event_task_sched_in(struct task_struct *prev,
> struct task_struct *task);
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -2412,7 +2412,9 @@ ctx_time_update_event(struct perf_event_
>
> #define DETACH_GROUP 0x01UL
> #define DETACH_CHILD 0x02UL
> -#define DETACH_DEAD 0x04UL
> +#define DETACH_EXIT 0x04UL
> +#define DETACH_REVOKE 0x08UL
> +#define DETACH_DEAD 0x10UL
>
> /*
> * Cross CPU call to remove a performance event
> @@ -2427,6 +2429,7 @@ __perf_remove_from_context(struct perf_e
> void *info)
> {
> struct perf_event_pmu_context *pmu_ctx = event->pmu_ctx;
> + enum perf_event_state state = PERF_EVENT_STATE_OFF;
Setting PERF_EVENT_STATE_OFF as the default seems dangerous.
If the event was in an error state, the state will be overwritten with
PERF_EVENT_STATE_OFF later.
One example may be perf_pmu_migrate_context(). After the migration,
it looks like all the error state will be cleared.
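Concretely, as a sketch of the flow in question (based on the hunk above,
not on tested code):

	/* event->state == PERF_EVENT_STATE_ERROR */
	perf_pmu_migrate_context()
	  perf_remove_from_context(event, 0)
	    __perf_remove_from_context()
	      state = PERF_EVENT_STATE_OFF;	/* default, no DETACH_* flag */
	      ...
	      event->state = state;		/* ERROR silently becomes OFF */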
Thanks,
Kan
> unsigned long flags = (unsigned long)info;
>
> ctx_time_update(cpuctx, ctx);
> @@ -2435,16 +2438,22 @@ __perf_remove_from_context(struct perf_e
> * Ensure event_sched_out() switches to OFF, at the very least
> * this avoids raising perf_pending_task() at this time.
> */
> - if (flags & DETACH_DEAD)
> + if (flags & DETACH_EXIT)
> + state = PERF_EVENT_STATE_EXIT;
> + if (flags & DETACH_REVOKE)
> + state = PERF_EVENT_STATE_REVOKED;
> + if (flags & DETACH_DEAD) {
> event->pending_disable = 1;
> + state = PERF_EVENT_STATE_DEAD;
> + }
> event_sched_out(event, ctx);
> if (flags & DETACH_GROUP)
> perf_group_detach(event);
> if (flags & DETACH_CHILD)
> perf_child_detach(event);
> list_del_event(event, ctx);
> - if (flags & DETACH_DEAD)
> - event->state = PERF_EVENT_STATE_DEAD;
> +
> + event->state = state;
>
> if (!pmu_ctx->nr_events) {
> pmu_ctx->rotate_necessary = 0;
> @@ -4511,7 +4520,8 @@ static void perf_event_enable_on_exec(st
>
> static void perf_remove_from_owner(struct perf_event *event);
> static void perf_event_exit_event(struct perf_event *event,
> - struct perf_event_context *ctx);
> + struct perf_event_context *ctx,
> + bool revoke);
>
> /*
> * Removes all events from the current task that have been marked
> @@ -4538,7 +4548,7 @@ static void perf_event_remove_on_exec(st
>
> modified = true;
>
> - perf_event_exit_event(event, ctx);
> + perf_event_exit_event(event, ctx, false);
> }
>
> raw_spin_lock_irqsave(&ctx->lock, flags);
> @@ -5138,6 +5148,7 @@ static bool is_sb_event(struct perf_even
> attr->context_switch || attr->text_poke ||
> attr->bpf_event)
> return true;
> +
> return false;
> }
>
> @@ -5339,6 +5350,8 @@ static void perf_pending_task_sync(struc
> /* vs perf_event_alloc() error */
> static void __free_event(struct perf_event *event)
> {
> + struct pmu *pmu = event->pmu;
> +
> if (event->attach_state & PERF_ATTACH_CALLCHAIN)
> put_callchain_buffers();
>
> @@ -5365,6 +5378,7 @@ static void __free_event(struct perf_eve
> * put_pmu_ctx() needs an event->ctx reference, because of
> * epc->ctx.
> */
> + WARN_ON_ONCE(!pmu);
> WARN_ON_ONCE(!event->ctx);
> WARN_ON_ONCE(event->pmu_ctx->ctx != event->ctx);
> put_pmu_ctx(event->pmu_ctx);
> @@ -5377,8 +5391,13 @@ static void __free_event(struct perf_eve
> if (event->ctx)
> put_ctx(event->ctx);
>
> - if (event->pmu)
> - module_put(event->pmu->module);
> + if (pmu) {
> + module_put(pmu->module);
> + scoped_guard (spinlock, &pmu->events_lock) {
> + list_del(&event->pmu_list);
> + wake_up_var(pmu);
> + }
> + }
>
> call_rcu(&event->rcu_head, free_event_rcu);
> }
> @@ -5397,6 +5416,7 @@ static void _free_event(struct perf_even
> security_perf_event_free(event);
>
> if (event->rb) {
> + WARN_ON_ONCE(!event->pmu);
> /*
> * Can happen when we close an event with re-directed output.
> *
> @@ -5527,7 +5547,11 @@ int perf_event_release_kernel(struct per
> * Thus this guarantees that we will in fact observe and kill _ALL_
> * child events.
> */
> - perf_remove_from_context(event, DETACH_GROUP|DETACH_DEAD);
> + if (event->state > PERF_EVENT_STATE_REVOKED) {
> + perf_remove_from_context(event, DETACH_GROUP|DETACH_DEAD);
> + } else {
> + event->state = PERF_EVENT_STATE_DEAD;
> + }
>
> perf_event_ctx_unlock(event, ctx);
>
> @@ -5838,7 +5862,7 @@ __perf_read(struct perf_event *event, ch
> * error state (i.e. because it was pinned but it couldn't be
> * scheduled on to the CPU at some point).
> */
> - if (event->state == PERF_EVENT_STATE_ERROR)
> + if (event->state <= PERF_EVENT_STATE_ERROR)
> return 0;
>
> if (count < event->read_size)
> @@ -5877,8 +5901,14 @@ static __poll_t perf_poll(struct file *f
> struct perf_buffer *rb;
> __poll_t events = EPOLLHUP;
>
> + if (event->state <= PERF_EVENT_STATE_REVOKED)
> + return EPOLLERR;
> +
> poll_wait(file, &event->waitq, wait);
>
> + if (event->state <= PERF_EVENT_STATE_REVOKED)
> + return EPOLLERR;
> +
> if (is_event_hup(event))
> return events;
>
> @@ -6058,6 +6088,9 @@ static long _perf_ioctl(struct perf_even
> void (*func)(struct perf_event *);
> u32 flags = arg;
>
> + if (event->state <= PERF_EVENT_STATE_REVOKED)
> + return -ENODEV;
> +
> switch (cmd) {
> case PERF_EVENT_IOC_ENABLE:
> func = _perf_event_enable;
> @@ -6507,6 +6540,7 @@ static void perf_mmap_close(struct vm_ar
> unsigned long size = perf_data_size(rb);
> bool detach_rest = false;
>
> + /* FIXIES vs perf_pmu_unregister() */
> if (event->pmu->event_unmapped)
> event->pmu->event_unmapped(event, vma->vm_mm);
>
> @@ -6657,6 +6691,16 @@ static int perf_mmap(struct file *file,
> mutex_lock(&event->mmap_mutex);
> ret = -EINVAL;
>
> + /*
> + * This relies on __pmu_detach_event() taking mmap_mutex after marking
> + * the event REVOKED. Either we observe the state, or __pmu_detach_event()
> + * will detach the rb created here.
> + */
> + if (event->state <= PERF_EVENT_STATE_REVOKED) {
> + ret = -ENODEV;
> + goto unlock;
> + }
> +
> if (vma->vm_pgoff == 0) {
> nr_pages -= 1;
>
> @@ -6840,6 +6884,9 @@ static int perf_fasync(int fd, struct fi
> struct perf_event *event = filp->private_data;
> int retval;
>
> + if (event->state <= PERF_EVENT_STATE_REVOKED)
> + return -ENODEV;
> +
> inode_lock(inode);
> retval = fasync_helper(fd, filp, on, &event->fasync);
> inode_unlock(inode);
> @@ -11892,6 +11939,9 @@ int perf_pmu_register(struct pmu *_pmu,
> if (!pmu->event_idx)
> pmu->event_idx = perf_event_idx_default;
>
> + INIT_LIST_HEAD(&pmu->events);
> + spin_lock_init(&pmu->events_lock);
> +
> /*
> * Now that the PMU is complete, make it visible to perf_try_init_event().
> */
> @@ -11905,11 +11955,100 @@ int perf_pmu_register(struct pmu *_pmu,
> }
> EXPORT_SYMBOL_GPL(perf_pmu_register);
>
> -void perf_pmu_unregister(struct pmu *pmu)
> +static void __pmu_detach_event(struct pmu *pmu, struct perf_event *event,
> + struct perf_event_context *ctx)
> +{
> + /*
> + * De-schedule the event and mark it REVOKED.
> + */
> + perf_event_exit_event(event, ctx, true);
> +
> + /*
> + * All _free_event() bits that rely on event->pmu:
> + *
> + * Notably, perf_mmap() relies on the ordering here.
> + */
> + scoped_guard (mutex, &event->mmap_mutex) {
> + WARN_ON_ONCE(pmu->event_unmapped);
> + ring_buffer_attach(event, NULL);
> + }
> +
> + perf_event_free_bpf_prog(event);
> + perf_free_addr_filters(event);
> +
> + if (event->destroy) {
> + event->destroy(event);
> + event->destroy = NULL;
> + }
> +
> + if (event->pmu_ctx) {
> + put_pmu_ctx(event->pmu_ctx);
> + event->pmu_ctx = NULL;
> + }
> +
> + exclusive_event_destroy(event);
> + module_put(pmu->module);
> +
> + event->pmu = NULL; /* force fault instead of UAF */
> +}
> +
> +static void pmu_detach_event(struct pmu *pmu, struct perf_event *event)
> +{
> + struct perf_event_context *ctx;
> +
> + ctx = perf_event_ctx_lock(event);
> + __pmu_detach_event(pmu, event, ctx);
> + perf_event_ctx_unlock(event, ctx);
> +
> + scoped_guard (spinlock, &pmu->events_lock)
> + list_del(&event->pmu_list);
> +}
> +
> +static struct perf_event *pmu_get_event(struct pmu *pmu)
> +{
> + struct perf_event *event;
> +
> + guard(spinlock)(&pmu->events_lock);
> + list_for_each_entry(event, &pmu->events, pmu_list) {
> + if (atomic_long_inc_not_zero(&event->refcount))
> + return event;
> + }
> +
> + return NULL;
> +}
> +
> +static bool pmu_empty(struct pmu *pmu)
> +{
> + guard(spinlock)(&pmu->events_lock);
> + return list_empty(&pmu->events);
> +}
> +
> +static void pmu_detach_events(struct pmu *pmu)
> +{
> + struct perf_event *event;
> +
> + for (;;) {
> + event = pmu_get_event(pmu);
> + if (!event)
> + break;
> +
> + pmu_detach_event(pmu, event);
> + put_event(event);
> + }
> +
> + /*
> + * wait for pending _free_event()s
> + */
> + wait_var_event(pmu, pmu_empty(pmu));
> +}
> +
> +int perf_pmu_unregister(struct pmu *pmu)
> {
> scoped_guard (mutex, &pmus_lock) {
> + if (!idr_cmpxchg(&pmu_idr, pmu->type, pmu, NULL))
> + return -EINVAL;
> +
> list_del_rcu(&pmu->entry);
> - idr_remove(&pmu_idr, pmu->type);
> }
>
> /*
> @@ -11919,7 +12058,31 @@ void perf_pmu_unregister(struct pmu *pmu
> synchronize_srcu(&pmus_srcu);
> synchronize_rcu();
>
> + if (pmu->event_unmapped && !pmu_empty(pmu)) {
> + /*
> + * Can't force remove events when pmu::event_unmapped()
> + * is used in perf_mmap_close().
> + */
> + guard(mutex)(&pmus_lock);
> + idr_cmpxchg(&pmu_idr, pmu->type, NULL, pmu);
> + list_add_rcu(&pmu->entry, &pmus);
> + return -EBUSY;
> + }
> +
> + scoped_guard (mutex, &pmus_lock)
> + idr_remove(&pmu_idr, pmu->type);
> +
> + /*
> + * PMU is removed from the pmus list, so no new events will
> + * be created, now take care of the existing ones.
> + */
> + pmu_detach_events(pmu);
> +
> + /*
> + * PMU is unused, make it go away.
> + */
> perf_pmu_free(pmu);
> + return 0;
> }
> EXPORT_SYMBOL_GPL(perf_pmu_unregister);
>
> @@ -12226,6 +12389,7 @@ perf_event_alloc(struct perf_event_attr
> INIT_LIST_HEAD(&event->active_entry);
> INIT_LIST_HEAD(&event->addr_filters.list);
> INIT_HLIST_NODE(&event->hlist_entry);
> + INIT_LIST_HEAD(&event->pmu_list);
>
>
> init_waitqueue_head(&event->waitq);
> @@ -12294,6 +12458,13 @@ perf_event_alloc(struct perf_event_attr
>
> perf_event__state_init(event);
>
> + /*
> + * Hold SRCU critical section around perf_init_event(), until returning
> + * the fully formed event put on pmu->events_list. This ensures that
> + * perf_pmu_unregister() will see any in-progress event creation that
> + * races.
> + */
> + guard(srcu)(&pmus_srcu);
> pmu = NULL;
>
> hwc = &event->hw;
> @@ -12383,6 +12554,9 @@ perf_event_alloc(struct perf_event_attr
> /* symmetric to unaccount_event() in _free_event() */
> account_event(event);
>
> + scoped_guard (spinlock, &pmu->events_lock)
> + list_add(&event->pmu_list, &pmu->events);
> +
> return_ptr(event);
> }
>
> @@ -12769,6 +12943,10 @@ SYSCALL_DEFINE5(perf_event_open,
> if (err)
> goto err_fd;
> group_leader = fd_file(group)->private_data;
> + if (group_leader->state <= PERF_EVENT_STATE_REVOKED) {
> + err = -ENODEV;
> + goto err_group_fd;
> + }
> if (flags & PERF_FLAG_FD_OUTPUT)
> output_event = group_leader;
> if (flags & PERF_FLAG_FD_NO_GROUP)
> @@ -13316,10 +13494,11 @@ static void sync_child_event(struct perf
> }
>
> static void
> -perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx)
> +perf_event_exit_event(struct perf_event *event,
> + struct perf_event_context *ctx, bool revoke)
> {
> struct perf_event *parent_event = event->parent;
> - unsigned long detach_flags = 0;
> + unsigned long detach_flags = DETACH_EXIT;
>
> if (parent_event) {
> /*
> @@ -13334,16 +13513,14 @@ perf_event_exit_event(struct perf_event
> * Do destroy all inherited groups, we don't care about those
> * and being thorough is better.
> */
> - detach_flags = DETACH_GROUP | DETACH_CHILD;
> + detach_flags |= DETACH_GROUP | DETACH_CHILD;
> mutex_lock(&parent_event->child_mutex);
> }
>
> - perf_remove_from_context(event, detach_flags);
> + if (revoke)
> + detach_flags |= DETACH_GROUP | DETACH_REVOKE;
>
> - raw_spin_lock_irq(&ctx->lock);
> - if (event->state > PERF_EVENT_STATE_EXIT)
> - perf_event_set_state(event, PERF_EVENT_STATE_EXIT);
> - raw_spin_unlock_irq(&ctx->lock);
> + perf_remove_from_context(event, detach_flags);
>
> /*
> * Child events can be freed.
> @@ -13419,7 +13596,7 @@ static void perf_event_exit_task_context
> perf_event_task(child, child_ctx, 0);
>
> list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
> - perf_event_exit_event(child_event, child_ctx);
> + perf_event_exit_event(child_event, child_ctx, false);
>
> mutex_unlock(&child_ctx->mutex);
>
> @@ -13609,6 +13786,9 @@ inherit_event(struct perf_event *parent_
> if (parent_event->parent)
> parent_event = parent_event->parent;
>
> + if (parent_event->state <= PERF_EVENT_STATE_REVOKED)
> + return NULL;
> +
> child_event = perf_event_alloc(&parent_event->attr,
> parent_event->cpu,
> child,
>
>
>
On Tue, Nov 05, 2024 at 10:08:54AM -0500, Liang, Kan wrote:
> > @@ -2427,6 +2429,7 @@ __perf_remove_from_context(struct perf_e
> > void *info)
> > {
> > struct perf_event_pmu_context *pmu_ctx = event->pmu_ctx;
> > + enum perf_event_state state = PERF_EVENT_STATE_OFF;
>
> Setting PERF_EVENT_STATE_OFF as the default seems dangerous.
> If the event was in an error state, its state will later be overwritten
> with PERF_EVENT_STATE_OFF.
>
> One example may be perf_pmu_migrate_context(): after the migration, it
> looks like any error state will be cleared.
>
> Thanks,
> Kan
>
> > unsigned long flags = (unsigned long)info;
> >
> > ctx_time_update(cpuctx, ctx);
> > @@ -2435,16 +2438,22 @@ __perf_remove_from_context(struct perf_e
> > * Ensure event_sched_out() switches to OFF, at the very least
> > * this avoids raising perf_pending_task() at this time.
> > */
> > - if (flags & DETACH_DEAD)
> > + if (flags & DETACH_EXIT)
> > + state = PERF_EVENT_STATE_EXIT;
> > + if (flags & DETACH_REVOKE)
> > + state = PERF_EVENT_STATE_REVOKED;
> > + if (flags & DETACH_DEAD) {
> > event->pending_disable = 1;
> > + state = PERF_EVENT_STATE_DEAD;
> > + }
> > event_sched_out(event, ctx);
> > if (flags & DETACH_GROUP)
> > perf_group_detach(event);
> > if (flags & DETACH_CHILD)
> > perf_child_detach(event);
> > list_del_event(event, ctx);
> > - if (flags & DETACH_DEAD)
> > - event->state = PERF_EVENT_STATE_DEAD;
> > +
> > + event->state = state;
How about we make this:
event->state = min(event->state, state);
?
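E.g., a quick userspace sketch of the intent (not kernel code; the values are
taken from the enum in the patch, where the more terminal a state the more
negative its value, so min() never resurrects an event but does keep ERROR
across a plain removal):

#include <assert.h>

enum {
	STATE_DEAD	= -5,
	STATE_REVOKED	= -4,
	STATE_EXIT	= -3,
	STATE_ERROR	= -2,
	STATE_OFF	= -1,
};

#define min(a, b)	((a) < (b) ? (a) : (b))

int main(void)
{
	/* plain removal: ERROR (-2) beats the OFF (-1) default, so it is kept */
	assert(min(STATE_ERROR, STATE_OFF) == STATE_ERROR);

	/* pmu unregister: REVOKED (-4) still overrides ERROR (-2) */
	assert(min(STATE_ERROR, STATE_REVOKED) == STATE_REVOKED);

	/* event release: DEAD (-5) overrides everything */
	assert(min(STATE_EXIT, STATE_DEAD) == STATE_DEAD);
	return 0;
}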
On 2024-11-05 10:16 a.m., Peter Zijlstra wrote:
> On Tue, Nov 05, 2024 at 10:08:54AM -0500, Liang, Kan wrote:
>>> @@ -2427,6 +2429,7 @@ __perf_remove_from_context(struct perf_e
>>> void *info)
>>> {
>>> struct perf_event_pmu_context *pmu_ctx = event->pmu_ctx;
>>> + enum perf_event_state state = PERF_EVENT_STATE_OFF;
>>
>> Setting PERF_EVENT_STATE_OFF as the default seems dangerous.
>> If the event was in an error state, its state will later be overwritten
>> with PERF_EVENT_STATE_OFF.
>>
>> One example may be perf_pmu_migrate_context(): after the migration, it
>> looks like any error state will be cleared.
>>
>> Thanks,
>> Kan
>>
>>> unsigned long flags = (unsigned long)info;
>>>
>>> ctx_time_update(cpuctx, ctx);
>>> @@ -2435,16 +2438,22 @@ __perf_remove_from_context(struct perf_e
>>> * Ensure event_sched_out() switches to OFF, at the very least
>>> * this avoids raising perf_pending_task() at this time.
>>> */
>>> - if (flags & DETACH_DEAD)
>>> + if (flags & DETACH_EXIT)
>>> + state = PERF_EVENT_STATE_EXIT;
>>> + if (flags & DETACH_REVOKE)
>>> + state = PERF_EVENT_STATE_REVOKED;
>>> + if (flags & DETACH_DEAD) {
>>> event->pending_disable = 1;
>>> + state = PERF_EVENT_STATE_DEAD;
>>> + }
>>> event_sched_out(event, ctx);
>>> if (flags & DETACH_GROUP)
>>> perf_group_detach(event);
>>> if (flags & DETACH_CHILD)
>>> perf_child_detach(event);
>>> list_del_event(event, ctx);
>>> - if (flags & DETACH_DEAD)
>>> - event->state = PERF_EVENT_STATE_DEAD;
>>> +
>>> + event->state = state;
>
> How about we make this:
>
> event->state = min(event->state, state);
>
> ?
Yep, it looks good to me.
Thanks,
Kan