Previously it was only safe to call perf_pmu_unregister() if there
were no active events of that pmu around -- which was impossible to
guarantee since it races all sorts against perf_init_event().
Rework the whole thing by:
- keeping track of all events for a given pmu
- 'hiding' the pmu from perf_init_event()
- waiting for the appropriate (s)rcu grace periods such that all
prior references to the PMU will be completed
- detaching all still existing events of that pmu (see first point)
and moving them to a new REVOKED state.
- actually freeing the pmu data.
Where notably the new REVOKED state must inhibit all event actions
from reaching code that wants to use event->pmu.
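For illustration, the user-facing entry points grow guards of this shape
(this particular one is from the _perf_ioctl() hunk below):

	if (event->state <= PERF_EVENT_STATE_REVOKED)
		return -ENODEV;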
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
include/linux/perf_event.h | 13 +-
kernel/events/core.c | 222 ++++++++++++++++++++++++++++++++++++++++-----
2 files changed, 210 insertions(+), 25 deletions(-)
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -318,6 +318,9 @@ struct perf_output_handle;
struct pmu {
struct list_head entry;
+ spinlock_t events_lock;
+ struct list_head events;
+
struct module *module;
struct device *dev;
struct device *parent;
@@ -611,9 +614,10 @@ struct perf_addr_filter_range {
* enum perf_event_state - the states of an event:
*/
enum perf_event_state {
- PERF_EVENT_STATE_DEAD = -4,
- PERF_EVENT_STATE_EXIT = -3,
- PERF_EVENT_STATE_ERROR = -2,
+ PERF_EVENT_STATE_DEAD = -5,
+ PERF_EVENT_STATE_REVOKED = -4, /* pmu gone, must not touch */
+ PERF_EVENT_STATE_EXIT = -3, /* task died, still inherit */
+ PERF_EVENT_STATE_ERROR = -2, /* scheduling error, can enable */
PERF_EVENT_STATE_OFF = -1,
PERF_EVENT_STATE_INACTIVE = 0,
PERF_EVENT_STATE_ACTIVE = 1,
@@ -854,6 +858,7 @@ struct perf_event {
void *security;
#endif
struct list_head sb_list;
+ struct list_head pmu_list;
/*
* Certain events gets forwarded to another pmu internally by over-
@@ -1105,7 +1110,7 @@ extern void perf_aux_output_flag(struct
extern void perf_event_itrace_started(struct perf_event *event);
extern int perf_pmu_register(struct pmu *pmu, const char *name, int type);
-extern void perf_pmu_unregister(struct pmu *pmu);
+extern int perf_pmu_unregister(struct pmu *pmu);
extern void __perf_event_task_sched_in(struct task_struct *prev,
struct task_struct *task);
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2412,7 +2412,9 @@ ctx_time_update_event(struct perf_event_
#define DETACH_GROUP 0x01UL
#define DETACH_CHILD 0x02UL
-#define DETACH_DEAD 0x04UL
+#define DETACH_EXIT 0x04UL
+#define DETACH_REVOKE 0x08UL
+#define DETACH_DEAD 0x10UL
/*
* Cross CPU call to remove a performance event
@@ -2427,6 +2429,7 @@ __perf_remove_from_context(struct perf_e
void *info)
{
struct perf_event_pmu_context *pmu_ctx = event->pmu_ctx;
+ enum perf_event_state state = PERF_EVENT_STATE_OFF;
unsigned long flags = (unsigned long)info;
ctx_time_update(cpuctx, ctx);
@@ -2435,16 +2438,22 @@ __perf_remove_from_context(struct perf_e
* Ensure event_sched_out() switches to OFF, at the very least
* this avoids raising perf_pending_task() at this time.
*/
- if (flags & DETACH_DEAD)
+ if (flags & DETACH_EXIT)
+ state = PERF_EVENT_STATE_EXIT;
+ if (flags & DETACH_REVOKE)
+ state = PERF_EVENT_STATE_REVOKED;
+ if (flags & DETACH_DEAD) {
event->pending_disable = 1;
+ state = PERF_EVENT_STATE_DEAD;
+ }
event_sched_out(event, ctx);
if (flags & DETACH_GROUP)
perf_group_detach(event);
if (flags & DETACH_CHILD)
perf_child_detach(event);
list_del_event(event, ctx);
- if (flags & DETACH_DEAD)
- event->state = PERF_EVENT_STATE_DEAD;
+
+ event->state = state;
if (!pmu_ctx->nr_events) {
pmu_ctx->rotate_necessary = 0;
@@ -4511,7 +4520,8 @@ static void perf_event_enable_on_exec(st
static void perf_remove_from_owner(struct perf_event *event);
static void perf_event_exit_event(struct perf_event *event,
- struct perf_event_context *ctx);
+ struct perf_event_context *ctx,
+ bool revoke);
/*
* Removes all events from the current task that have been marked
@@ -4538,7 +4548,7 @@ static void perf_event_remove_on_exec(st
modified = true;
- perf_event_exit_event(event, ctx);
+ perf_event_exit_event(event, ctx, false);
}
raw_spin_lock_irqsave(&ctx->lock, flags);
@@ -5138,6 +5148,7 @@ static bool is_sb_event(struct perf_even
attr->context_switch || attr->text_poke ||
attr->bpf_event)
return true;
+
return false;
}
@@ -5339,6 +5350,8 @@ static void perf_pending_task_sync(struc
/* vs perf_event_alloc() error */
static void __free_event(struct perf_event *event)
{
+ struct pmu *pmu = event->pmu;
+
if (event->attach_state & PERF_ATTACH_CALLCHAIN)
put_callchain_buffers();
@@ -5365,6 +5378,7 @@ static void __free_event(struct perf_eve
* put_pmu_ctx() needs an event->ctx reference, because of
* epc->ctx.
*/
+ WARN_ON_ONCE(!pmu);
WARN_ON_ONCE(!event->ctx);
WARN_ON_ONCE(event->pmu_ctx->ctx != event->ctx);
put_pmu_ctx(event->pmu_ctx);
@@ -5377,8 +5391,13 @@ static void __free_event(struct perf_eve
if (event->ctx)
put_ctx(event->ctx);
- if (event->pmu)
- module_put(event->pmu->module);
+ if (pmu) {
+ module_put(pmu->module);
+ scoped_guard (spinlock, &pmu->events_lock) {
+ list_del(&event->pmu_list);
+ wake_up_var(pmu);
+ }
+ }
call_rcu(&event->rcu_head, free_event_rcu);
}
@@ -5397,6 +5416,7 @@ static void _free_event(struct perf_even
security_perf_event_free(event);
if (event->rb) {
+ WARN_ON_ONCE(!event->pmu);
/*
* Can happen when we close an event with re-directed output.
*
@@ -5527,7 +5547,11 @@ int perf_event_release_kernel(struct per
* Thus this guarantees that we will in fact observe and kill _ALL_
* child events.
*/
- perf_remove_from_context(event, DETACH_GROUP|DETACH_DEAD);
+ if (event->state > PERF_EVENT_STATE_REVOKED) {
+ perf_remove_from_context(event, DETACH_GROUP|DETACH_DEAD);
+ } else {
+ event->state = PERF_EVENT_STATE_DEAD;
+ }
perf_event_ctx_unlock(event, ctx);
@@ -5838,7 +5862,7 @@ __perf_read(struct perf_event *event, ch
* error state (i.e. because it was pinned but it couldn't be
* scheduled on to the CPU at some point).
*/
- if (event->state == PERF_EVENT_STATE_ERROR)
+ if (event->state <= PERF_EVENT_STATE_ERROR)
return 0;
if (count < event->read_size)
@@ -5877,8 +5901,14 @@ static __poll_t perf_poll(struct file *f
struct perf_buffer *rb;
__poll_t events = EPOLLHUP;
+ if (event->state <= PERF_EVENT_STATE_REVOKED)
+ return EPOLLERR;
+
poll_wait(file, &event->waitq, wait);
+ if (event->state <= PERF_EVENT_STATE_REVOKED)
+ return EPOLLERR;
+
if (is_event_hup(event))
return events;
@@ -6058,6 +6088,9 @@ static long _perf_ioctl(struct perf_even
void (*func)(struct perf_event *);
u32 flags = arg;
+ if (event->state <= PERF_EVENT_STATE_REVOKED)
+ return -ENODEV;
+
switch (cmd) {
case PERF_EVENT_IOC_ENABLE:
func = _perf_event_enable;
@@ -6507,6 +6540,7 @@ static void perf_mmap_close(struct vm_ar
unsigned long size = perf_data_size(rb);
bool detach_rest = false;
+ /* FIXIES vs perf_pmu_unregister() */
if (event->pmu->event_unmapped)
event->pmu->event_unmapped(event, vma->vm_mm);
@@ -6657,6 +6691,16 @@ static int perf_mmap(struct file *file,
mutex_lock(&event->mmap_mutex);
ret = -EINVAL;
+ /*
+ * This relies on __pmu_detach_event() taking mmap_mutex after marking
+ * the event REVOKED. Either we observe the state, or __pmu_detach_event()
+ * will detach the rb created here.
+ */
+ if (event->state <= PERF_EVENT_STATE_REVOKED) {
+ ret = -ENODEV;
+ goto unlock;
+ }
+
if (vma->vm_pgoff == 0) {
nr_pages -= 1;
@@ -6840,6 +6884,9 @@ static int perf_fasync(int fd, struct fi
struct perf_event *event = filp->private_data;
int retval;
+ if (event->state <= PERF_EVENT_STATE_REVOKED)
+ return -ENODEV;
+
inode_lock(inode);
retval = fasync_helper(fd, filp, on, &event->fasync);
inode_unlock(inode);
@@ -11892,6 +11939,9 @@ int perf_pmu_register(struct pmu *_pmu,
if (!pmu->event_idx)
pmu->event_idx = perf_event_idx_default;
+ INIT_LIST_HEAD(&pmu->events);
+ spin_lock_init(&pmu->events_lock);
+
/*
* Now that the PMU is complete, make it visible to perf_try_init_event().
*/
@@ -11905,11 +11955,100 @@ int perf_pmu_register(struct pmu *_pmu,
}
EXPORT_SYMBOL_GPL(perf_pmu_register);
-void perf_pmu_unregister(struct pmu *pmu)
+static void __pmu_detach_event(struct pmu *pmu, struct perf_event *event,
+ struct perf_event_context *ctx)
+{
+ /*
+ * De-schedule the event and mark it REVOKED.
+ */
+ perf_event_exit_event(event, ctx, true);
+
+ /*
+ * All _free_event() bits that rely on event->pmu:
+ *
+ * Notably, perf_mmap() relies on the ordering here.
+ */
+ scoped_guard (mutex, &event->mmap_mutex) {
+ WARN_ON_ONCE(pmu->event_unmapped);
+ ring_buffer_attach(event, NULL);
+ }
+
+ perf_event_free_bpf_prog(event);
+ perf_free_addr_filters(event);
+
+ if (event->destroy) {
+ event->destroy(event);
+ event->destroy = NULL;
+ }
+
+ if (event->pmu_ctx) {
+ put_pmu_ctx(event->pmu_ctx);
+ event->pmu_ctx = NULL;
+ }
+
+ exclusive_event_destroy(event);
+ module_put(pmu->module);
+
+ event->pmu = NULL; /* force fault instead of UAF */
+}
+
+static void pmu_detach_event(struct pmu *pmu, struct perf_event *event)
+{
+ struct perf_event_context *ctx;
+
+ ctx = perf_event_ctx_lock(event);
+ __pmu_detach_event(pmu, event, ctx);
+ perf_event_ctx_unlock(event, ctx);
+
+ scoped_guard (spinlock, &pmu->events_lock)
+ list_del(&event->pmu_list);
+}
+
+static struct perf_event *pmu_get_event(struct pmu *pmu)
+{
+ struct perf_event *event;
+
+ guard(spinlock)(&pmu->events_lock);
+ list_for_each_entry(event, &pmu->events, pmu_list) {
+ if (atomic_long_inc_not_zero(&event->refcount))
+ return event;
+ }
+
+ return NULL;
+}
+
+static bool pmu_empty(struct pmu *pmu)
+{
+ guard(spinlock)(&pmu->events_lock);
+ return list_empty(&pmu->events);
+}
+
+static void pmu_detach_events(struct pmu *pmu)
+{
+ struct perf_event *event;
+
+ for (;;) {
+ event = pmu_get_event(pmu);
+ if (!event)
+ break;
+
+ pmu_detach_event(pmu, event);
+ put_event(event);
+ }
+
+ /*
+ * wait for pending _free_event()s
+ */
+ wait_var_event(pmu, pmu_empty(pmu));
+}
+
+int perf_pmu_unregister(struct pmu *pmu)
{
scoped_guard (mutex, &pmus_lock) {
+ if (!idr_cmpxchg(&pmu_idr, pmu->type, pmu, NULL))
+ return -EINVAL;
+
list_del_rcu(&pmu->entry);
- idr_remove(&pmu_idr, pmu->type);
}
/*
@@ -11919,7 +12058,31 @@ void perf_pmu_unregister(struct pmu *pmu
synchronize_srcu(&pmus_srcu);
synchronize_rcu();
+ if (pmu->event_unmapped && !pmu_empty(pmu)) {
+ /*
+ * Can't force remove events when pmu::event_unmapped()
+ * is used in perf_mmap_close().
+ */
+ guard(mutex)(&pmus_lock);
+ idr_cmpxchg(&pmu_idr, pmu->type, NULL, pmu);
+ list_add_rcu(&pmu->entry, &pmus);
+ return -EBUSY;
+ }
+
+ scoped_guard (mutex, &pmus_lock)
+ idr_remove(&pmu_idr, pmu->type);
+
+ /*
+ * PMU is removed from the pmus list, so no new events will
+ * be created, now take care of the existing ones.
+ */
+ pmu_detach_events(pmu);
+
+ /*
+ * PMU is unused, make it go away.
+ */
perf_pmu_free(pmu);
+ return 0;
}
EXPORT_SYMBOL_GPL(perf_pmu_unregister);
@@ -12226,6 +12389,7 @@ perf_event_alloc(struct perf_event_attr
INIT_LIST_HEAD(&event->active_entry);
INIT_LIST_HEAD(&event->addr_filters.list);
INIT_HLIST_NODE(&event->hlist_entry);
+ INIT_LIST_HEAD(&event->pmu_list);
init_waitqueue_head(&event->waitq);
@@ -12294,6 +12458,13 @@ perf_event_alloc(struct perf_event_attr
perf_event__state_init(event);
+ /*
+ * Hold SRCU critical section around perf_init_event(), until returning
+ * the fully formed event put on pmu->events_list. This ensures that
+ * perf_pmu_unregister() will see any in-progress event creation that
+ * races.
+ */
+ guard(srcu)(&pmus_srcu);
pmu = NULL;
hwc = &event->hw;
@@ -12383,6 +12554,9 @@ perf_event_alloc(struct perf_event_attr
/* symmetric to unaccount_event() in _free_event() */
account_event(event);
+ scoped_guard (spinlock, &pmu->events_lock)
+ list_add(&event->pmu_list, &pmu->events);
+
return_ptr(event);
}
@@ -12769,6 +12943,10 @@ SYSCALL_DEFINE5(perf_event_open,
if (err)
goto err_fd;
group_leader = fd_file(group)->private_data;
+ if (group_leader->state <= PERF_EVENT_STATE_REVOKED) {
+ err = -ENODEV;
+ goto err_group_fd;
+ }
if (flags & PERF_FLAG_FD_OUTPUT)
output_event = group_leader;
if (flags & PERF_FLAG_FD_NO_GROUP)
@@ -13316,10 +13494,11 @@ static void sync_child_event(struct perf
}
static void
-perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx)
+perf_event_exit_event(struct perf_event *event,
+ struct perf_event_context *ctx, bool revoke)
{
struct perf_event *parent_event = event->parent;
- unsigned long detach_flags = 0;
+ unsigned long detach_flags = DETACH_EXIT;
if (parent_event) {
/*
@@ -13334,16 +13513,14 @@ perf_event_exit_event(struct perf_event
* Do destroy all inherited groups, we don't care about those
* and being thorough is better.
*/
- detach_flags = DETACH_GROUP | DETACH_CHILD;
+ detach_flags |= DETACH_GROUP | DETACH_CHILD;
mutex_lock(&parent_event->child_mutex);
}
- perf_remove_from_context(event, detach_flags);
+ if (revoke)
+ detach_flags |= DETACH_GROUP | DETACH_REVOKE;
- raw_spin_lock_irq(&ctx->lock);
- if (event->state > PERF_EVENT_STATE_EXIT)
- perf_event_set_state(event, PERF_EVENT_STATE_EXIT);
- raw_spin_unlock_irq(&ctx->lock);
+ perf_remove_from_context(event, detach_flags);
/*
* Child events can be freed.
@@ -13419,7 +13596,7 @@ static void perf_event_exit_task_context
perf_event_task(child, child_ctx, 0);
list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
- perf_event_exit_event(child_event, child_ctx);
+ perf_event_exit_event(child_event, child_ctx, false);
mutex_unlock(&child_ctx->mutex);
@@ -13609,6 +13786,9 @@ inherit_event(struct perf_event *parent_
if (parent_event->parent)
parent_event = parent_event->parent;
+ if (parent_event->state <= PERF_EVENT_STATE_REVOKED)
+ return NULL;
+
child_event = perf_event_alloc(&parent_event->attr,
parent_event->cpu,
child,
Hi Peter,

> @@ -12294,6 +12458,13 @@ perf_event_alloc
>
> 	perf_event__state_init(event);
>
> +	/*
> +	 * Hold SRCU critical section around perf_init_event(), until returning
> +	 * the fully formed event put on pmu->events_list. This ensures that
> +	 * perf_pmu_unregister() will see any in-progress event creation that
> +	 * races.
> +	 */
> +	guard(srcu)(&pmus_srcu);

Minor nit. This can go down a bit, just right before perf_init_event() ?

Thanks,
Ravi
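For reference, the placement being suggested would look roughly like this
(a sketch against the hunk above, not a tested change; intervening code
elided):

	perf_event__state_init(event);

	/* ... initialization that does not need the pmu ... */

	/*
	 * Enter the SRCU read section only here, right before the pmu
	 * lookup; being a guard, it still covers perf_init_event() and the
	 * later list_add() onto pmu->events.
	 */
	guard(srcu)(&pmus_srcu);
	pmu = perf_init_event(event);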
Hi Peter,
> @@ -6507,6 +6540,7 @@ static void perf_mmap_close(struct vm_ar
> unsigned long size = perf_data_size(rb);
> bool detach_rest = false;
>
> + /* FIXIES vs perf_pmu_unregister() */
> if (event->pmu->event_unmapped)
> event->pmu->event_unmapped(event, vma->vm_mm);
I assume you are already aware of the race between __pmu_detach_event()
and perf_mmap_close() since you have put that comment. In any case, below
sequence of operations triggers a splat when perf_mmap_close() tries to
access event->rb, event->pmu etc. which were already freed by
__pmu_detach_event().
Sequence:
  Kernel                          Userspace
  ------                          ---------
  perf_pmu_register()
                                  fd = perf_event_open()
                                  p = mmap(fd)
  perf_pmu_unregister()
                                  munmap(p)
                                  close(fd)
Splat:
BUG: kernel NULL pointer dereference, address: 0000000000000088
#PF: supervisor read access in kernel mode
#PF: error_code(0x0000) - not-present page
PGD 105f90067 P4D 105f90067 PUD 11a94a067 PMD 0
Oops: Oops: 0000 [#1] PREEMPT SMP NOPTI
CPU: 49 UID: 0 PID: 3456 Comm: perf-event-mmap Tainted: G OE 6.12.0-vanilla-dirty #273
Tainted: [O]=OOT_MODULE, [E]=UNSIGNED_MODULE
Hardware name: Dell Inc. PowerEdge R6515/07PXPY, BIOS 2.7.3 03/31/2022
RIP: 0010:perf_mmap_close+0x69/0x316
Code: [...]
RSP: 0018:ffffc90003773970 EFLAGS: 00010246
RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000
RDX: ffff888125c2d400 RSI: ffff88bf7c8a1900 RDI: ffff888125c2d400
RBP: ffff888103ccaf40 R08: 0000000000000000 R09: 0000000000000000
R10: 3030303030303030 R11: 3030303030303030 R12: ffff88811f58d080
R13: ffffc90003773a70 R14: ffffc90003773a28 R15: 00007fcc1df1d000
FS: 00007fcc1e72e6c0(0000) GS:ffff88bf7c880000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000088 CR3: 000000010396c003 CR4: 0000000000f70ef0
PKRU: 55555554
Call Trace:
<TASK>
? __die_body.cold+0x19/0x27
? page_fault_oops+0x15a/0x2f0
? exc_page_fault+0x7e/0x180
? asm_exc_page_fault+0x26/0x30
? perf_mmap_close+0x69/0x316
remove_vma+0x2f/0x70
vms_complete_munmap_vmas+0xdc/0x190
do_vmi_align_munmap+0x1d7/0x250
do_vmi_munmap+0xd0/0x180
__vm_munmap+0xa2/0x170
? hrtimer_start_range_ns+0x26f/0x3b0
__x64_sys_munmap+0x1b/0x30
do_syscall_64+0x82/0x160
? srso_alias_return_thunk+0x5/0xfbef5
? tty_insert_flip_string_and_push_buffer+0x8d/0xc0
? srso_alias_return_thunk+0x5/0xfbef5
? remove_wait_queue+0x24/0x60
? srso_alias_return_thunk+0x5/0xfbef5
? n_tty_write+0x36f/0x520
? srso_alias_return_thunk+0x5/0xfbef5
? __wake_up+0x44/0x60
? srso_alias_return_thunk+0x5/0xfbef5
? file_tty_write.isra.0+0x20c/0x2c0
? srso_alias_return_thunk+0x5/0xfbef5
? vfs_write+0x290/0x450
? srso_alias_return_thunk+0x5/0xfbef5
? srso_alias_return_thunk+0x5/0xfbef5
? syscall_exit_to_user_mode+0x10/0x210
? srso_alias_return_thunk+0x5/0xfbef5
? do_syscall_64+0x8e/0x160
? __rseq_handle_notify_resume+0xa6/0x4e0
? __pfx_hrtimer_wakeup+0x10/0x10
? srso_alias_return_thunk+0x5/0xfbef5
? syscall_exit_to_user_mode+0x1d5/0x210
? srso_alias_return_thunk+0x5/0xfbef5
? do_syscall_64+0x8e/0x160
? exc_page_fault+0x7e/0x180
entry_SYSCALL_64_after_hwframe+0x76/0x7e
RIP: 0033:0x7fcc1e849eab
Code: [...]
RSP: 002b:00007fcc1e72de08 EFLAGS: 00000202 ORIG_RAX: 000000000000000b
RAX: ffffffffffffffda RBX: 00007fcc1e72ecdc RCX: 00007fcc1e849eab
RDX: 0000000000011000 RSI: 0000000000011000 RDI: 00007fcc1df1d000
RBP: 00007fcc1e72dec0 R08: 0000000000000000 R09: 00000000ffffffff
R10: 0000000000000000 R11: 0000000000000202 R12: 00007fcc1e72e6c0
R13: ffffffffffffff88 R14: 0000000000000000 R15: 00007ffd1a4ce000
</TASK>
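For completeness, the userspace side of that sequence is just the usual
mmap dance; a minimal sketch (error handling omitted; repro() and 'type'
are illustrative, with 'type' being the test pmu's dynamic type as read
from sysfs):

	#include <linux/perf_event.h>
	#include <sys/syscall.h>
	#include <sys/mman.h>
	#include <string.h>
	#include <unistd.h>

	static int repro(int type)
	{
		struct perf_event_attr attr;
		size_t len = (1 + 8) * 4096;	/* user page + 8 data pages */
		void *p;
		int fd;

		memset(&attr, 0, sizeof(attr));
		attr.size = sizeof(attr);
		attr.type = type;		/* the pmu about to be unregistered */

		fd = syscall(__NR_perf_event_open, &attr, -1, 0, -1, 0);
		p  = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

		/* ... perf_pmu_unregister() runs here ... */

		munmap(p, len);		/* perf_mmap_close() trips over the freed bits */
		close(fd);
		return 0;
	}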
Oh sorry, I seem to have missed this email :/

On Mon, Nov 25, 2024 at 09:40:28AM +0530, Ravi Bangoria wrote:
> Hi Peter,
>
> > @@ -6507,6 +6540,7 @@ static void perf_mmap_close(struct vm_ar
> > 		unsigned long size = perf_data_size(rb);
> > 		bool detach_rest = false;
> >
> > +	/* FIXIES vs perf_pmu_unregister() */
> > 	if (event->pmu->event_unmapped)
> > 		event->pmu->event_unmapped(event, vma->vm_mm);
>
> I assume you are already aware of the race between __pmu_detach_event()
> and perf_mmap_close() since you have put that comment.

That comment was mostly about how we can't fix up the whole
->event_unmapped() thing and have to abort pmu_unregister for it.

> In any case, below sequence of operations triggers a splat when
> perf_mmap_close() tries to access event->rb, event->pmu etc. which
> were already freed by __pmu_detach_event().
>
> Sequence:
>
>   Kernel                          Userspace
>   ------                          ---------
>   perf_pmu_register()
>                                   fd = perf_event_open()
>                                   p = mmap(fd)
>   perf_pmu_unregister()
>                                   munmap(p)
>                                   close(fd)

Right, let me go have a look. Thanks!
On Tue, Dec 17, 2024 at 10:12:16AM +0100, Peter Zijlstra wrote:
>
>
> Oh sorry, I seem to have missed this email :/
>
> On Mon, Nov 25, 2024 at 09:40:28AM +0530, Ravi Bangoria wrote:
> > Hi Peter,
> >
> > > @@ -6507,6 +6540,7 @@ static void perf_mmap_close(struct vm_ar
> > > unsigned long size = perf_data_size(rb);
> > > bool detach_rest = false;
> > >
> > > + /* FIXIES vs perf_pmu_unregister() */
> > > if (event->pmu->event_unmapped)
> > > event->pmu->event_unmapped(event, vma->vm_mm);
> >
> > I assume you are already aware of the race between __pmu_detach_event()
> > and perf_mmap_close() since you have put that comment.
>
> That comment was mostly about how we can't fix up the whole
> ->event_unmapped() thing and have to abort pmu_unregister for it.
>
> > In any case, below sequence of operations triggers a splat when
> > perf_mmap_close() tries to access event->rb, event->pmu etc. which
> > were already freed by __pmu_detach_event().
> >
> > Sequence:
> >
> > Kernel Userspace
> > ------ ---------
> > perf_pmu_register()
> > fd = perf_event_open()
> > p = mmap(fd)
> > perf_pmu_unregister()
> > munmap(p)
> > close(fd)
>
> Right, let me go have a look. Thanks!
Bah, that's a right mess indeed, however did I miss all that.
The easiest solution is probably to leave the RB around on detach, but
now I need to remember why I did that in the first place :/
Oh.. I think I mostly did that to serialize against perf_mmap(), which
should reject creating further maps. But we can do that without actually
detaching the RB -- we only need to acquire and release mmap_mutex.
Ah, there's that perf_event_stop() inside of ring_buffer_attach(), that
must not happen after detach, obviously. So that must be dealt with.
Hmm, also if we leave ->rb around, then we need to deal with
perf_event_set_output(), someone could try and redirect their things
into our buffer -- which isn't technically broken, but still weird.
Something like the below.
How did you test; perf-fuzzer or something?
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1742,7 +1742,7 @@ static inline bool needs_branch_stack(st
static inline bool has_aux(struct perf_event *event)
{
- return event->pmu->setup_aux;
+ return event->pmu && event->pmu->setup_aux;
}
static inline bool has_aux_action(struct perf_event *event)
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5409,7 +5409,6 @@ static void _free_event(struct perf_even
security_perf_event_free(event);
if (event->rb) {
- WARN_ON_ONCE(!event->pmu);
/*
* Can happen when we close an event with re-directed output.
*
@@ -12023,7 +12022,10 @@ static void __pmu_detach_event(struct pm
*/
scoped_guard (mutex, &event->mmap_mutex) {
WARN_ON_ONCE(pmu->event_unmapped);
- ring_buffer_attach(event, NULL);
+ /*
+ * Mostly an empy lock sequence, such that perf_mmap(), which
+ * relies on mmap_mutex, is sure to observe the state change.
+ */
}
perf_event_free_bpf_prog(event);
@@ -12823,6 +12825,9 @@ perf_event_set_output(struct perf_event
goto unlock;
if (output_event) {
+ if (output_event->state <= PERF_EVENT_STATE_REVOKED)
+ goto unlock;
+
/* get the rb we want to redirect to */
rb = ring_buffer_get(output_event);
if (!rb)
Hi Peter,
>>> In any case, below sequence of operations triggers a splat when
>>> perf_mmap_close() tries to access event->rb, event->pmu etc. which
>>> were already freed by __pmu_detach_event().
>>>
>>> Sequence:
>>>
>>> Kernel Userspace
>>> ------ ---------
>>> perf_pmu_register()
>>> fd = perf_event_open()
>>> p = mmap(fd)
>>> perf_pmu_unregister()
>>> munmap(p)
>>> close(fd)
>>
>> Right, let me go have a look. Thanks!
>
> Bah, that's a right mess indeed, however did I miss all that.
>
> The easiest solution is probably to leave the RB around on detach, but
> now I need to remember why I did that in the first place :/
>
> Oh.. I think I mostly that to serialize against perf_mmap(), which
> should reject creating further maps. But we can do that without actually
> detaching the RB -- we only need to acquire and release mmap_mutex.
>
> Ah, there's that perf_event_stop() inside of ring_buffer_attach(), that
> must not happen after detach, obviously. So that must be dealt with.
>
> Hmm, also if we leave ->rb around, then we need to deal with
> perf_event_set_output(), someone could try and redirect their things
> into our buffer -- which isn't technically broken, but still weird.
>
> Something like the below.
>
> How did you test; perf-fuzzer or something?
Prepared a simple test that does pmu register(), unregister() and
"perf record" in parallel. It's quite dirty, I'll clean it up and
share it here.
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -1742,7 +1742,7 @@ static inline bool needs_branch_stack(st
>
> static inline bool has_aux(struct perf_event *event)
> {
> - return event->pmu->setup_aux;
> + return event->pmu && event->pmu->setup_aux;
> }
>
> static inline bool has_aux_action(struct perf_event *event)
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -5409,7 +5409,6 @@ static void _free_event(struct perf_even
> security_perf_event_free(event);
>
> if (event->rb) {
> - WARN_ON_ONCE(!event->pmu);
> /*
> * Can happen when we close an event with re-directed output.
> *
> @@ -12023,7 +12022,10 @@ static void __pmu_detach_event(struct pm
> */
> scoped_guard (mutex, &event->mmap_mutex) {
> WARN_ON_ONCE(pmu->event_unmapped);
> - ring_buffer_attach(event, NULL);
> + /*
> + * Mostly an empy lock sequence, such that perf_mmap(), which
> + * relies on mmap_mutex, is sure to observe the state change.
> + */
> }
>
> perf_event_free_bpf_prog(event);
> @@ -12823,6 +12825,9 @@ perf_event_set_output(struct perf_event
> goto unlock;
>
> if (output_event) {
> + if (output_event->state <= PERF_EVENT_STATE_REVOKED)
> + goto unlock;
> +
> /* get the rb we want to redirect to */
> rb = ring_buffer_get(output_event);
> if (!rb)
I needed this additional diff on top of your change. With this, it survives
my test. perf_mmap_close() change seems correct. Not sure about perf_mmap().
I'll inspect the code further.
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6540,7 +6540,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)
bool detach_rest = false;
/* FIXIES vs perf_pmu_unregister() */
- if (event->pmu->event_unmapped)
+ if (event->pmu && event->pmu->event_unmapped)
event->pmu->event_unmapped(event, vma->vm_mm);
/*
@@ -6873,7 +6873,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP);
vma->vm_ops = &perf_mmap_vmops;
- if (!ret && event->pmu->event_mapped)
+ if (!ret && event->pmu && event->pmu->event_mapped)
event->pmu->event_mapped(event, vma->vm_mm);
return ret;
Thanks,
Ravi
Hi Peter,
Sorry for the delay. Was on vacation.
>>>> In any case, below sequence of operations triggers a splat when
>>>> perf_mmap_close() tries to access event->rb, event->pmu etc. which
>>>> were already freed by __pmu_detach_event().
>>>>
>>>> Sequence:
>>>>
>>>> Kernel Userspace
>>>> ------ ---------
>>>> perf_pmu_register()
>>>> fd = perf_event_open()
>>>> p = mmap(fd)
>>>> perf_pmu_unregister()
>>>> munmap(p)
>>>> close(fd)
>>>
>>> Right, let me go have a look. Thanks!
>>
>> Bah, that's a right mess indeed, however did I miss all that.
>>
>> The easiest solution is probably to leave the RB around on detach, but
>> now I need to remember why I did that in the first place :/
>>
>> Oh.. I think I mostly that to serialize against perf_mmap(), which
>> should reject creating further maps. But we can do that without actually
>> detaching the RB -- we only need to acquire and release mmap_mutex.
>>
>> Ah, there's that perf_event_stop() inside of ring_buffer_attach(), that
>> must not happen after detach, obviously. So that must be dealt with.
>>
>> Hmm, also if we leave ->rb around, then we need to deal with
>> perf_event_set_output(), someone could try and redirect their things
>> into our buffer -- which isn't technically broken, but still weird.
>>
>> Something like the below.
>>
>> How did you test; perf-fuzzer or something?
>
> Prepared a simple test that does pmu register(), unregister() and
> "perf record" in parallel. It's quite dirty, I'll clean it up and
> share it here.
>
>> --- a/include/linux/perf_event.h
>> +++ b/include/linux/perf_event.h
>> @@ -1742,7 +1742,7 @@ static inline bool needs_branch_stack(st
>>
>> static inline bool has_aux(struct perf_event *event)
>> {
>> - return event->pmu->setup_aux;
>> + return event->pmu && event->pmu->setup_aux;
>> }
>>
>> static inline bool has_aux_action(struct perf_event *event)
>> --- a/kernel/events/core.c
>> +++ b/kernel/events/core.c
>> @@ -5409,7 +5409,6 @@ static void _free_event(struct perf_even
>> security_perf_event_free(event);
>>
>> if (event->rb) {
>> - WARN_ON_ONCE(!event->pmu);
>> /*
>> * Can happen when we close an event with re-directed output.
>> *
>> @@ -12023,7 +12022,10 @@ static void __pmu_detach_event(struct pm
>> */
>> scoped_guard (mutex, &event->mmap_mutex) {
>> WARN_ON_ONCE(pmu->event_unmapped);
>> - ring_buffer_attach(event, NULL);
>> + /*
>> + * Mostly an empy lock sequence, such that perf_mmap(), which
>> + * relies on mmap_mutex, is sure to observe the state change.
>> + */
>> }
>>
>> perf_event_free_bpf_prog(event);
>> @@ -12823,6 +12825,9 @@ perf_event_set_output(struct perf_event
>> goto unlock;
>>
>> if (output_event) {
>> + if (output_event->state <= PERF_EVENT_STATE_REVOKED)
>> + goto unlock;
>> +
>> /* get the rb we want to redirect to */
>> rb = ring_buffer_get(output_event);
>> if (!rb)
>
> I needed this additional diff on top of your change. With this, it survives
> my test. perf_mmap_close() change seems correct. Not sure about perf_mmap().
> I'll inspect the code further.
>
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -6540,7 +6540,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)
> bool detach_rest = false;
>
> /* FIXIES vs perf_pmu_unregister() */
> - if (event->pmu->event_unmapped)
> + if (event->pmu && event->pmu->event_unmapped)
> event->pmu->event_unmapped(event, vma->vm_mm);
>
> /*
> @@ -6873,7 +6873,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
> vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP);
> vma->vm_ops = &perf_mmap_vmops;
>
> - if (!ret && event->pmu->event_mapped)
> + if (!ret && event->pmu && event->pmu->event_mapped)
> event->pmu->event_mapped(event, vma->vm_mm);
>
> return ret;
Both of these are incorrect. They just reduce the race window; they don't
actually solve the race. Anyway, I could spot a few other races:
1) A race between event creation and perf_pmu_unregister(). Any event
creation code path (perf_event_open(), perf_event_create_kernel_counter()
and inherit_event()) allocates the event with perf_event_alloc(), which
adds it to the pmu->events list. However, the event is still immature;
for example, event->ctx is still NULL. In the meantime, perf_pmu_unregister()
finds this event and tries to detach it.
  perf_event_open()                               perf_pmu_unregister()
    event = perf_event_alloc()                      pmu_detach_event(event)
      list_add(&event->pmu_list, &pmu->events);       perf_event_ctx_lock(event)
    /*                                                  perf_event_ctx_lock_nested(ctx)
     * event->ctx is NULL.                                ctx = READ_ONCE(event->ctx); /* event->ctx is NULL */
     */                                                   if (!refcount_inc_not_zero(&ctx->refcount)) { /* Crash */
    perf_install_in_context(ctx, event);
2) A race with perf_event_release_kernel(). perf_event_release_kernel()
prepares a separate "free_list" of all child events under ctx->mutex
and event->child_mutex. However, the "free_list" uses the same
"event->child_list" for entries. OTOH, perf_pmu_unregister() ultimately
calls __perf_remove_from_context() with DETACH_CHILD, which checks if
the event being removed is a child event, and if so, it will try to
detach the child from its parent using list_del_init(&event->child_list);
i.e. two code paths doing list_del on the same list entry.
  perf_event_release_kernel()                          perf_pmu_unregister()
    /* Move children events to free_list */              ...
    list_for_each_entry_safe(child, tmp, &free_list, child_list) {
                                                          perf_remove_from_context() /* with DETACH_CHILD */
      ...                                                   __perf_remove_from_context()
      list_del(&child->child_list);                           perf_child_detach()
                                                                list_del_init(&event->child_list);
3) A WARN(), not a race. perf_pmu_unregister() increments event->refcount
before detaching the event. If perf_pmu_unregister() picks up a child
event, perf_event_exit_event() called through perf_pmu_unregister()
will try to free it. Since event->refcount would be 2, free_event()
will trigger a WARN().
  perf_pmu_unregister()
    event = pmu_get_event()		/* event->refcount => 2 */
    ...
    perf_event_exit_event()
      if (parent_event) {		/* true, because `event` is a child */
        free_event(event);
          if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1,
                   "unexpected event refcount: %ld; ptr=%p\n",
                   atomic_long_read(&event->refcount), event))
4) A race with perf_event_set_bpf_prog(). perf_event_set_bpf_prog() might
be in the process of setting event->prog, whereas perf_pmu_unregister(),
which internally calls perf_event_free_bpf_prog(), will clear the
event->prog pointer.
  perf_pmu_unregister()                 perf_event_set_bpf_prog()
    ...                                   perf_event_set_bpf_handler()
    perf_event_free_bpf_prog()              event->prog = prog;
      event->prog = NULL;
I've yet to inspect other code paths, so there might be more races.
Thinking out loud, a plausible brute-force solution is to introduce an
"event specific lock" and acquire it right at the beginning of all code
paths and release it at the end. Such an event->lock shouldn't create any
contention, since an event would mostly be going through only one code
path at any point in time.
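Something like the following, say (sketch only; 'unreg_lock' is a
hypothetical new perf_event member, and the body stands for whatever the
entry point currently does):

	static long _perf_ioctl(struct perf_event *event, unsigned int cmd,
				unsigned long arg)
	{
		/*
		 * Hypothetical per-event lock, held by every entry point
		 * that may dereference event->pmu, and by
		 * __pmu_detach_event() while it revokes the event.
		 */
		guard(mutex)(&event->unreg_lock);

		if (event->state <= PERF_EVENT_STATE_REVOKED)
			return -ENODEV;

		/* ... existing ioctl body ... */
		return 0;
	}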
Thanks,
Ravi
On Fri, Jan 03, 2025 at 09:54:09AM +0530, Ravi Bangoria wrote:
> Hi Peter,
>
> Sorry for the delay. Was on vacation.
Yeah, me too :-)
> Both of these are incorrect. They just reduce the race window, doesn't
> actually solve the race. Anyway, I could spot few other races:
>
> 1) A race between event creation and perf_pmu_unregister(). Any event
> create code path (perf_event_open(), perf_event_create_kernel_counter()
> and inherit_event()) allocates event with perf_event_alloc() which adds
> an event to the pmu->events list. However, the event is still immature,
> for ex, event->ctx is still NULL. In the mean time, perf_pmu_unregister()
> finds this event and tries to detach it.
>
> perf_event_open() perf_pmu_unregister()
> event = perf_event_alloc() pmu_detach_event(event)
> list_add(&event->pmu_list, &pmu->events); perf_event_ctx_lock(event)
> /* perf_event_ctx_lock_nested(ctx)
> * event->ctx is NULL. ctx = READ_ONCE(event->ctx); /* event->ctx is NULL */
> */ if (!refcount_inc_not_zero(&ctx->refcount)) { /* Crash */
> perf_install_in_context(ctx, event);
Ah, that puts the lie to the guard(srcu) comment there, doesn't it :/
So the intent was for that SRCU section to cover the creation, so that
perf_pmu_unregister() can take out the pmu to avoid creating more
events, then srcu-sync to wait on all in-progress creation and then go
detach everything.
I suppose the simplest thing here is to grow that SRCU section.
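Roughly this shape, I suppose (sketch of the idea only, not a patch; the
perf_event_open() internals are heavily abbreviated):

	scoped_guard (srcu, &pmus_srcu) {
		event = perf_event_alloc(...);	/* puts the event on pmu->events */
		/* ... find_get_context(), set event->ctx ... */
		perf_install_in_context(ctx, event, event->cpu);
	}

That way, once perf_pmu_unregister()'s synchronize_srcu() returns,
anything still on pmu->events is fully formed.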
> 2) A race with perf_event_release_kernel(). perf_event_release_kernel()
> prepares a separate "free_list" of all children events under ctx->mutex
> and event->child_mutex. However, the "free_list" uses the same
> "event->child_list" for entries. OTOH, perf_pmu_unregister() ultimately
> calls __perf_remove_from_context() with DETACH_CHILD, which checks if
> the event being removed is a child event, and if so, it will try to
> detach the child from parent using list_del_init(&event->child_list);
> i.e. two code path doing list_del on the same list entry.
>
> perf_event_release_kernel() perf_pmu_unregister()
> /* Move children events to free_list */ ...
> list_for_each_entry_safe(child, tmp, &free_list, child_list) { perf_remove_from_context() /* with DETACH_CHILD */
> ... __perf_remove_from_context()
> list_del(&child->child_list); perf_child_detach()
> list_del_init(&event->child_list);
Bah, I had figured it was taken care of, because perf_event_exit_event()
has a similar race. I'll try and figure out what to do there.
> 3) A WARN(), not a race. perf_pmu_unregister() increments event->refcount
> before detaching the event. If perf_pmu_unregister() picks up a child
> event, perf_event_exit_event() called through perf_pmu_unregister()
> will try to free it. Since event->refcount would be 2, free_event()
> will trigger a WARN().
>
> perf_pmu_unregister()
> event = pmu_get_event() /* event->refcount => 2 */
> ...
> perf_event_exit_event()
> if (parent_event) { /* true, because `event` is a child */
> free_event(event);
> if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1,
> "unexpected event refcount: %ld; ptr=%p\n",
> atomic_long_read(&event->refcount), event))
I'll make that something like:
if (revoke)
put_event(event);
else
free_event(event);
or so.
> 4) A race with perf_event_set_bpf_prog(). perf_event_set_bpf_prog() might
> be in process of setting event->prog, where as perf_pmu_unregister(),
> which internally calls perf_event_free_bpf_prog(), will clear the
> event->prog pointer.
>
> perf_pmu_unregister() perf_event_set_bpf_prog()
> ... perf_event_set_bpf_handler()
> perf_event_free_bpf_prog() event->prog = prog;
> event->prog = NULL;
>
> I've yet to inspect other code paths, so there might be more races.
Weird, that should be serialized by perf_event_ctx_lock(), both
__pmu_detach_event() and _perf_ioctl() are called under that.
Thanks for going over this!
On Fri, Jan 17, 2025 at 01:03:16AM +0100, Peter Zijlstra wrote:
> > 2) A race with perf_event_release_kernel(). perf_event_release_kernel()
> > prepares a separate "free_list" of all children events under ctx->mutex
> > and event->child_mutex. However, the "free_list" uses the same
> > "event->child_list" for entries. OTOH, perf_pmu_unregister() ultimately
> > calls __perf_remove_from_context() with DETACH_CHILD, which checks if
> > the event being removed is a child event, and if so, it will try to
> > detach the child from parent using list_del_init(&event->child_list);
> > i.e. two code path doing list_del on the same list entry.
> >
> > perf_event_release_kernel() perf_pmu_unregister()
> > /* Move children events to free_list */ ...
> > list_for_each_entry_safe(child, tmp, &free_list, child_list) { perf_remove_from_context() /* with DETACH_CHILD */
> > ... __perf_remove_from_context()
> > list_del(&child->child_list); perf_child_detach()
> > list_del_init(&event->child_list);
>
> Bah, I had figured it was taken care of, because perf_event_exit_event()
> has a similar race. I'll try and figure out what to do there.
So, the problem appears to be that perf_event_release_kernel() does not
use DETACH_CHILD, doing so will clear PERF_ATTACH_CHILD, at which point
the above is fully serialized by parent->child_mutex.
Then the next problem is that since pmu_detach_events() can hold an
extra ref on things, the free_event() from free_list will WARN, like
before.
Easily fixed by making that put_event(), except that messes up the whole
wait_var_event() scheme -- since __free_event() does the final
put_ctx().
This in turn can be fixed by pushing that wake_up_var() nonsense into
put_ctx() itself.
Which then gives me something like so.
But also, I think we can get rid of that free_list entirely.
Anyway, let me go break this up into individual patches and go test
this -- after lunch!
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1229,8 +1229,14 @@ static void put_ctx(struct perf_event_co
if (refcount_dec_and_test(&ctx->refcount)) {
if (ctx->parent_ctx)
put_ctx(ctx->parent_ctx);
- if (ctx->task && ctx->task != TASK_TOMBSTONE)
- put_task_struct(ctx->task);
+ if (ctx->task) {
+ if (ctx->task == TASK_TOMBSTONE) {
+ smp_mb();
+ wake_up_var(&ctx->refcount);
+ } else {
+ put_task_struct(ctx->task);
+ }
+ }
call_rcu(&ctx->rcu_head, free_ctx);
}
}
@@ -5550,8 +5556,6 @@ int perf_event_release_kernel(struct per
again:
mutex_lock(&event->child_mutex);
list_for_each_entry(child, &event->child_list, child_list) {
- void *var = NULL;
-
/*
* Cannot change, child events are not migrated, see the
* comment with perf_event_ctx_lock_nested().
@@ -5584,46 +5588,32 @@ int perf_event_release_kernel(struct per
tmp = list_first_entry_or_null(&event->child_list,
struct perf_event, child_list);
if (tmp == child) {
- perf_remove_from_context(child, DETACH_GROUP);
- list_move(&child->child_list, &free_list);
+ perf_remove_from_context(child, DETACH_GROUP | DETACH_CHILD);
+ /*
+ * Can't risk calling into free_event() here, since
+ * event->destroy() might invert with the currently
+ * held locks, see 82d94856fa22 ("perf/core: Fix lock
+ * inversion between perf,trace,cpuhp")
+ */
+ list_add(&child->child_list, &free_list);
/*
* This matches the refcount bump in inherit_event();
* this can't be the last reference.
*/
put_event(event);
- } else {
- var = &ctx->refcount;
}
mutex_unlock(&event->child_mutex);
mutex_unlock(&ctx->mutex);
put_ctx(ctx);
- if (var) {
- /*
- * If perf_event_free_task() has deleted all events from the
- * ctx while the child_mutex got released above, make sure to
- * notify about the preceding put_ctx().
- */
- smp_mb(); /* pairs with wait_var_event() */
- wake_up_var(var);
- }
goto again;
}
mutex_unlock(&event->child_mutex);
list_for_each_entry_safe(child, tmp, &free_list, child_list) {
- void *var = &child->ctx->refcount;
-
list_del(&child->child_list);
- free_event(child);
-
- /*
- * Wake any perf_event_free_task() waiting for this event to be
- * freed.
- */
- smp_mb(); /* pairs with wait_var_event() */
- wake_up_var(var);
+ put_event(child);
}
no_ctx:
On Fri, Jan 17, 2025 at 02:04:23PM +0100, Peter Zijlstra wrote:

> Anyway, let me go break this up into individual patches and go test
> this -- after lunch!

OK, so aside from a few dumb mistakes, the result seems to hold up with
your tinypmu testcase. I left it running for about 30 minutes.

I pushed out the latest patches to queue/perf/pmu-unregister
Hi Peter,

On 18-Jan-25 2:34 AM, Peter Zijlstra wrote:
> On Fri, Jan 17, 2025 at 02:04:23PM +0100, Peter Zijlstra wrote:
>
>> Anyway, let me go break this up into individual patches and go test
>> this -- after lunch!
>
> OK, so aside from a few dumb mistakes, the result seems to hold up with
> your tinypmu testcase. I left it running for about 30 minutes.
>
> I pushed out the latest patches to queue/perf/pmu-unregister

I'll spend some time to go through the changes.

I ran fuzzer over the weekend with latest queue/perf/pmu-unregister and
I saw this kernel crash:

BUG: kernel NULL pointer dereference, address: 00000000000000d0
#PF: supervisor read access in kernel mode
#PF: error_code(0x0000) - not-present page
PGD 12a922067 P4D 0
Oops: Oops: 0000 [#1] PREEMPT SMP NOPTI
CPU: 8 UID: 1002 PID: 8505 Comm: perf_fuzzer Kdump: loaded Tainted: G W O 6.13.0-rc1-pmu-unregister+ #171
Tainted: [W]=WARN, [O]=OOT_MODULE
Hardware name: AMD Corporation RUBY/RUBY, BIOS RRR1009C 07/21/2023
RIP: 0010:perf_mmap_to_page+0x6/0xc0
Code: ...
RSP: 0018:ffa0000003aff910 EFLAGS: 00010206
RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000008
RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000
RBP: ffa0000003aff980 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000
R13: 00007b2b02ca8000 R14: ff1100014f9cc5c0 R15: 0000000000000009
FS:  00007b2b02e03740(0000) GS:ff11001009000000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00000000000000d0 CR3: 000000014f99c001 CR4: 0000000000f71ef0
PKRU: 55555554
Call Trace:
 <TASK>
 ? show_regs+0x6c/0x80
 ? __die+0x24/0x80
 ? page_fault_oops+0x155/0x570
 ? do_user_addr_fault+0x4b2/0x870
 ? srso_alias_return_thunk+0x5/0xfbef5
 ? get_page_from_freelist+0x3c7/0x1680
 ? exc_page_fault+0x82/0x1b0
 ? asm_exc_page_fault+0x27/0x30
 ? perf_mmap_to_page+0x6/0xc0
 ? perf_mmap+0x237/0x710
 __mmap_region+0x6d5/0xb90
 mmap_region+0x8d/0xc0
 do_mmap+0x349/0x630
 vm_mmap_pgoff+0xf4/0x1c0
 ksys_mmap_pgoff+0x177/0x240
 __x64_sys_mmap+0x33/0x70
 x64_sys_call+0x24b9/0x2650
 do_syscall_64+0x7e/0x170
 ? srso_alias_return_thunk+0x5/0xfbef5
 ? srso_alias_return_thunk+0x5/0xfbef5
 ? sched_tick+0x119/0x320
 ? srso_alias_return_thunk+0x5/0xfbef5
 ? sysvec_irq_work+0x4f/0xc0
 ? srso_alias_return_thunk+0x5/0xfbef5
 ? rcu_report_qs_rnp+0xd1/0x140
 ? srso_alias_return_thunk+0x5/0xfbef5
 ? rcu_core+0x1c2/0x380
 ? srso_alias_return_thunk+0x5/0xfbef5
 ? rcu_core_si+0xe/0x20
 ? srso_alias_return_thunk+0x5/0xfbef5
 ? handle_softirqs+0xe7/0x330
 ? srso_alias_return_thunk+0x5/0xfbef5
 ? srso_alias_return_thunk+0x5/0xfbef5
 ? irqentry_exit_to_user_mode+0x43/0x250
 ? srso_alias_return_thunk+0x5/0xfbef5
 ? irqentry_exit+0x43/0x50
 ? srso_alias_return_thunk+0x5/0xfbef5
 ? sysvec_apic_timer_interrupt+0x4f/0xc0
 entry_SYSCALL_64_after_hwframe+0x76/0x7e
RIP: 0033:0x7b2b02b2531c
Code: ...
RSP: 002b:00007ffc3b177880 EFLAGS: 00000246 ORIG_RAX: 0000000000000009
RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007b2b02b2531c
RDX: 0000000000000003 RSI: 0000000000009000 RDI: 0000000000000000
RBP: 00007ffc3b177890 R08: 0000000000000003 R09: 0000000000000000
R10: 0000000000000001 R11: 0000000000000246 R12: 0000000000000001
R13: 0000000000000000 R14: 0000636a69c7bb60 R15: 00007b2b02e55000
 </TASK>
Modules linked in: ...
CR2: 00000000000000d0
---[ end trace 0000000000000000 ]---

Thanks,
Ravi
Hi Peter,
>> 4) A race with perf_event_set_bpf_prog(). perf_event_set_bpf_prog() might
>> be in process of setting event->prog, where as perf_pmu_unregister(),
>> which internally calls perf_event_free_bpf_prog(), will clear the
>> event->prog pointer.
>>
>> perf_pmu_unregister() perf_event_set_bpf_prog()
>> ... perf_event_set_bpf_handler()
>> perf_event_free_bpf_prog() event->prog = prog;
>> event->prog = NULL;
>>
>> I've yet to inspect other code paths, so there might be more races.
>
> Weird, that should be serialized by perf_event_ctx_lock(), both
> __pmu_detach_event() and _perf_ioctl() are called under that.
There are multiple code paths leading to perf_event_set_bpf_prog(). The
one starting from _perf_ioctl() is serialized. However, this is not:
  __sys_bpf()
    link_create()
      bpf_perf_link_attach()
        perf_event_set_bpf_prog()
          perf_event_set_bpf_handler()
            event->prog = prog;
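One way to close that hole, sketched only (mirroring how the ioctl path
is serialized; __perf_event_set_bpf_prog() is a hypothetical rename of
the current function body):

	int perf_event_set_bpf_prog(struct perf_event *event,
				    struct bpf_prog *prog, u64 bpf_cookie)
	{
		struct perf_event_context *ctx;
		int ret;

		ctx = perf_event_ctx_lock(event);
		/* now serialized against __pmu_detach_event() */
		ret = __perf_event_set_bpf_prog(event, prog, bpf_cookie);
		perf_event_ctx_unlock(event, ctx);

		return ret;
	}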
Thanks,
Ravi
On Fri, Jan 17, 2025 at 10:50:26AM +0530, Ravi Bangoria wrote:
> Hi Peter,
>
> >> 4) A race with perf_event_set_bpf_prog(). perf_event_set_bpf_prog() might
> >> be in process of setting event->prog, where as perf_pmu_unregister(),
> >> which internally calls perf_event_free_bpf_prog(), will clear the
> >> event->prog pointer.
> >>
> >>   perf_pmu_unregister()                 perf_event_set_bpf_prog()
> >>     ...                                   perf_event_set_bpf_handler()
> >>     perf_event_free_bpf_prog()              event->prog = prog;
> >>       event->prog = NULL;
> >>
> >> I've yet to inspect other code paths, so there might be more races.
> >
> > Weird, that should be serialized by perf_event_ctx_lock(), both
> > __pmu_detach_event() and _perf_ioctl() are called under that.
>
> There are multiple code paths leading to perf_event_set_bpf_prog(). The
> one starting from _perf_ioctl() is serialized. However, this is not:
>
>   __sys_bpf()
>     link_create()
>       bpf_perf_link_attach()
>         perf_event_set_bpf_prog()
>           perf_event_set_bpf_handler()
>             event->prog = prog;

Urgh yeah, that's broken. Damn bpf stuff :-/
>> How did you test; perf-fuzzer or something?
>
> Prepared a simple test that does pmu register(), unregister() and
> "perf record" in parallel. It's quite dirty, I'll clean it up and
> share it here.
Attaching the testsuite here.
It contains:
- Kernel module that allows a user to register and unregister a pmu
- Shell script that runs "perf record" on the test pmu
- README that explains how to run the test
Thanks,
Ravi

#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
# Author: Ravi Bangoria <ravi.bangoria@amd.com>
# Must not be run directly. Run it via tinypmu-u.sh
if [ "$EUID" -ne 0 ]; then
echo "Please run as root"
exit
fi
while true; do
echo 1 > /dev/tinypmu_register
done
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
# Author: Ravi Bangoria <ravi.bangoria@amd.com>
# Must not be run directly. Run it via tinypmu-u.sh
if [ "$EUID" -ne 0 ]; then
echo "Please run as root"
exit
fi
while true; do
echo 1 > /dev/tinypmu_unregister
done
# SPDX-License-Identifier: GPL-2.0
# Author: Ravi Bangoria <ravi.bangoria@amd.com>
obj-m := tinypmu-k.o
KDIR := /lib/modules/$(shell uname -r)/build
PWD := $(shell pwd)
default:
	$(MAKE) -C $(KDIR) M=$(PWD) modules

clean:
	rm -rf *.o *.mod.c *.mod *.ko *.order *.symvers \.*.o \.*.ko \.*.cmd \.tmp_versions
A simple testsuite to stress test pmu register / unregister code:
https://lore.kernel.org/r/20241104133909.669111662@infradead.org
Build:
$ make
Clean:
$ make clean
Test perf pmu register and unregister:
term1~$ while true; do sudo bash tinypmu-u.sh; done
Test event creation / mmap / unmap / event deletion etc. along with
pmu register / unregister (run this in parallel with the above command):
term2~$ sudo bash tinypmu-u-events.sh
// SPDX-License-Identifier: GPL-2.0
/*
* Provide an interface /dev/tinypmu_register and /dev/tinypmu_unregister to
* stress test perf_pmu_register() and perf_pmu_unregister() functions.
*
* Author: Ravi Bangoria <ravi.bangoria@amd.com>
*/
#define pr_fmt(fmt) "tinypmu: " fmt
#include <linux/module.h>
#include <linux/perf_event.h>
#include <linux/fs.h>
#include <linux/miscdevice.h>
#define NR_TINYPMUS 1
static int tinypmu_event_init(struct perf_event *event)
{
return 0;
}
static void tinypmu_del(struct perf_event *event, int flags)
{
}
static int tinypmu_add(struct perf_event *event, int flags)
{
return 0;
}
static void tinypmu_start(struct perf_event *event, int flags)
{
}
static void tinypmu_stop(struct perf_event *event, int flags)
{
}
static void tinypmu_read(struct perf_event *event)
{
}
PMU_FORMAT_ATTR(event, "config:0-20");
static struct attribute *tinypmu_events_attr[] = {
&format_attr_event.attr,
NULL,
};
static struct attribute_group tinypmu_events_group = {
.name = "format",
.attrs = tinypmu_events_attr,
};
static const struct attribute_group *tinypmu_attr_groups[] = {
&tinypmu_events_group,
NULL,
};
static struct pmu *alloc_tinypmu(void)
{
struct pmu *tinypmu = kzalloc(sizeof(struct pmu), GFP_KERNEL);
if (!tinypmu)
return NULL;
tinypmu->task_ctx_nr = perf_invalid_context;
tinypmu->event_init = tinypmu_event_init;
tinypmu->add = tinypmu_add;
tinypmu->del = tinypmu_del;
tinypmu->start = tinypmu_start;
tinypmu->stop = tinypmu_stop;
tinypmu->read = tinypmu_read;
tinypmu->attr_groups = tinypmu_attr_groups;
return tinypmu;
}
static DEFINE_MUTEX(lock);
static struct pmu *tinypmus[NR_TINYPMUS];
static char pmu_name[NR_TINYPMUS][11];
static void register_pmu(unsigned int idx)
{
struct pmu *temp;
int ret;
if (idx >= NR_TINYPMUS)
return;
temp = alloc_tinypmu();
	if (!temp)
		return;	/* 'lock' is not held yet, nothing to unlock */
mutex_lock(&lock);
if (tinypmus[idx]) {
mutex_unlock(&lock);
kfree(temp);
return;
}
ret = perf_pmu_register(temp, pmu_name[idx], -1);
if (!ret) {
tinypmus[idx] = temp;
mutex_unlock(&lock);
return;
}
mutex_unlock(&lock);
kfree(temp);
return;
}
static void unregister_pmu(unsigned int idx)
{
struct pmu *temp;
if (idx >= NR_TINYPMUS)
return;
mutex_lock(&lock);
if (!tinypmus[idx]) {
mutex_unlock(&lock);
return;
}
	/*
	 * Must call perf_pmu_unregister() inside the critical section. If
	 * the critical section were shrunk by caching tinypmus[idx] in
	 * temp, clearing tinypmus[idx] and doing the unregister after
	 * mutex_unlock(), register_pmu() would assume no pmu exists with
	 * the "tinypmu<idx>" name (since tinypmus[idx] is NULL) and try to
	 * register a new pmu although this function has not yet
	 * unregistered the pmu with the same name.
	 */
perf_pmu_unregister(tinypmus[idx]);
temp = tinypmus[idx];
tinypmus[idx] = NULL;
mutex_unlock(&lock);
kfree(temp);
}
static ssize_t register_write(struct file *f, const char *data, size_t size,
loff_t *ppos)
{
unsigned int idx;
get_random_bytes(&idx, sizeof(idx));
idx %= NR_TINYPMUS;
register_pmu(idx);
return 1;
}
static ssize_t unregister_write(struct file *f, const char *data, size_t size,
loff_t *ppos)
{
unsigned int idx;
get_random_bytes(&idx, sizeof(idx));
idx %= NR_TINYPMUS;
unregister_pmu(idx);
return 1;
}
static const struct file_operations register_fops = {
.owner = THIS_MODULE,
.write = register_write,
};
static const struct file_operations unregister_fops = {
.owner = THIS_MODULE,
.write = unregister_write,
};
static struct miscdevice register_dev = {
MISC_DYNAMIC_MINOR,
"tinypmu_register",
&register_fops,
};
static struct miscdevice unregister_dev = {
MISC_DYNAMIC_MINOR,
"tinypmu_unregister",
&unregister_fops,
};
static int __init hello_init(void)
{
int ret;
int i;
	ret = misc_register(&register_dev);
if (ret) {
pr_err("Failed to register register_dev\n");
return ret;
}
ret = misc_register(&unregister_dev);
if (ret) {
pr_err("Failed to register unregister_dev\n");
		misc_deregister(&register_dev);
return ret;
}
for (i = 0; i < NR_TINYPMUS; i++)
sprintf(pmu_name[i], "tinypmu%d", i);
for (i = 0; i < NR_TINYPMUS; i++)
register_pmu(i);
return 0;
}
module_init(hello_init);
static void __exit hello_exit(void)
{
int i;
for (i = 0; i < NR_TINYPMUS; i++)
unregister_pmu(i);
	misc_deregister(&register_dev);
misc_deregister(&unregister_dev);
}
module_exit(hello_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Ravi Bangoria");
MODULE_DESCRIPTION("PMU register/unregister stress test");
MODULE_VERSION("dev");
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
# Author: Ravi Bangoria <ravi.bangoria@amd.com>
if [ "$EUID" -ne 0 ]; then
echo "Please run as root"
exit
fi
sudo insmod tinypmu-k.ko
cleanup() {
for i in "$@"; do
echo -n "${i} "
kill ${i}
done
wait
rmmod tinypmu_k
rm -rf /dev/tinypmu_register
rm -rf /dev/tinypmu_unregister
echo ""
exit
}
bash _tinypmu-u-register.sh &
reg_pid=$!
bash _tinypmu-u-unregister.sh &
unreg_pid=$!
# register Ctrl+C cleanup if aborted inbetween
#trap "cleanup '${reg_pid}' '${unreg_pid}' ${event_pids[@]}" 2
echo ${reg_pid} ${unreg_pid}
sleep 10
cleanup ${reg_pid} ${unreg_pid}
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
# Author: Ravi Bangoria <ravi.bangoria@amd.com>
pmu_nr=${1}
if [ "$EUID" -ne 0 ]; then
echo "Please run as root"
exit
fi
while true; do
perf record -a -e tinypmu${pmu_nr}// -- sleep 1 >> /dev/null 2>&1 &
done
On 2024-11-04 8:39 a.m., Peter Zijlstra wrote:
> Previously it was only safe to call perf_pmu_unregister() if there
> were no active events of that pmu around -- which was impossible to
> guarantee since it races all sorts against perf_init_event().
>
> Rework the whole thing by:
>
> - keeping track of all events for a given pmu
>
> - 'hiding' the pmu from perf_init_event()
>
> - waiting for the appropriate (s)rcu grace periods such that all
> prior references to the PMU will be completed
>
> - detaching all still existing events of that pmu (see first point)
> and moving them to a new REVOKED state.
>
> - actually freeing the pmu data.
>
> Where notably the new REVOKED state must inhibit all event actions
> from reaching code that wants to use event->pmu.
>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
> include/linux/perf_event.h | 13 +-
> kernel/events/core.c | 222 ++++++++++++++++++++++++++++++++++++++++-----
> 2 files changed, 210 insertions(+), 25 deletions(-)
>
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -318,6 +318,9 @@ struct perf_output_handle;
> struct pmu {
> struct list_head entry;
>
> + spinlock_t events_lock;
> + struct list_head events;
> +
> struct module *module;
> struct device *dev;
> struct device *parent;
> @@ -611,9 +614,10 @@ struct perf_addr_filter_range {
> * enum perf_event_state - the states of an event:
> */
> enum perf_event_state {
> - PERF_EVENT_STATE_DEAD = -4,
> - PERF_EVENT_STATE_EXIT = -3,
> - PERF_EVENT_STATE_ERROR = -2,
> + PERF_EVENT_STATE_DEAD = -5,
> + PERF_EVENT_STATE_REVOKED = -4, /* pmu gone, must not touch */
> + PERF_EVENT_STATE_EXIT = -3, /* task died, still inherit */
> + PERF_EVENT_STATE_ERROR = -2, /* scheduling error, can enable */
> PERF_EVENT_STATE_OFF = -1,
> PERF_EVENT_STATE_INACTIVE = 0,
> PERF_EVENT_STATE_ACTIVE = 1,
> @@ -854,6 +858,7 @@ struct perf_event {
> void *security;
> #endif
> struct list_head sb_list;
> + struct list_head pmu_list;
>
> /*
> * Certain events gets forwarded to another pmu internally by over-
> @@ -1105,7 +1110,7 @@ extern void perf_aux_output_flag(struct
> extern void perf_event_itrace_started(struct perf_event *event);
>
> extern int perf_pmu_register(struct pmu *pmu, const char *name, int type);
> -extern void perf_pmu_unregister(struct pmu *pmu);
> +extern int perf_pmu_unregister(struct pmu *pmu);
>
> extern void __perf_event_task_sched_in(struct task_struct *prev,
> struct task_struct *task);
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -2412,7 +2412,9 @@ ctx_time_update_event(struct perf_event_
>
> #define DETACH_GROUP 0x01UL
> #define DETACH_CHILD 0x02UL
> -#define DETACH_DEAD 0x04UL
> +#define DETACH_EXIT 0x04UL
> +#define DETACH_REVOKE 0x08UL
> +#define DETACH_DEAD 0x10UL
>
> /*
> * Cross CPU call to remove a performance event
> @@ -2427,6 +2429,7 @@ __perf_remove_from_context(struct perf_e
> void *info)
> {
> struct perf_event_pmu_context *pmu_ctx = event->pmu_ctx;
> + enum perf_event_state state = PERF_EVENT_STATE_OFF;
Setting PERF_EVENT_STATE_OFF as the default seems dangerous.
If the event was in an error state, the state will be overwritten with
PERF_EVENT_STATE_OFF later.
One example may be perf_pmu_migrate_context(). After the migration,
it looks like all the error state will be cleared.
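Concretely, as a sketch of the flow in question (based on the hunk above,
not on tested code):

	/* event->state == PERF_EVENT_STATE_ERROR */
	perf_pmu_migrate_context()
	  perf_remove_from_context(event, 0)
	    __perf_remove_from_context()
	      state = PERF_EVENT_STATE_OFF;	/* default, no DETACH_* flag */
	      ...
	      event->state = state;		/* ERROR silently becomes OFF */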
Thanks,
Kan
> unsigned long flags = (unsigned long)info;
>
> ctx_time_update(cpuctx, ctx);
> @@ -2435,16 +2438,22 @@ __perf_remove_from_context(struct perf_e
> * Ensure event_sched_out() switches to OFF, at the very least
> * this avoids raising perf_pending_task() at this time.
> */
> - if (flags & DETACH_DEAD)
> + if (flags & DETACH_EXIT)
> + state = PERF_EVENT_STATE_EXIT;
> + if (flags & DETACH_REVOKE)
> + state = PERF_EVENT_STATE_REVOKED;
> + if (flags & DETACH_DEAD) {
> event->pending_disable = 1;
> + state = PERF_EVENT_STATE_DEAD;
> + }
> event_sched_out(event, ctx);
> if (flags & DETACH_GROUP)
> perf_group_detach(event);
> if (flags & DETACH_CHILD)
> perf_child_detach(event);
> list_del_event(event, ctx);
> - if (flags & DETACH_DEAD)
> - event->state = PERF_EVENT_STATE_DEAD;
> +
> + event->state = state;
>
> if (!pmu_ctx->nr_events) {
> pmu_ctx->rotate_necessary = 0;
> @@ -4511,7 +4520,8 @@ static void perf_event_enable_on_exec(st
>
> static void perf_remove_from_owner(struct perf_event *event);
> static void perf_event_exit_event(struct perf_event *event,
> - struct perf_event_context *ctx);
> + struct perf_event_context *ctx,
> + bool revoke);
>
> /*
> * Removes all events from the current task that have been marked
> @@ -4538,7 +4548,7 @@ static void perf_event_remove_on_exec(st
>
> modified = true;
>
> - perf_event_exit_event(event, ctx);
> + perf_event_exit_event(event, ctx, false);
> }
>
> raw_spin_lock_irqsave(&ctx->lock, flags);
> @@ -5138,6 +5148,7 @@ static bool is_sb_event(struct perf_even
> attr->context_switch || attr->text_poke ||
> attr->bpf_event)
> return true;
> +
> return false;
> }
>
> @@ -5339,6 +5350,8 @@ static void perf_pending_task_sync(struc
> /* vs perf_event_alloc() error */
> static void __free_event(struct perf_event *event)
> {
> + struct pmu *pmu = event->pmu;
> +
> if (event->attach_state & PERF_ATTACH_CALLCHAIN)
> put_callchain_buffers();
>
> @@ -5365,6 +5378,7 @@ static void __free_event(struct perf_eve
> * put_pmu_ctx() needs an event->ctx reference, because of
> * epc->ctx.
> */
> + WARN_ON_ONCE(!pmu);
> WARN_ON_ONCE(!event->ctx);
> WARN_ON_ONCE(event->pmu_ctx->ctx != event->ctx);
> put_pmu_ctx(event->pmu_ctx);
> @@ -5377,8 +5391,13 @@ static void __free_event(struct perf_eve
> if (event->ctx)
> put_ctx(event->ctx);
>
> - if (event->pmu)
> - module_put(event->pmu->module);
> + if (pmu) {
> + module_put(pmu->module);
> + scoped_guard (spinlock, &pmu->events_lock) {
> + list_del(&event->pmu_list);
> + wake_up_var(pmu);
> + }
> + }
>
> call_rcu(&event->rcu_head, free_event_rcu);
> }
> @@ -5397,6 +5416,7 @@ static void _free_event(struct perf_even
> security_perf_event_free(event);
>
> if (event->rb) {
> + WARN_ON_ONCE(!event->pmu);
> /*
> * Can happen when we close an event with re-directed output.
> *
> @@ -5527,7 +5547,11 @@ int perf_event_release_kernel(struct per
> * Thus this guarantees that we will in fact observe and kill _ALL_
> * child events.
> */
> - perf_remove_from_context(event, DETACH_GROUP|DETACH_DEAD);
> + if (event->state > PERF_EVENT_STATE_REVOKED) {
> + perf_remove_from_context(event, DETACH_GROUP|DETACH_DEAD);
> + } else {
> + event->state = PERF_EVENT_STATE_DEAD;
> + }
>
> perf_event_ctx_unlock(event, ctx);
>
> @@ -5838,7 +5862,7 @@ __perf_read(struct perf_event *event, ch
> * error state (i.e. because it was pinned but it couldn't be
> * scheduled on to the CPU at some point).
> */
> - if (event->state == PERF_EVENT_STATE_ERROR)
> + if (event->state <= PERF_EVENT_STATE_ERROR)
> return 0;
>
> if (count < event->read_size)
> @@ -5877,8 +5901,14 @@ static __poll_t perf_poll(struct file *f
> struct perf_buffer *rb;
> __poll_t events = EPOLLHUP;
>
> + if (event->state <= PERF_EVENT_STATE_REVOKED)
> + return EPOLLERR;
> +
> poll_wait(file, &event->waitq, wait);
>
> + if (event->state <= PERF_EVENT_STATE_REVOKED)
> + return EPOLLERR;
> +
> if (is_event_hup(event))
> return events;
>
> @@ -6058,6 +6088,9 @@ static long _perf_ioctl(struct perf_even
> void (*func)(struct perf_event *);
> u32 flags = arg;
>
> + if (event->state <= PERF_EVENT_STATE_REVOKED)
> + return -ENODEV;
> +
> switch (cmd) {
> case PERF_EVENT_IOC_ENABLE:
> func = _perf_event_enable;
> @@ -6507,6 +6540,7 @@ static void perf_mmap_close(struct vm_ar
> unsigned long size = perf_data_size(rb);
> bool detach_rest = false;
>
> + /* FIXIES vs perf_pmu_unregister() */
> if (event->pmu->event_unmapped)
> event->pmu->event_unmapped(event, vma->vm_mm);
>
> @@ -6657,6 +6691,16 @@ static int perf_mmap(struct file *file,
> mutex_lock(&event->mmap_mutex);
> ret = -EINVAL;
>
> + /*
> + * This relies on __pmu_detach_event() taking mmap_mutex after marking
> + * the event REVOKED. Either we observe the state, or __pmu_detach_event()
> + * will detach the rb created here.
> + */
> + if (event->state <= PERF_EVENT_STATE_REVOKED) {
> + ret = -ENODEV;
> + goto unlock;
> + }
> +
> if (vma->vm_pgoff == 0) {
> nr_pages -= 1;
>
> @@ -6840,6 +6884,9 @@ static int perf_fasync(int fd, struct fi
> struct perf_event *event = filp->private_data;
> int retval;
>
> + if (event->state <= PERF_EVENT_STATE_REVOKED)
> + return -ENODEV;
> +
> inode_lock(inode);
> retval = fasync_helper(fd, filp, on, &event->fasync);
> inode_unlock(inode);
> @@ -11892,6 +11939,9 @@ int perf_pmu_register(struct pmu *_pmu,
> if (!pmu->event_idx)
> pmu->event_idx = perf_event_idx_default;
>
> + INIT_LIST_HEAD(&pmu->events);
> + spin_lock_init(&pmu->events_lock);
> +
> /*
> * Now that the PMU is complete, make it visible to perf_try_init_event().
> */
> @@ -11905,11 +11955,100 @@ int perf_pmu_register(struct pmu *_pmu,
> }
> EXPORT_SYMBOL_GPL(perf_pmu_register);
>
> -void perf_pmu_unregister(struct pmu *pmu)
> +static void __pmu_detach_event(struct pmu *pmu, struct perf_event *event,
> + struct perf_event_context *ctx)
> +{
> + /*
> + * De-schedule the event and mark it REVOKED.
> + */
> + perf_event_exit_event(event, ctx, true);
> +
> + /*
> + * All _free_event() bits that rely on event->pmu:
> + *
> + * Notably, perf_mmap() relies on the ordering here.
> + */
> + scoped_guard (mutex, &event->mmap_mutex) {
> + WARN_ON_ONCE(pmu->event_unmapped);
> + ring_buffer_attach(event, NULL);
> + }
> +
> + perf_event_free_bpf_prog(event);
> + perf_free_addr_filters(event);
> +
> + if (event->destroy) {
> + event->destroy(event);
> + event->destroy = NULL;
> + }
> +
> + if (event->pmu_ctx) {
> + put_pmu_ctx(event->pmu_ctx);
> + event->pmu_ctx = NULL;
> + }
> +
> + exclusive_event_destroy(event);
> + module_put(pmu->module);
> +
> + event->pmu = NULL; /* force fault instead of UAF */
> +}
> +
> +static void pmu_detach_event(struct pmu *pmu, struct perf_event *event)
> +{
> + struct perf_event_context *ctx;
> +
> + ctx = perf_event_ctx_lock(event);
> + __pmu_detach_event(pmu, event, ctx);
> + perf_event_ctx_unlock(event, ctx);
> +
> + scoped_guard (spinlock, &pmu->events_lock)
> + list_del(&event->pmu_list);
> +}
> +
> +static struct perf_event *pmu_get_event(struct pmu *pmu)
> +{
> + struct perf_event *event;
> +
> + guard(spinlock)(&pmu->events_lock);
> + list_for_each_entry(event, &pmu->events, pmu_list) {
> + if (atomic_long_inc_not_zero(&event->refcount))
> + return event;
> + }
> +
> + return NULL;
> +}
> +
> +static bool pmu_empty(struct pmu *pmu)
> +{
> + guard(spinlock)(&pmu->events_lock);
> + return list_empty(&pmu->events);
> +}
> +
> +static void pmu_detach_events(struct pmu *pmu)
> +{
> + struct perf_event *event;
> +
> + for (;;) {
> + event = pmu_get_event(pmu);
> + if (!event)
> + break;
> +
> + pmu_detach_event(pmu, event);
> + put_event(event);
> + }
> +
> + /*
> + * wait for pending _free_event()s
> + */
> + wait_var_event(pmu, pmu_empty(pmu));
> +}
> +
> +int perf_pmu_unregister(struct pmu *pmu)
> {
> scoped_guard (mutex, &pmus_lock) {
> + if (!idr_cmpxchg(&pmu_idr, pmu->type, pmu, NULL))
> + return -EINVAL;
> +
> list_del_rcu(&pmu->entry);
> - idr_remove(&pmu_idr, pmu->type);
> }
>
> /*
> @@ -11919,7 +12058,31 @@ void perf_pmu_unregister(struct pmu *pmu
> synchronize_srcu(&pmus_srcu);
> synchronize_rcu();
>
> + if (pmu->event_unmapped && !pmu_empty(pmu)) {
> + /*
> + * Can't force remove events when pmu::event_unmapped()
> + * is used in perf_mmap_close().
> + */
> + guard(mutex)(&pmus_lock);
> + idr_cmpxchg(&pmu_idr, pmu->type, NULL, pmu);
> + list_add_rcu(&pmu->entry, &pmus);
> + return -EBUSY;
> + }
> +
> + scoped_guard (mutex, &pmus_lock)
> + idr_remove(&pmu_idr, pmu->type);
> +
> + /*
> + * PMU is removed from the pmus list, so no new events will
> + * be created, now take care of the existing ones.
> + */
> + pmu_detach_events(pmu);
> +
> + /*
> + * PMU is unused, make it go away.
> + */
> perf_pmu_free(pmu);
> + return 0;
> }
> EXPORT_SYMBOL_GPL(perf_pmu_unregister);
>
> @@ -12226,6 +12389,7 @@ perf_event_alloc(struct perf_event_attr
> INIT_LIST_HEAD(&event->active_entry);
> INIT_LIST_HEAD(&event->addr_filters.list);
> INIT_HLIST_NODE(&event->hlist_entry);
> + INIT_LIST_HEAD(&event->pmu_list);
>
>
> init_waitqueue_head(&event->waitq);
> @@ -12294,6 +12458,13 @@ perf_event_alloc(struct perf_event_attr
>
> perf_event__state_init(event);
>
> + /*
> + * Hold SRCU critical section around perf_init_event(), until returning
> + * the fully formed event put on pmu->events_list. This ensures that
> + * perf_pmu_unregister() will see any in-progress event creation that
> + * races.
> + */
> + guard(srcu)(&pmus_srcu);
> pmu = NULL;
>
> hwc = &event->hw;
> @@ -12383,6 +12554,9 @@ perf_event_alloc(struct perf_event_attr
> /* symmetric to unaccount_event() in _free_event() */
> account_event(event);
>
> + scoped_guard (spinlock, &pmu->events_lock)
> + list_add(&event->pmu_list, &pmu->events);
> +
> return_ptr(event);
> }
>
> @@ -12769,6 +12943,10 @@ SYSCALL_DEFINE5(perf_event_open,
> if (err)
> goto err_fd;
> group_leader = fd_file(group)->private_data;
> + if (group_leader->state <= PERF_EVENT_STATE_REVOKED) {
> + err = -ENODEV;
> + goto err_group_fd;
> + }
> if (flags & PERF_FLAG_FD_OUTPUT)
> output_event = group_leader;
> if (flags & PERF_FLAG_FD_NO_GROUP)
> @@ -13316,10 +13494,11 @@ static void sync_child_event(struct perf
> }
>
> static void
> -perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx)
> +perf_event_exit_event(struct perf_event *event,
> + struct perf_event_context *ctx, bool revoke)
> {
> struct perf_event *parent_event = event->parent;
> - unsigned long detach_flags = 0;
> + unsigned long detach_flags = DETACH_EXIT;
>
> if (parent_event) {
> /*
> @@ -13334,16 +13513,14 @@ perf_event_exit_event(struct perf_event
> * Do destroy all inherited groups, we don't care about those
> * and being thorough is better.
> */
> - detach_flags = DETACH_GROUP | DETACH_CHILD;
> + detach_flags |= DETACH_GROUP | DETACH_CHILD;
> mutex_lock(&parent_event->child_mutex);
> }
>
> - perf_remove_from_context(event, detach_flags);
> + if (revoke)
> + detach_flags |= DETACH_GROUP | DETACH_REVOKE;
>
> - raw_spin_lock_irq(&ctx->lock);
> - if (event->state > PERF_EVENT_STATE_EXIT)
> - perf_event_set_state(event, PERF_EVENT_STATE_EXIT);
> - raw_spin_unlock_irq(&ctx->lock);
> + perf_remove_from_context(event, detach_flags);
>
> /*
> * Child events can be freed.
> @@ -13419,7 +13596,7 @@ static void perf_event_exit_task_context
> perf_event_task(child, child_ctx, 0);
>
> list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
> - perf_event_exit_event(child_event, child_ctx);
> + perf_event_exit_event(child_event, child_ctx, false);
>
> mutex_unlock(&child_ctx->mutex);
>
> @@ -13609,6 +13786,9 @@ inherit_event(struct perf_event *parent_
> if (parent_event->parent)
> parent_event = parent_event->parent;
>
> + if (parent_event->state <= PERF_EVENT_STATE_REVOKED)
> + return NULL;
> +
> child_event = perf_event_alloc(&parent_event->attr,
> parent_event->cpu,
> child,
>
>
>
On Tue, Nov 05, 2024 at 10:08:54AM -0500, Liang, Kan wrote:
> > @@ -2427,6 +2429,7 @@ __perf_remove_from_context(struct perf_e
> > void *info)
> > {
> > struct perf_event_pmu_context *pmu_ctx = event->pmu_ctx;
> > + enum perf_event_state state = PERF_EVENT_STATE_OFF;
>
> Setting PERF_EVENT_STATE_OFF as the default seems dangerous.
> If the event was in an error state, its state will later be overwritten
> with PERF_EVENT_STATE_OFF.
>
> One example may be perf_pmu_migrate_context(): after the migration, it
> looks like any error state will be cleared.
>
> Thanks,
> Kan
>
> > unsigned long flags = (unsigned long)info;
> >
> > ctx_time_update(cpuctx, ctx);
> > @@ -2435,16 +2438,22 @@ __perf_remove_from_context(struct perf_e
> > * Ensure event_sched_out() switches to OFF, at the very least
> > * this avoids raising perf_pending_task() at this time.
> > */
> > - if (flags & DETACH_DEAD)
> > + if (flags & DETACH_EXIT)
> > + state = PERF_EVENT_STATE_EXIT;
> > + if (flags & DETACH_REVOKE)
> > + state = PERF_EVENT_STATE_REVOKED;
> > + if (flags & DETACH_DEAD) {
> > event->pending_disable = 1;
> > + state = PERF_EVENT_STATE_DEAD;
> > + }
> > event_sched_out(event, ctx);
> > if (flags & DETACH_GROUP)
> > perf_group_detach(event);
> > if (flags & DETACH_CHILD)
> > perf_child_detach(event);
> > list_del_event(event, ctx);
> > - if (flags & DETACH_DEAD)
> > - event->state = PERF_EVENT_STATE_DEAD;
> > +
> > + event->state = state;
How about we make this:
event->state = min(event->state, state);
?
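E.g., a quick userspace sketch of the intent (not kernel code; the values are
taken from the enum in the patch, where the more terminal a state the more
negative its value, so min() never resurrects an event but does keep ERROR
across a plain removal):

#include <assert.h>

enum {
	STATE_DEAD	= -5,
	STATE_REVOKED	= -4,
	STATE_EXIT	= -3,
	STATE_ERROR	= -2,
	STATE_OFF	= -1,
};

#define min(a, b)	((a) < (b) ? (a) : (b))

int main(void)
{
	/* plain removal: ERROR (-2) beats the OFF (-1) default, so it is kept */
	assert(min(STATE_ERROR, STATE_OFF) == STATE_ERROR);

	/* pmu unregister: REVOKED (-4) still overrides ERROR (-2) */
	assert(min(STATE_ERROR, STATE_REVOKED) == STATE_REVOKED);

	/* event release: DEAD (-5) overrides everything */
	assert(min(STATE_EXIT, STATE_DEAD) == STATE_DEAD);
	return 0;
}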
On 2024-11-05 10:16 a.m., Peter Zijlstra wrote:
> On Tue, Nov 05, 2024 at 10:08:54AM -0500, Liang, Kan wrote:
>>> @@ -2427,6 +2429,7 @@ __perf_remove_from_context(struct perf_e
>>> void *info)
>>> {
>>> struct perf_event_pmu_context *pmu_ctx = event->pmu_ctx;
>>> + enum perf_event_state state = PERF_EVENT_STATE_OFF;
>>
>> Setting PERF_EVENT_STATE_OFF as the default seems dangerous.
>> If the event was in an error state, its state will later be overwritten
>> with PERF_EVENT_STATE_OFF.
>>
>> One example may be perf_pmu_migrate_context(): after the migration, it
>> looks like any error state will be cleared.
>>
>> Thanks,
>> Kan
>>
>>> unsigned long flags = (unsigned long)info;
>>>
>>> ctx_time_update(cpuctx, ctx);
>>> @@ -2435,16 +2438,22 @@ __perf_remove_from_context(struct perf_e
>>> * Ensure event_sched_out() switches to OFF, at the very least
>>> * this avoids raising perf_pending_task() at this time.
>>> */
>>> - if (flags & DETACH_DEAD)
>>> + if (flags & DETACH_EXIT)
>>> + state = PERF_EVENT_STATE_EXIT;
>>> + if (flags & DETACH_REVOKE)
>>> + state = PERF_EVENT_STATE_REVOKED;
>>> + if (flags & DETACH_DEAD) {
>>> event->pending_disable = 1;
>>> + state = PERF_EVENT_STATE_DEAD;
>>> + }
>>> event_sched_out(event, ctx);
>>> if (flags & DETACH_GROUP)
>>> perf_group_detach(event);
>>> if (flags & DETACH_CHILD)
>>> perf_child_detach(event);
>>> list_del_event(event, ctx);
>>> - if (flags & DETACH_DEAD)
>>> - event->state = PERF_EVENT_STATE_DEAD;
>>> +
>>> + event->state = state;
>
> How about we make this:
>
> event->state = min(event->state, state);
>
> ?
Yep, it looks good to me.
Thanks,
Kan