If a pmu is unregistered while there's an active event, perf still
accesses the pmu via event->pmu until the event itself is destroyed.
This makes it difficult for drivers like i915 that can be unbound from
the HW.

BUG: KASAN: use-after-free in exclusive_event_destroy+0xd8/0xf0
Read of size 4 at addr ffff88816e2bb63c by task perf/7748

i915 tries to cope with it by installing an event->destroy callback,
but that is not sufficient: if the pmu itself is released by the
driver, it will still crash since event->pmu is still used.

Moreover, even with that use-after-free fixed by adjusting the order in
_free_event() or by having the driver delay the free, the kernel still
oopses when closing the event fd of an unregistered pmu: the percpu
variables allocated in perf_pmu_register() have already been released
by then. One such crash is:

BUG: KASAN: user-memory-access in _raw_spin_lock_irqsave+0x88/0x100
Write of size 4 at addr 00000000ffffffff by task perf/727

CPU: 0 UID: 0 PID: 727 Comm: perf Not tainted 6.12.0-rc1-DEMARCHI-dxnf+ #9
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS unknown 2/2/2022
Call Trace:
<TASK>
dump_stack_lvl+0x5f/0x90
print_report+0x4d3/0x50a
? __pfx__raw_spin_lock_irqsave+0x10/0x10
? kasan_addr_to_slab+0xd/0xb0
kasan_report+0xe2/0x170
? _raw_spin_lock_irqsave+0x88/0x100
? _raw_spin_lock_irqsave+0x88/0x100
kasan_check_range+0x125/0x230
__kasan_check_write+0x14/0x30
_raw_spin_lock_irqsave+0x88/0x100
? __pfx__raw_spin_lock_irqsave+0x10/0x10
_atomic_dec_and_raw_lock_irqsave+0x89/0x110
? __kasan_check_write+0x14/0x30
put_pmu_ctx+0x98/0x330

The fix here is to provide a set of get/put hooks that drivers can
implement to tie perf's pmu lifecycle to the driver's instance
lifecycle. With this, perf_pmu_unregister() can be called by the
driver, which is then responsible for freeing the resources.

Signed-off-by: Lucas De Marchi <lucas.demarchi@intel.com>
---
include/linux/perf_event.h | 12 ++++++++++++
kernel/events/core.c | 37 ++++++++++++++++++++++++++++++++-----
2 files changed, 44 insertions(+), 5 deletions(-)
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index fb908843f209..d6983dbf5a45 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -561,6 +561,17 @@ struct pmu {
* Check period value for PERF_EVENT_IOC_PERIOD ioctl.
*/
int (*check_period) (struct perf_event *event, u64 value); /* optional */
+
+ /*
+ * Optional: get a reference. Typically needed by PMUs that are bound to a device
+ * that can be hotplugged, either physically or through sysfs' bind/unbind. When provided,
+ * pmu::put() is mandatory and it's the driver's responsibility to call perf_pmu_free() when
+ * resources can be released.
+ */
+ struct pmu *(*get) (struct pmu *pmu);
+
+ /* Optional: put a reference. See pmu::get() */
+ void (*put) (struct pmu *pmu);
};
enum perf_addr_filter_action_t {
@@ -1104,6 +1115,7 @@ extern void perf_event_itrace_started(struct perf_event *event);
extern int perf_pmu_register(struct pmu *pmu, const char *name, int type);
extern void perf_pmu_unregister(struct pmu *pmu);
+extern void perf_pmu_free(struct pmu *pmu);
extern void __perf_event_task_sched_in(struct task_struct *prev,
struct task_struct *task);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 6395dbf67671..bf5b8fc8979e 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5319,8 +5319,24 @@ static void perf_pending_task_sync(struct perf_event *event)
rcuwait_wait_event(&event->pending_work_wait, !event->pending_work, TASK_UNINTERRUPTIBLE);
}
+static void pmu_module_put(struct pmu **ppmu)
+{
+ struct pmu *pmu = *ppmu;
+ struct module *module = pmu->module;
+
+ if (pmu->put)
+ pmu->put(pmu);
+
+ module_put(module);
+
+ /* Can't touch pmu anymore. */
+ *ppmu = NULL;
+}
+
static void _free_event(struct perf_event *event)
{
+ struct module *module;
+
irq_work_sync(&event->pending_irq);
irq_work_sync(&event->pending_disable_irq);
perf_pending_task_sync(event);
@@ -5374,7 +5390,8 @@ static void _free_event(struct perf_event *event)
put_ctx(event->ctx);
exclusive_event_destroy(event);
- module_put(event->pmu->module);
+
+ pmu_module_put(&event->pmu);
call_rcu(&event->rcu_head, free_event_rcu);
}
@@ -11512,10 +11529,12 @@ static int perf_event_idx_default(struct perf_event *event)
return 0;
}
-static void free_pmu_context(struct pmu *pmu)
+void perf_pmu_free(struct pmu *pmu)
{
free_percpu(pmu->cpu_pmu_context);
+ free_percpu(pmu->pmu_disable_count);
}
+EXPORT_SYMBOL_GPL(perf_pmu_free);
/*
* Let userspace know that this PMU supports address range filtering:
@@ -11749,6 +11768,11 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
goto free_pdc;
}
+ if (WARN_ONCE((!!pmu->get) ^ (!!pmu->put), "Cannot register a pmu with only get or put defined.\n")) {
+ ret = -EINVAL;
+ goto free_pdc;
+ }
+
pmu->name = name;
if (type >= 0)
@@ -11855,8 +11879,8 @@ void perf_pmu_unregister(struct pmu *pmu)
mutex_unlock(&pmus_lock);
- free_percpu(pmu->pmu_disable_count);
- free_pmu_context(pmu);
+ if (!pmu->put)
+ perf_pmu_free(pmu);
}
EXPORT_SYMBOL_GPL(perf_pmu_unregister);
@@ -11890,6 +11914,9 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
BUG_ON(!ctx);
}
+ if (pmu->get)
+ pmu->get(pmu);
+
event->pmu = pmu;
ret = pmu->event_init(event);
@@ -11926,7 +11953,7 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
}
if (ret)
- module_put(pmu->module);
+ pmu_module_put(&pmu);
return ret;
}
--
2.46.2
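To illustrate the intended use of the new hooks, here is a minimal
sketch (hypothetical foo_* names, not code from this series): a driver
embeds the pmu in a refcounted container so the memory survives an
unbind, and perf_pmu_free() runs only when the last reference drops.

#include <linux/kref.h>
#include <linux/perf_event.h>
#include <linux/slab.h>

struct foo_pmu {
	struct pmu base;
	struct kref kref;	/* kref_init()ed when the driver allocates this */
};

static void foo_pmu_release(struct kref *kref)
{
	struct foo_pmu *fpmu = container_of(kref, struct foo_pmu, kref);

	perf_pmu_free(&fpmu->base);	/* release perf's percpu data (new API above) */
	kfree(fpmu);
}

/* pmu::get(), called by perf when an event binds to this pmu */
static struct pmu *foo_pmu_get(struct pmu *pmu)
{
	struct foo_pmu *fpmu = container_of(pmu, struct foo_pmu, base);

	kref_get(&fpmu->kref);
	return pmu;
}

/* pmu::put(), called by perf when the event lets go of the pmu */
static void foo_pmu_put(struct pmu *pmu)
{
	struct foo_pmu *fpmu = container_of(pmu, struct foo_pmu, base);

	kref_put(&fpmu->kref, foo_pmu_release);
}

/* driver unbind: unregister, then drop the driver's own reference */
static void foo_unbind(struct foo_pmu *fpmu)
{
	perf_pmu_unregister(&fpmu->base);
	kref_put(&fpmu->kref, foo_pmu_release);
}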
On Tue, Oct 08, 2024 at 01:34:59PM -0500, Lucas De Marchi wrote:
> If a pmu is unregistered while there's an active event, perf still
> accesses the pmu via event->pmu until the event itself is destroyed.
> This makes it difficult for drivers like i915 that can be unbound from
> the HW.
>
[...]
>
> The fix here is to provide a set of get/put hooks that drivers can
> implement to tie perf's pmu lifecycle to the driver's instance
> lifecycle. With this, perf_pmu_unregister() can be called by the
> driver, which is then responsible for freeing the resources.

I'm confused.. probably because I still don't have any clue about
drivers and the above isn't really telling me much either.

I don't see how you get rid of the try_module_get() we do per event;
without that you can't unload the module.

And I don't see how you think it is safe to free a pmu while there are
still events around.

Nor do I really see what these new get/put methods do. I see you call
->put() where we do module_put(), and ->get() near try_module_get(), but
how is that helping?
On Mon, Oct 14, 2024 at 07:32:46PM +0200, Peter Zijlstra wrote:
>On Tue, Oct 08, 2024 at 01:34:59PM -0500, Lucas De Marchi wrote:
[...]
>I'm confused.. probably because I still don't have any clue about
>drivers and the above isn't really telling me much either.
>
>I don't see how you get rid of the try_module_get() we do per event;
>without that you can't unload the module.

I don't get rid of the try_module_get(). They serve different purposes.
Having a reference to the module prevents the _module_ going away (and
hence the function pointers we call into from perf). It doesn't prevent
the module unbinding from the HW. A module may have N instances if it's
bound to N devices.

This can be done today to unbind the HW (integrated graphics) from the
i915 module:

	# echo 0000:00:02.0 > /sys/bus/pci/drivers/i915/unbind

The ref taken by these new get()/put() is related to preventing the
data going away - the driver can use it to take a ref on something
that will survive the unbind.

>
>And I don't see how you think it is safe to free a pmu while there are
>still events around.

so, we don't actually free it - the pmu is unregistered but the
`struct pmu` and (possibly) its container are still around after
unregister. When the get/put are used, the driver can keep the data
around, which is then free'd when the last reference is put.

>
>Nor do I really see what these new get/put methods do. I see you call
>->put() where we do module_put(), and ->get() near try_module_get(), but
>how is that helping?

Maybe the specific patches for i915 can help?
Patch series:
https://lore.kernel.org/intel-gfx/20241011225430.1219345-1-lucas.demarchi@intel.com/

Important patches here are patches 2 and 3:

- Subject: [PATCH 2/8] drm/i915/pmu: Let resource survive unbind

  Allow the final kfree() to happen at a different time, not necessarily
  together with the call to perf_pmu_unregister(). Here it uses
  drmm_add_action() to easily tie on the last drm ref going away.

- Subject: [PATCH 3/8] drm/i915/pmu: Fix crash due to use-after-free

  This implements the get()/put() so we get/put a reference to the drm
  dev.

These 2 patches for i915 are the equivalent of patch 4 in this series
for the dummy_pmu.

Lucas De Marchi
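For context, the drmm hook described in patch 2 could look roughly like
this (a sketch under the assumption of an i915-style struct i915_pmu
embedding struct pmu; this is not the actual patch, and perf_pmu_free()
is the new API from this series):

#include <drm/drm_managed.h>

/* Runs at the last drm_dev_put(), which may be long after unbind and
 * after perf_pmu_unregister(); names here are illustrative. */
static void i915_pmu_cleanup(struct drm_device *dev, void *arg)
{
	struct i915_pmu *pmu = arg;

	perf_pmu_free(&pmu->base);
	kfree(pmu);
}

static int i915_pmu_setup_cleanup(struct drm_device *dev, struct i915_pmu *pmu)
{
	/* tie the final free to the drm_device lifetime, not the bind */
	return drmm_add_action(dev, i915_pmu_cleanup, pmu);
}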
On Mon, Oct 14, 2024 at 01:20:34PM -0500, Lucas De Marchi wrote:
> On Mon, Oct 14, 2024 at 07:32:46PM +0200, Peter Zijlstra wrote:
> > I'm confused.. probably because I still don't have any clue about
> > drivers and the above isn't really telling me much either.
> >
> > I don't see how you get rid of the try_module_get() we do per event;
> > without that you can't unload the module.
>
> I don't get rid of the try_module_get(). They serve different purposes.
> Having a reference to the module prevents the _module_ going away (and
> hence the function pointers we call into from perf). It doesn't prevent
> the module unbinding from the HW. A module may have N instances if it's
> bound to N devices.
>
> This can be done today to unbind the HW (integrated graphics) from the
> i915 module:
>
> 	# echo 0000:00:02.0 > /sys/bus/pci/drivers/i915/unbind
>
> The ref taken by these new get()/put() is related to preventing the
> data going away - the driver can use it to take a ref on something
> that will survive the unbind.

OK, for some reason I thought to remember that you wanted to be able to
unload the module too.

> > And I don't see how you think it is safe to free a pmu while there are
> > still events around.
>
> so, we don't actually free it - the pmu is unregistered but the
> `struct pmu` and (possibly) its container are still around after
> unregister. When the get/put are used, the driver can keep the data
> around, which is then free'd when the last reference is put.

Aaaaah, okay. So the implementation knows to nop out all device
interaction when it gets unbound, but the events and pmu data stick
around until they're naturally killed off?

Ah, reading the below patches that is indeed what i915 does. pmu->closed
makes this so.

The dummy thing you posted in this thread, does perf_event_disable() on
all previously created events, and this is not sound. Userspace can do
PERF_EVENT_IOC_ENABLE on them and then things will go side-ways fast.
And I was afraid i915 was doing the same.

> - Subject: [PATCH 3/8] drm/i915/pmu: Fix crash due to use-after-free

So reading that Changelog, you would like a replacement for pmu->closed
as well.

I suppose, one way to go about doing this is to have
perf_pmu_unregister() replace a bunch of methods. Notably you have
pmu->closed in:

 - event_init()
 - read()
 - start()
 - stop()
 - add()

Having perf_pmu_unregister() overwrite those function pointers with
something akin to your pmu->closed would go a long way, right? It would
require using READ_ONCE() for calling the methods, which would make
things a little ugly :/

But I also think we want to force all the events into STATE_ERROR, and
I'm not immediately sure how best to go about doing that. Adding better
return value handling to ->add() is trivial enough, and perhaps calling
perf_pmu_resched() is sufficient to cycle everything.

Let me ponder that a little bit.
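A minimal sketch of the method-swapping idea floated above (illustrative
only, not code from any tree): perf_pmu_unregister() would swap the
driver's callbacks for nops, and callers would snapshot the pointer with
READ_ONCE() so a concurrent swap is harmless. pmu::read() really has
this signature; the nop/swap helpers are hypothetical.

static void perf_pmu_nop_read(struct perf_event *event) { }

static void pmu_revoke_methods(struct pmu *pmu)
{
	WRITE_ONCE(pmu->read, perf_pmu_nop_read);
	/* ...likewise for event_init(), add(), start(), stop() */
}

static void perf_pmu_read(struct perf_event *event)
{
	/* snapshot once; we call either the driver method or the nop */
	void (*read)(struct perf_event *event) = READ_ONCE(event->pmu->read);

	read(event);
}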
On Mon, Oct 14, 2024 at 09:25:19PM +0200, Peter Zijlstra wrote:

> Let me ponder that a little bit.

So I did the thing on top of the get/put thing that would allow you to
get rid of the ->closed thing, and before I was finished I already hated
all of it :-(

The thing is, if you're going to the effort of adding get/put and
putting the responsibility on the implementation, then the ->closed
thing is only a little extra ask.

So... I wondered, how hard it would be for perf_pmu_unregister() to do
what you want.

Time passed, hacks were done... and while what I have is still riddled
with holes (more work is definitely required), it does pass your
dummy_pmu test script.

I'll poke at this a little longer. Afaict it should be possible to make
this good enough for what you want. Just needs more holes filled.

---
 include/linux/perf_event.h |  13 ++-
 kernel/events/core.c       | 260 ++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 228 insertions(+), 45 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index fb908843f209..20995d33d27e 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -318,6 +318,9 @@ struct perf_output_handle;
 struct pmu {
 	struct list_head		entry;
 
+	spinlock_t			events_lock;
+	struct list_head		events;
+
 	struct module			*module;
 	struct device			*dev;
 	struct device			*parent;
@@ -612,9 +615,10 @@ struct perf_addr_filter_range {
  * enum perf_event_state - the states of an event:
  */
 enum perf_event_state {
-	PERF_EVENT_STATE_DEAD		= -4,
-	PERF_EVENT_STATE_EXIT		= -3,
-	PERF_EVENT_STATE_ERROR		= -2,
+	PERF_EVENT_STATE_DEAD		= -5,
+	PERF_EVENT_STATE_REVOKED	= -4, /* pmu gone, must not touch */
+	PERF_EVENT_STATE_EXIT		= -3, /* task died, still inherit */
+	PERF_EVENT_STATE_ERROR		= -2, /* scheduling error, can enable */
 	PERF_EVENT_STATE_OFF		= -1,
 	PERF_EVENT_STATE_INACTIVE	=  0,
 	PERF_EVENT_STATE_ACTIVE		=  1,
@@ -853,6 +857,7 @@ struct perf_event {
 	void *security;
 #endif
 	struct list_head		sb_list;
+	struct list_head		pmu_list;
 
 	/*
 	 * Certain events gets forwarded to another pmu internally by over-
@@ -1103,7 +1108,7 @@ extern void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags);
 extern void perf_event_itrace_started(struct perf_event *event);
 
 extern int perf_pmu_register(struct pmu *pmu, const char *name, int type);
-extern void perf_pmu_unregister(struct pmu *pmu);
+extern int perf_pmu_unregister(struct pmu *pmu);
 
 extern void __perf_event_task_sched_in(struct task_struct *prev,
 				       struct task_struct *task);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index cdd09769e6c5..e66827367a97 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2406,7 +2406,9 @@ ctx_time_update_event(struct perf_event_context *ctx, struct perf_event *event)
 
 #define DETACH_GROUP	0x01UL
 #define DETACH_CHILD	0x02UL
-#define DETACH_DEAD	0x04UL
+#define DETACH_EXIT	0x04UL
+#define DETACH_REVOKE	0x08UL
+#define DETACH_DEAD	0x10UL
 
 /*
  * Cross CPU call to remove a performance event
@@ -2421,6 +2423,7 @@ __perf_remove_from_context(struct perf_event *event,
 			   void *info)
 {
 	struct perf_event_pmu_context *pmu_ctx = event->pmu_ctx;
+	enum perf_event_state state = PERF_EVENT_STATE_OFF;
 	unsigned long flags = (unsigned long)info;
 
 	ctx_time_update(cpuctx, ctx);
@@ -2429,16 +2432,22 @@ __perf_remove_from_context(struct perf_event *event,
 	 * Ensure event_sched_out() switches to OFF, at the very least
 	 * this avoids raising perf_pending_task() at this time.
 	 */
-	if (flags & DETACH_DEAD)
+	if (flags & DETACH_EXIT)
+		state = PERF_EVENT_STATE_EXIT;
+	if (flags & DETACH_REVOKE)
+		state = PERF_EVENT_STATE_REVOKED;
+	if (flags & DETACH_DEAD) {
 		event->pending_disable = 1;
+		state = PERF_EVENT_STATE_DEAD;
+	}
 	event_sched_out(event, ctx);
 	if (flags & DETACH_GROUP)
 		perf_group_detach(event);
 	if (flags & DETACH_CHILD)
 		perf_child_detach(event);
 	list_del_event(event, ctx);
-	if (flags & DETACH_DEAD)
-		event->state = PERF_EVENT_STATE_DEAD;
+
+	event->state = state;
 
 	if (!pmu_ctx->nr_events) {
 		pmu_ctx->rotate_necessary = 0;
@@ -4508,7 +4517,8 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
 
 static void perf_remove_from_owner(struct perf_event *event);
 static void perf_event_exit_event(struct perf_event *event,
-				  struct perf_event_context *ctx);
+				  struct perf_event_context *ctx,
+				  bool revoke);
 
 /*
  * Removes all events from the current task that have been marked
@@ -4535,7 +4545,7 @@ static void perf_event_remove_on_exec(struct perf_event_context *ctx)
 
 		modified = true;
 
-		perf_event_exit_event(event, ctx);
+		perf_event_exit_event(event, ctx, false);
 	}
 
 	raw_spin_lock_irqsave(&ctx->lock, flags);
@@ -5121,6 +5131,7 @@ static bool is_sb_event(struct perf_event *event)
 	    attr->context_switch || attr->text_poke ||
 	    attr->bpf_event)
 		return true;
+
 	return false;
 }
 
@@ -5321,6 +5332,8 @@ static void perf_pending_task_sync(struct perf_event *event)
 
 static void _free_event(struct perf_event *event)
 {
+	struct pmu *pmu = event->pmu;
+
 	irq_work_sync(&event->pending_irq);
 	irq_work_sync(&event->pending_disable_irq);
 	perf_pending_task_sync(event);
@@ -5330,6 +5343,7 @@ static void _free_event(struct perf_event *event)
 	security_perf_event_free(event);
 
 	if (event->rb) {
+		WARN_ON_ONCE(!pmu);
 		/*
 		 * Can happen when we close an event with re-directed output.
 		 *
@@ -5349,12 +5363,16 @@ static void _free_event(struct perf_event *event)
 		put_callchain_buffers();
 	}
 
-	perf_event_free_bpf_prog(event);
-	perf_addr_filters_splice(event, NULL);
-	kfree(event->addr_filter_ranges);
+	if (pmu) {
+		perf_event_free_bpf_prog(event);
+		perf_addr_filters_splice(event, NULL);
+		kfree(event->addr_filter_ranges);
+	}
 
-	if (event->destroy)
+	if (event->destroy) {
+		WARN_ON_ONCE(!pmu);
 		event->destroy(event);
+	}
 
 	/*
 	 * Must be after ->destroy(), due to uprobe_perf_close() using
@@ -5363,8 +5381,10 @@ static void _free_event(struct perf_event *event)
 	if (event->hw.target)
 		put_task_struct(event->hw.target);
 
-	if (event->pmu_ctx)
+	if (event->pmu_ctx) {
+		WARN_ON_ONCE(!pmu);
 		put_pmu_ctx(event->pmu_ctx);
+	}
 
 	/*
 	 * perf_event_free_task() relies on put_ctx() being 'last', in particular
@@ -5373,8 +5393,14 @@ static void _free_event(struct perf_event *event)
 	if (event->ctx)
 		put_ctx(event->ctx);
 
-	exclusive_event_destroy(event);
-	module_put(event->pmu->module);
+	if (pmu) {
+		exclusive_event_destroy(event);
+		module_put(pmu->module);
+		scoped_guard(spinlock, &pmu->events_lock) {
+			list_del(&event->pmu_list);
+			wake_up_var(pmu);
+		}
+	}
 
 	call_rcu(&event->rcu_head, free_event_rcu);
 }
@@ -5492,7 +5518,11 @@ int perf_event_release_kernel(struct perf_event *event)
 	 * Thus this guarantees that we will in fact observe and kill _ALL_
 	 * child events.
 	 */
-	perf_remove_from_context(event, DETACH_GROUP|DETACH_DEAD);
+	if (event->state > PERF_EVENT_STATE_REVOKED) {
+		perf_remove_from_context(event, DETACH_GROUP|DETACH_DEAD);
+	} else {
+		event->state = PERF_EVENT_STATE_DEAD;
+	}
 
 	perf_event_ctx_unlock(event, ctx);
 
@@ -5803,7 +5833,7 @@ __perf_read(struct perf_event *event, char __user *buf, size_t count)
 	 * error state (i.e. because it was pinned but it couldn't be
 	 * scheduled on to the CPU at some point).
 	 */
-	if (event->state == PERF_EVENT_STATE_ERROR)
+	if (event->state <= PERF_EVENT_STATE_ERROR)
 		return 0;
 
 	if (count < event->read_size)
@@ -5842,8 +5872,14 @@ static __poll_t perf_poll(struct file *file, poll_table *wait)
 	struct perf_buffer *rb;
 	__poll_t events = EPOLLHUP;
 
+	if (event->state <= PERF_EVENT_STATE_REVOKED)
+		return EPOLLERR;
+
 	poll_wait(file, &event->waitq, wait);
 
+	if (event->state <= PERF_EVENT_STATE_REVOKED)
+		return EPOLLERR;
+
 	if (is_event_hup(event))
 		return events;
 
@@ -6013,7 +6049,7 @@ static inline int perf_fget_light(int fd, struct fd *p)
 }
 
 static int perf_event_set_output(struct perf_event *event,
-				 struct perf_event *output_event);
+				 struct perf_event *output_event, bool force);
static int perf_event_set_filter(struct perf_event *event, void __user *arg);
 static int perf_copy_attr(struct perf_event_attr __user *uattr,
 			  struct perf_event_attr *attr);
@@ -6023,6 +6059,9 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon
 	void (*func)(struct perf_event *);
 	u32 flags = arg;
 
+	if (event->state <= PERF_EVENT_STATE_REVOKED)
+		return -ENODEV;
+
 	switch (cmd) {
 	case PERF_EVENT_IOC_ENABLE:
 		func = _perf_event_enable;
@@ -6065,10 +6104,10 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon
 		if (ret)
 			return ret;
 		output_event = fd_file(output)->private_data;
-		ret = perf_event_set_output(event, output_event);
+		ret = perf_event_set_output(event, output_event, false);
 		fdput(output);
 	} else {
-		ret = perf_event_set_output(event, NULL);
+		ret = perf_event_set_output(event, NULL, false);
 	}
 	return ret;
 }
@@ -6472,6 +6511,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)
 	unsigned long size = perf_data_size(rb);
 	bool detach_rest = false;
 
+	/* FIXIES vs perf_pmu_unregister() */
 	if (event->pmu->event_unmapped)
 		event->pmu->event_unmapped(event, vma->vm_mm);
 
@@ -6591,7 +6631,15 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 	unsigned long vma_size;
 	unsigned long nr_pages;
 	long user_extra = 0, extra = 0;
-	int ret = 0, flags = 0;
+	int ret, flags = 0;
+
+	ret = security_perf_event_read(event);
+	if (ret)
+		return ret;
+
+	/* XXX needs event->mmap_mutex? */
+	if (event->state <= PERF_EVENT_STATE_REVOKED)
+		return -ENODEV;
 
 	/*
 	 * Don't allow mmap() of inherited per-task counters. This would
@@ -6604,10 +6652,6 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 	if (!(vma->vm_flags & VM_SHARED))
 		return -EINVAL;
 
-	ret = security_perf_event_read(event);
-	if (ret)
-		return ret;
-
 	vma_size = vma->vm_end - vma->vm_start;
 
 	if (vma->vm_pgoff == 0) {
@@ -6810,6 +6854,9 @@ static int perf_fasync(int fd, struct file *filp, int on)
 	struct perf_event *event = filp->private_data;
 	int retval;
 
+	if (event->state <= PERF_EVENT_STATE_REVOKED)
+		return -ENODEV;
+
 	inode_lock(inode);
 	retval = fasync_helper(fd, filp, on, &event->fasync);
 	inode_unlock(inode);
@@ -11513,6 +11560,7 @@ static int perf_event_idx_default(struct perf_event *event)
 
 static void free_pmu_context(struct pmu *pmu)
 {
+	free_percpu(pmu->pmu_disable_count);
 	free_percpu(pmu->cpu_pmu_context);
 }
 
@@ -11753,6 +11801,7 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
 	if (type >= 0)
 		max = type;
 
+	// XXX broken vs perf_init_event(), this publishes before @pmu is finalized
 	ret = idr_alloc(&pmu_idr, pmu, max, 0, GFP_KERNEL);
 	if (ret < 0)
 		goto free_pdc;
@@ -11809,8 +11858,14 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
 	if (!pmu->event_idx)
 		pmu->event_idx = perf_event_idx_default;
 
-	list_add_rcu(&pmu->entry, &pmus);
 	atomic_set(&pmu->exclusive_cnt, 0);
+	INIT_LIST_HEAD(&pmu->events);
+	spin_lock_init(&pmu->events_lock);
+
+	/*
+	 * Publish the pmu after it is fully initialized.
+	 */
+	list_add_rcu(&pmu->entry, &pmus);
 	ret = 0;
 unlock:
 	mutex_unlock(&pmus_lock);
@@ -11832,10 +11887,110 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
 }
 EXPORT_SYMBOL_GPL(perf_pmu_register);
 
-void perf_pmu_unregister(struct pmu *pmu)
+static void __pmu_detach_event(struct pmu *pmu, struct perf_event *event,
+			       struct perf_event_context *ctx)
 {
+	/*
+	 * De-schedule the event and mark it EXIT_PMU.
+	 */
+	perf_event_exit_event(event, ctx, true);
+
+	/*
+	 * All _free_event() bits that rely on event->pmu:
+	 */
+	perf_event_set_output(event, NULL, true);
+
+	perf_event_free_bpf_prog(event);
+	perf_addr_filters_splice(event, NULL);
+	kfree(event->addr_filter_ranges);
+
+	if (event->destroy) {
+		event->destroy(event);
+		event->destroy = NULL;
+	}
+
+	if (event->pmu_ctx) {
+		/*
+		 * Make sure pmu->cpu_pmu_context is unused. An alternative is
+		 * to have it be pointers and make put_pmu_ctx()'s
+		 * epc->embedded case be another RCU free case.
+		 */
+		put_pmu_ctx(event->pmu_ctx);
+		event->pmu_ctx = NULL;
+	}
+
+	exclusive_event_destroy(event);
+	module_put(pmu->module);
+
+	event->pmu = NULL; /* force fault instead of UAF */
+}
+
+static void pmu_detach_event(struct pmu *pmu, struct perf_event *event)
+{
+	struct perf_event_context *ctx;
+
+	ctx = perf_event_ctx_lock(event);
+	__pmu_detach_event(pmu, event, ctx);
+	perf_event_ctx_unlock(event, ctx);
+
+	scoped_guard(spinlock, &pmu->events_lock)
+		list_del(&event->pmu_list);
+}
+
+static struct perf_event *pmu_get_event(struct pmu *pmu)
+{
+	struct perf_event *event;
+
+	guard(spinlock)(&pmu->events_lock);
+	list_for_each_entry(event, &pmu->events, pmu_list) {
+		if (atomic_long_inc_not_zero(&event->refcount))
+			return event;
+	}
+
+	return NULL;
+}
+
+static bool pmu_empty(struct pmu *pmu)
+{
+	guard(spinlock)(&pmu->events_lock);
+	return list_empty(&pmu->events);
+}
+
+static void pmu_detach_events(struct pmu *pmu)
+{
+	struct perf_event *event;
+
+	for (;;) {
+		event = pmu_get_event(pmu);
+		if (!event)
+			break;
+
+		pmu_detach_event(pmu, event);
+		put_event(event);
+	}
+
+	/*
+	 * wait for pending _free_event()s
+	 */
+	wait_var_event(pmu, pmu_empty(pmu));
+}
+
+int perf_pmu_unregister(struct pmu *pmu)
+{
+	/*
+	 * FIXME!
+	 *
+	 * perf_mmap_close(); event->pmu->event_unmapped()
+	 *
+	 * XXX this check is racy vs perf_event_alloc()
+	 */
+	if (pmu->event_unmapped && !pmu_empty(pmu))
+		return -EBUSY;
+
 	mutex_lock(&pmus_lock);
 	list_del_rcu(&pmu->entry);
+	idr_remove(&pmu_idr, pmu->type);
+	mutex_unlock(&pmus_lock);
 
 	/*
 	 * We dereference the pmu list under both SRCU and regular RCU, so
@@ -11844,16 +11999,29 @@ void perf_pmu_unregister(struct pmu *pmu)
 	synchronize_srcu(&pmus_srcu);
 	synchronize_rcu();
 
-	free_percpu(pmu->pmu_disable_count);
-	idr_remove(&pmu_idr, pmu->type);
+	/*
+	 * PMU is removed from the pmus list, so no new events will
+	 * be created, now take care of the existing ones.
+	 */
+	pmu_detach_events(pmu);
+
+	/*
+	 * PMU is unused, make it go away.
+	 */
 	if (pmu_bus_running && pmu->dev && pmu->dev != PMU_NULL_DEV) {
 		if (pmu->nr_addr_filters)
 			device_remove_file(pmu->dev, &dev_attr_nr_addr_filters);
 		device_del(pmu->dev);
 		put_device(pmu->dev);
 	}
+
+	/*
+	 * XXX -- broken? can still contain SW events at this point?
+	 * audit more, make sure DETACH_GROUP DTRT
+	 */
 	free_pmu_context(pmu);
-	mutex_unlock(&pmus_lock);
+
+	return 0;
 }
 EXPORT_SYMBOL_GPL(perf_pmu_unregister);
 
@@ -12330,6 +12498,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 	/* symmetric to unaccount_event() in _free_event() */
 	account_event(event);
 
+	scoped_guard(spinlock, &pmu->events_lock)
+		list_add(&event->pmu_list, &pmu->events);
+
 	return event;
 
 err_callchain_buffer:
@@ -12493,7 +12664,7 @@ static void mutex_lock_double(struct mutex *a, struct mutex *b)
 }
 
 static int
-perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
+perf_event_set_output(struct perf_event *event, struct perf_event *output_event, bool force)
 {
 	struct perf_buffer *rb = NULL;
 	int ret = -EINVAL;
@@ -12549,8 +12720,12 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
 	mutex_lock_double(&event->mmap_mutex, &output_event->mmap_mutex);
 set:
 	/* Can't redirect output if we've got an active mmap() */
-	if (atomic_read(&event->mmap_count))
-		goto unlock;
+	if (atomic_read(&event->mmap_count)) {
+		if (!force)
+			goto unlock;
+
+		WARN_ON_ONCE(event->pmu->event_unmapped);
+	}
 
 	if (output_event) {
 		/* get the rb we want to redirect to */
@@ -12740,6 +12915,10 @@ SYSCALL_DEFINE5(perf_event_open,
 		if (err)
 			goto err_fd;
 		group_leader = fd_file(group)->private_data;
+		if (group_leader->state <= PERF_EVENT_STATE_REVOKED) {
+			err = -ENODEV;
+			goto err_group_fd;
+		}
 		if (flags & PERF_FLAG_FD_OUTPUT)
 			output_event = group_leader;
 		if (flags & PERF_FLAG_FD_NO_GROUP)
@@ -12916,7 +13095,7 @@ SYSCALL_DEFINE5(perf_event_open,
 	event->pmu_ctx = pmu_ctx;
 
 	if (output_event) {
-		err = perf_event_set_output(event, output_event);
+		err = perf_event_set_output(event, output_event, false);
 		if (err)
 			goto err_context;
 	}
@@ -13287,10 +13466,11 @@ static void sync_child_event(struct perf_event *child_event)
 }
 
 static void
-perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx)
+perf_event_exit_event(struct perf_event *event,
+		      struct perf_event_context *ctx, bool revoke)
 {
 	struct perf_event *parent_event = event->parent;
-	unsigned long detach_flags = 0;
+	unsigned long detach_flags = DETACH_EXIT;
 
 	if (parent_event) {
 		/*
@@ -13305,16 +13485,14 @@ perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx)
 		 * Do destroy all inherited groups, we don't care about those
 		 * and being thorough is better.
 		 */
-		detach_flags = DETACH_GROUP | DETACH_CHILD;
+		detach_flags |= DETACH_GROUP | DETACH_CHILD;
 		mutex_lock(&parent_event->child_mutex);
 	}
 
-	perf_remove_from_context(event, detach_flags);
+	if (revoke)
+		detach_flags |= DETACH_GROUP | DETACH_REVOKE;
 
-	raw_spin_lock_irq(&ctx->lock);
-	if (event->state > PERF_EVENT_STATE_EXIT)
-		perf_event_set_state(event, PERF_EVENT_STATE_EXIT);
-	raw_spin_unlock_irq(&ctx->lock);
+	perf_remove_from_context(event, detach_flags);
 
 	/*
 	 * Child events can be freed.
@@ -13390,7 +13568,7 @@ static void perf_event_exit_task_context(struct task_struct *child)
 	perf_event_task(child, child_ctx, 0);
 
 	list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
-		perf_event_exit_event(child_event, child_ctx);
+		perf_event_exit_event(child_event, child_ctx, false);
 
 	mutex_unlock(&child_ctx->mutex);
On Wed, Oct 16, 2024 at 02:03:02PM +0200, Peter Zijlstra wrote:
>On Mon, Oct 14, 2024 at 09:25:19PM +0200, Peter Zijlstra wrote:
>
>> Let me ponder that a little bit.
>
>So I did the thing on top of the get/put thing that would allow you to
>get rid of the ->closed thing, and before I was finished I already hated
>all of it :-(
>
>The thing is, if you're going to the effort of adding get/put and
>putting the responsibility on the implementation, then the ->closed
>thing is only a little extra ask.
>
>So... I wondered, how hard it would be for perf_pmu_unregister() to do
>what you want.
>
>Time passed, hacks were done... and while what I have is still riddled
>with holes (more work is definitely required), it does pass your
>dummy_pmu test script.
>
>I'll poke at this a little longer. Afaict it should be possible to make
>this good enough for what you want. Just needs more holes filled.
>
[...]
>
>diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
>index fb908843f209..20995d33d27e 100644
>--- a/include/linux/perf_event.h
>+++ b/include/linux/perf_event.h
>@@ -318,6 +318,9 @@ struct perf_output_handle;
> struct pmu {
> 	struct list_head		entry;
>
>+	spinlock_t			events_lock;
>+	struct list_head		events;

oh... I looked at several lists and was wondering which one would
contain the events of our pmu. Now I see we didn't have that :)

[...]

>+static void pmu_detach_events(struct pmu *pmu)
>+{
>+	struct perf_event *event;
>+
>+	for (;;) {
>+		event = pmu_get_event(pmu);
>+		if (!event)
>+			break;
>+
>+		pmu_detach_event(pmu, event);
>+		put_event(event);
>+	}
>+
>+	/*
>+	 * wait for pending _free_event()s
>+	 */
>+	wait_var_event(pmu, pmu_empty(pmu));

ok... so now we are synchronously moving the events to a revoked state
during unregister, so we wouldn't need the refcount on the driver side
anymore and can just free the objects right after return.

I will give this a try with i915 and/or xe.

thanks
Lucas De Marchi

[...]
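To make the resulting contract concrete, here is a minimal sketch
(hypothetical foo_* names, assuming Peter's synchronous detach lands as
drafted above) of what driver-side teardown then reduces to:

/* Sketch only: assumes perf_pmu_unregister() now detaches and revokes
 * every event before returning, per the patch above. */
static void foo_device_unbind(struct foo_device *fdev)
{
	/* returns only after all events are detached and revoked */
	perf_pmu_unregister(&fdev->pmu);

	/* perf no longer holds references into fdev; freeing is safe */
	kfree(fdev);
}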
On Fri, Oct 18, 2024 at 02:46:31PM -0500, Lucas De Marchi wrote:

> I will give this a try with i915 and/or xe.

Less horrible version here:

  git://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git perf/pmu-unregister

I've just pushed it out to the robots, but it builds, passes perf test
and your dummy_pmu testcase (for me, on my one testbox and .config
etc..)

I'll let it sit with the robots for a few days and if they don't
complain I'll post it.

Thanks!
On Tue, Oct 22, 2024 at 11:52:10PM +0200, Peter Zijlstra wrote:
>On Fri, Oct 18, 2024 at 02:46:31PM -0500, Lucas De Marchi wrote:
>
>> I will give this a try with i915 and/or xe.
>
>Less horrible version here:
>
>  git://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git perf/pmu-unregister
>
>I've just pushed it out to the robots, but it builds, passes perf test
>and your dummy_pmu testcase (for me, on my one testbox and .config
>etc..)

It passed for me as well with both dummy_pmu and with i915. I have some
changes to igt (i915/xe testsuite) that should bring some more coverage.
I minimized the pending test changes I had and posted to trigger CI:

https://patchwork.freedesktop.org/series/140355/

thanks
Lucas De Marchi

>
>I'll let it sit with the robots for a few days and if they don't
>complain I'll post it.
>
>Thanks!
On Wed, Oct 23, 2024 at 12:07:58AM -0500, Lucas De Marchi wrote:
>On Tue, Oct 22, 2024 at 11:52:10PM +0200, Peter Zijlstra wrote:
>>On Fri, Oct 18, 2024 at 02:46:31PM -0500, Lucas De Marchi wrote:
>>
>>>I will give this a try with i915 and/or xe.
>>
>>Less horrible version here:
>>
>>  git://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git perf/pmu-unregister
>>
>>I've just pushed it out to the robots, but it builds, passes perf test
>>and your dummy_pmu testcase (for me, on my one testbox and .config
>>etc..)
>
>It passed for me as well with both dummy_pmu and with i915. I have some
>changes to igt (i915/xe testsuite) that should bring some more coverage.
>I minimized the pending test changes I had and posted to trigger CI:
>
>https://patchwork.freedesktop.org/series/140355/

Our CI also didn't trigger that pmu issue and the test could run
successfully.

I did get a report from kernel test robot though:

https://lore.kernel.org/all/202410281436.8246527d-lkp@intel.com/

thanks
Lucas De Marchi
On Thu, Oct 31, 2024 at 12:07:42AM -0500, Lucas De Marchi wrote:
> On Wed, Oct 23, 2024 at 12:07:58AM -0500, Lucas De Marchi wrote:
> > On Tue, Oct 22, 2024 at 11:52:10PM +0200, Peter Zijlstra wrote:
> > > On Fri, Oct 18, 2024 at 02:46:31PM -0500, Lucas De Marchi wrote:
> > >
> > > > I will give this a try with i915 and/or xe.
> > >
> > > Less horrible version here:
> > >
> > >   git://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git perf/pmu-unregister
> > >
> > > I've just pushed it out to the robots, but it builds, passes perf test
> > > and your dummy_pmu testcase (for me, on my one testbox and .config
> > > etc..)
> >
> > It passed for me as well with both dummy_pmu and with i915. I have some
> > changes to igt (i915/xe testsuite) that should bring some more coverage.
> > I minimized the pending test changes I had and posted to trigger CI:
> >
> > https://patchwork.freedesktop.org/series/140355/
>
> Our CI also didn't trigger that pmu issue and the test could run
> successfully.

Excellent.

> I did get a report from kernel test robot though:
>
> https://lore.kernel.org/all/202410281436.8246527d-lkp@intel.com/

Yeah, I think I fixed that one, but the robot gifted me another one
that I still need to find time to address. I'm hoping this week.

I'll repost properly once I fix it.

Thanks!
On Mon, Oct 14, 2024 at 09:25:19PM +0200, Peter Zijlstra wrote:
>On Mon, Oct 14, 2024 at 01:20:34PM -0500, Lucas De Marchi wrote:
>> On Mon, Oct 14, 2024 at 07:32:46PM +0200, Peter Zijlstra wrote:
>
[...]
>
>OK, for some reason I thought to remember that you wanted to be able to
>unload the module too.

humn... maybe because in the first version we were talking about
shortcutting all the function calls. If made by the driver, it'd allow
us to remove the ugly `if (pmu->closed)`, and if made by perf itself,
it would allow us to drop the module ref since we would guarantee we
are not calling into the module anymore.

But I think that's orthogonal to what we really care about: once the HW
vanishes underneath us, stop accessing it and unregister the PMU in a
way that if the HW shows up again we can still register it and nothing
explodes.

>> so, we don't actually free it - the pmu is unregistered but the
>> `struct pmu` and (possibly) its container are still around after
>> unregister. When the get/put are used, the driver can keep the data
>> around, which is then free'd when the last reference is put.
>
>Aaaaah, okay. So the implementation knows to nop out all device
>interaction when it gets unbound, but the events and pmu data stick
>around until they're naturally killed off?
>
>Ah, reading the below patches that is indeed what i915 does. pmu->closed
>makes this so.

yes. Without these patches it doesn't work well though, particularly
because

a) even if we protected the methods with pmu->closed, the data is freed
   when we call perf_pmu_unregister(), making the whole pmu pointer
   invalid

b) kernel/events/core.c still accesses the pmu after calling
   event->destroy() - we can't really hook on that to destroy the pmu
   like is done today.

>
>The dummy thing you posted in this thread, does perf_event_disable() on

that's optional and I think we could live without. The main issue is
completely crashing the kernel if we do:

	# perf stat -e i915/rc6-residency/ -I1000 &
	# echo 0000:00:02.0 > /sys/bus/pci/drivers/i915/unbind

... which these patches are fixing regardless of calling
perf_event_disable().

>all previously created events, and this is not sound. Userspace can do
>PERF_EVENT_IOC_ENABLE on them and then things will go side-ways fast.
>And I was afraid i915 was doing the same.

For the i915 series, that would be "[PATCH 8/8] drm/i915/pmu: Release
open events when unregistering". See release_active_events()

I was just wanting a smoke signal/hint for userspace that "something
went wrong" with this counter.

>
>> - Subject: [PATCH 3/8] drm/i915/pmu: Fix crash due to use-after-free
>
>So reading that Changelog, you would like a replacement for pmu->closed
>as well.
>
>I suppose, one way to go about doing this is to have
>perf_pmu_unregister() replace a bunch of methods. Notably you have
>pmu->closed in:
>
> - event_init()
> - read()
> - start()
> - stop()
> - add()
>
>Having perf_pmu_unregister() overwrite those function pointers with
>something akin to your pmu->closed would go a long way, right? It would

it would help if we want to unload the module, to make it work for
other drivers without having to add a similar flag, and to make those
drivers less ugly with those checks.

However grepping the kernel for calls to perf_pmu_register(), it seems
there are only 3 candidates, all in drm: i915, amdgpu and xe (the xe
one is a pending patch series waiting on the resolution of this issue).

There is drivers/powercap/intel_rapl_common.c with its `bool registered`
flag, but that is basically doing multiple register/unregister to
replace the pmu rather than working with HW that can be removed.

>require using READ_ONCE() for calling the methods, which would make
>things a little ugly :/
>
>But I also think we want to force all the events into STATE_ERROR, and

yeah... When I was looking for the smoke signal I mentioned, I wanted
something to move the event into that state, but couldn't find one. The
best I found was to disable it.

>I'm not immediately sure how best to go about doing that. Adding better
>return value handling to ->add() is trivial enough, and perhaps calling
>perf_pmu_resched() is sufficient to cycle everything.
>
>Let me ponder that a little bit.

thanks for the help
Lucas De Marchi
Hi Lucas,

kernel test robot noticed the following build warnings:

[auto build test WARNING on perf-tools-next/perf-tools-next]
[cannot apply to tip/perf/core perf-tools/perf-tools acme/perf/core linus/master v6.12-rc2 next-20241008]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Lucas-De-Marchi/perf-Add-dummy-pmu-module/20241009-023728
base:   https://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git perf-tools-next
patch link:    https://lore.kernel.org/r/20241008183501.1354695-4-lucas.demarchi%40intel.com
patch subject: [PATCH 3/5] perf: Add pmu get/put
config: x86_64-kexec (https://download.01.org/0day-ci/archive/20241009/202410091848.aRUoRWWD-lkp@intel.com/config)
compiler: clang version 18.1.8 (https://github.com/llvm/llvm-project 3b5b5c1ec4a3095ab096dd780e84d7ab81f3d7ff)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20241009/202410091848.aRUoRWWD-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202410091848.aRUoRWWD-lkp@intel.com/

All warnings (new ones prefixed by >>):

>> kernel/events/core.c:5235:17: warning: unused variable 'module' [-Wunused-variable]
    5235 |         struct module *module;
         |                        ^~~~~~
   1 warning generated.


vim +/module +5235 kernel/events/core.c

  5232	
  5233	static void _free_event(struct perf_event *event)
  5234	{
> 5235		struct module *module;
  5236	
  5237		irq_work_sync(&event->pending_irq);
  5238		irq_work_sync(&event->pending_disable_irq);
  5239		perf_pending_task_sync(event);
  5240	
  5241		unaccount_event(event);
  5242	
  5243		security_perf_event_free(event);
  5244	
  5245		if (event->rb) {
  5246			/*
  5247			 * Can happen when we close an event with re-directed output.
  5248			 *
  5249			 * Since we have a 0 refcount, perf_mmap_close() will skip
  5250			 * over us; possibly making our ring_buffer_put() the last.
  5251			 */
  5252			mutex_lock(&event->mmap_mutex);
  5253			ring_buffer_attach(event, NULL);
  5254			mutex_unlock(&event->mmap_mutex);
  5255		}
  5256	
  5257		if (is_cgroup_event(event))
  5258			perf_detach_cgroup(event);
  5259	
  5260		if (!event->parent) {
  5261			if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
  5262				put_callchain_buffers();
  5263		}
  5264	
  5265		perf_event_free_bpf_prog(event);
  5266		perf_addr_filters_splice(event, NULL);
  5267		kfree(event->addr_filter_ranges);
  5268	
  5269		if (event->destroy)
  5270			event->destroy(event);
  5271	
  5272		/*
  5273		 * Must be after ->destroy(), due to uprobe_perf_close() using
  5274		 * hw.target.
  5275		 */
  5276		if (event->hw.target)
  5277			put_task_struct(event->hw.target);
  5278	
  5279		if (event->pmu_ctx)
  5280			put_pmu_ctx(event->pmu_ctx);
  5281	
  5282		/*
  5283		 * perf_event_free_task() relies on put_ctx() being 'last', in particular
  5284		 * all task references must be cleaned up.
  5285		 */
  5286		if (event->ctx)
  5287			put_ctx(event->ctx);
  5288	
  5289		exclusive_event_destroy(event);
  5290	
  5291		pmu_module_put(&event->pmu);
  5292	
  5293		call_rcu(&event->rcu_head, free_event_rcu);
  5294	}

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
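The warning flags the `module` local that the patch adds to
_free_event() but never uses, since pmu_module_put() now reads
event->pmu->module itself. A likely fixup (my assumption; no fixup was
posted in this thread) is simply dropping the local:

/* Likely fixup (assumption): pmu_module_put() takes &event->pmu and
 * dereferences the module pointer internally, so the unused local in
 * _free_event() can just be removed. */
static void _free_event(struct perf_event *event)
{
	irq_work_sync(&event->pending_irq);
	irq_work_sync(&event->pending_disable_irq);
	perf_pending_task_sync(event);
	/* ...rest of the function unchanged... */
}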