A signal is delivered by raising irq_work() which works from any context
including NMI. irq_work() can be delayed if the architecture does not
provide an interrupt vector. In order not to lose a signal, the signal
is injected via task_work during event_sched_out().
Instead of going via irq_work, the signal could be added directly via
task_work. The signal is sent to current and can be enqueued on its
return path to userland instead of triggering irq_work. A dummy IRQ is
required in the NMI case to ensure the task_work is handled before
returning to userland. For this, irq_work is used. An alternative would
be to just raise an interrupt like arch_send_call_function_single_ipi().
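
For orientation, the new overflow path then boils down to the following
(condensed from the diff below; context lines and unrelated hunks are
omitted):

	if (!event->pending_work) {
		event->pending_work = pending_id;
		local_inc(&event->ctx->nr_pending);
		/* Hold a reference until the task_work has run. */
		WARN_ON_ONCE(!atomic_long_inc_not_zero(&event->refcount));
		/* Deliver the SIGTRAP on current's return to userland. */
		task_work_add(current, &event->pending_task, TWA_RESUME);
		/*
		 * In the NMI case a dummy interrupt is required so the
		 * task_work is handled before returning to userland.
		 */
		if (in_nmi())
			irq_work_queue(&event->pending_irq);
	}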
During testing with `remove_on_exec' it became visible that the event
can be enqueued via NMI during execve(). The task_work must not be kept
because free_event() will complain later. Also the new task will not
have a sighandler installed.
Queue signal via task_work. Remove perf_event::pending_sigtrap and
use perf_event::pending_work instead. Raise irq_work in the NMI case
for a dummy interrupt. Remove the task_work if the event is freed.
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/perf_event.h | 3 +--
kernel/events/core.c | 45 +++++++++++++++++---------------------
2 files changed, 21 insertions(+), 27 deletions(-)
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index d2a15c0c6f8a9..24ac6765146c7 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -781,7 +781,6 @@ struct perf_event {
unsigned int pending_wakeup;
unsigned int pending_kill;
unsigned int pending_disable;
- unsigned int pending_sigtrap;
unsigned long pending_addr; /* SIGTRAP */
struct irq_work pending_irq;
struct callback_head pending_task;
@@ -959,7 +958,7 @@ struct perf_event_context {
struct rcu_head rcu_head;
/*
- * Sum (event->pending_sigtrap + event->pending_work)
+ * Sum (event->pending_work + event->pending_work)
*
* The SIGTRAP is targeted at ctx->task, as such it won't do changing
* that until the signal is delivered.
diff --git a/kernel/events/core.c b/kernel/events/core.c
index c7a0274c662c8..e9926baaa1587 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2283,21 +2283,6 @@ event_sched_out(struct perf_event *event, struct perf_event_context *ctx)
state = PERF_EVENT_STATE_OFF;
}
- if (event->pending_sigtrap) {
- bool dec = true;
-
- event->pending_sigtrap = 0;
- if (state != PERF_EVENT_STATE_OFF &&
- !event->pending_work) {
- event->pending_work = 1;
- dec = false;
- WARN_ON_ONCE(!atomic_long_inc_not_zero(&event->refcount));
- task_work_add(current, &event->pending_task, TWA_RESUME);
- }
- if (dec)
- local_dec(&event->ctx->nr_pending);
- }
-
perf_event_set_state(event, state);
if (!is_software_event(event))
@@ -6741,11 +6726,6 @@ static void __perf_pending_irq(struct perf_event *event)
* Yay, we hit home and are in the context of the event.
*/
if (cpu == smp_processor_id()) {
- if (event->pending_sigtrap) {
- event->pending_sigtrap = 0;
- perf_sigtrap(event);
- local_dec(&event->ctx->nr_pending);
- }
if (event->pending_disable) {
event->pending_disable = 0;
perf_event_disable_local(event);
@@ -9592,14 +9572,17 @@ static int __perf_event_overflow(struct perf_event *event,
if (regs)
pending_id = hash32_ptr((void *)instruction_pointer(regs)) ?: 1;
- if (!event->pending_sigtrap) {
- event->pending_sigtrap = pending_id;
+ if (!event->pending_work) {
+ event->pending_work = pending_id;
local_inc(&event->ctx->nr_pending);
- irq_work_queue(&event->pending_irq);
+ WARN_ON_ONCE(!atomic_long_inc_not_zero(&event->refcount));
+ task_work_add(current, &event->pending_task, TWA_RESUME);
+ if (in_nmi())
+ irq_work_queue(&event->pending_irq);
} else if (event->attr.exclude_kernel && valid_sample) {
/*
* Should not be able to return to user space without
- * consuming pending_sigtrap; with exceptions:
+ * consuming pending_work; with exceptions:
*
* 1. Where !exclude_kernel, events can overflow again
* in the kernel without returning to user space.
@@ -9609,7 +9592,7 @@ static int __perf_event_overflow(struct perf_event *event,
* To approximate progress (with false negatives),
* check 32-bit hash of the current IP.
*/
- WARN_ON_ONCE(event->pending_sigtrap != pending_id);
+ WARN_ON_ONCE(event->pending_work != pending_id);
}
event->pending_addr = 0;
@@ -13049,6 +13032,13 @@ static void sync_child_event(struct perf_event *child_event)
&parent_event->child_total_time_running);
}
+static bool task_work_cb_match(struct callback_head *cb, void *data)
+{
+ struct perf_event *event = container_of(cb, struct perf_event, pending_task);
+
+ return event == data;
+}
+
static void
perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx)
{
@@ -13088,6 +13078,11 @@ perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx)
* Kick perf_poll() for is_event_hup();
*/
perf_event_wakeup(parent_event);
+ if (event->pending_work &&
+ task_work_cancel_match(current, task_work_cb_match, event)) {
+ put_event(event);
+ local_dec(&event->ctx->nr_pending);
+ }
free_event(event);
put_event(parent_event);
return;
--
2.43.0
On Tue, 12 Mar 2024 at 19:08, Sebastian Andrzej Siewior
<bigeasy@linutronix.de> wrote:
>
> A signal is delivered by raising irq_work() which works from any context
> including NMI. irq_work() can be delayed if the architecture does not
> provide an interrupt vector. In order not to lose a signal, the signal
> is injected via task_work during event_sched_out().
>
> Instead of going via irq_work, the signal could be added directly via
> task_work. The signal is sent to current and can be enqueued on its
> return path to userland instead of triggering irq_work. A dummy IRQ is
> required in the NMI case to ensure the task_work is handled before
> returning to userland. For this, irq_work is used. An alternative would
> be to just raise an interrupt like arch_send_call_function_single_ipi().
>
> During testing with `remove_on_exec' it became visible that the event
> can be enqueued via NMI during execve(). The task_work must not be kept
> because free_event() will complain later. Also the new task will not
> have a sighandler installed.
>
> Queue signal via task_work. Remove perf_event::pending_sigtrap and
> use perf_event::pending_work instead. Raise irq_work in the NMI case
> for a dummy interrupt. Remove the task_work if the event is freed.
>
> Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
> ---
> include/linux/perf_event.h | 3 +--
> kernel/events/core.c | 45 +++++++++++++++++---------------------
> 2 files changed, 21 insertions(+), 27 deletions(-)
>
> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> index d2a15c0c6f8a9..24ac6765146c7 100644
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -781,7 +781,6 @@ struct perf_event {
> unsigned int pending_wakeup;
> unsigned int pending_kill;
> unsigned int pending_disable;
> - unsigned int pending_sigtrap;
> unsigned long pending_addr; /* SIGTRAP */
> struct irq_work pending_irq;
> struct callback_head pending_task;
> @@ -959,7 +958,7 @@ struct perf_event_context {
> struct rcu_head rcu_head;
>
> /*
> - * Sum (event->pending_sigtrap + event->pending_work)
> + * Sum (event->pending_work + event->pending_work)
> *
> * The SIGTRAP is targeted at ctx->task, as such it won't do changing
> * that until the signal is delivered.
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index c7a0274c662c8..e9926baaa1587 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -2283,21 +2283,6 @@ event_sched_out(struct perf_event *event, struct perf_event_context *ctx)
> state = PERF_EVENT_STATE_OFF;
> }
>
> - if (event->pending_sigtrap) {
> - bool dec = true;
> -
> - event->pending_sigtrap = 0;
> - if (state != PERF_EVENT_STATE_OFF &&
> - !event->pending_work) {
> - event->pending_work = 1;
> - dec = false;
> - WARN_ON_ONCE(!atomic_long_inc_not_zero(&event->refcount));
> - task_work_add(current, &event->pending_task, TWA_RESUME);
> - }
> - if (dec)
> - local_dec(&event->ctx->nr_pending);
> - }
> -
> perf_event_set_state(event, state);
>
> if (!is_software_event(event))
> @@ -6741,11 +6726,6 @@ static void __perf_pending_irq(struct perf_event *event)
> * Yay, we hit home and are in the context of the event.
> */
> if (cpu == smp_processor_id()) {
> - if (event->pending_sigtrap) {
> - event->pending_sigtrap = 0;
> - perf_sigtrap(event);
> - local_dec(&event->ctx->nr_pending);
> - }
> if (event->pending_disable) {
> event->pending_disable = 0;
> perf_event_disable_local(event);
> @@ -9592,14 +9572,17 @@ static int __perf_event_overflow(struct perf_event *event,
>
> if (regs)
> pending_id = hash32_ptr((void *)instruction_pointer(regs)) ?: 1;
> - if (!event->pending_sigtrap) {
> - event->pending_sigtrap = pending_id;
> + if (!event->pending_work) {
> + event->pending_work = pending_id;
> local_inc(&event->ctx->nr_pending);
> - irq_work_queue(&event->pending_irq);
> + WARN_ON_ONCE(!atomic_long_inc_not_zero(&event->refcount));
> + task_work_add(current, &event->pending_task, TWA_RESUME);
> + if (in_nmi())
> + irq_work_queue(&event->pending_irq);
Some brief code comments here would help avoid having to dig through
git history to understand this.
> } else if (event->attr.exclude_kernel && valid_sample) {
> /*
> * Should not be able to return to user space without
> - * consuming pending_sigtrap; with exceptions:
> + * consuming pending_work; with exceptions:
> *
> * 1. Where !exclude_kernel, events can overflow again
> * in the kernel without returning to user space.
> @@ -9609,7 +9592,7 @@ static int __perf_event_overflow(struct perf_event *event,
> * To approximate progress (with false negatives),
> * check 32-bit hash of the current IP.
> */
> - WARN_ON_ONCE(event->pending_sigtrap != pending_id);
> + WARN_ON_ONCE(event->pending_work != pending_id);
> }
>
> event->pending_addr = 0;
> @@ -13049,6 +13032,13 @@ static void sync_child_event(struct perf_event *child_event)
> &parent_event->child_total_time_running);
> }
>
> +static bool task_work_cb_match(struct callback_head *cb, void *data)
> +{
> + struct perf_event *event = container_of(cb, struct perf_event, pending_task);
> +
> + return event == data;
> +}
> +
> static void
> perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx)
> {
> @@ -13088,6 +13078,11 @@ perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx)
> * Kick perf_poll() for is_event_hup();
> */
> perf_event_wakeup(parent_event);
> + if (event->pending_work &&
> + task_work_cancel_match(current, task_work_cb_match, event)) {
A brief comment on which case this covers would be good.
> + put_event(event);
> + local_dec(&event->ctx->nr_pending);
> + }
> free_event(event);
> put_event(parent_event);
> return;
> --
> 2.43.0
>
On 2024-03-13 15:41:18 [+0100], Marco Elver wrote:
> > diff --git a/kernel/events/core.c b/kernel/events/core.c
> > index c7a0274c662c8..e9926baaa1587 100644
> > --- a/kernel/events/core.c
> > +++ b/kernel/events/core.c
> > @@ -9592,14 +9572,17 @@ static int __perf_event_overflow(struct perf_event *event,
> >
> > if (regs)
> > pending_id = hash32_ptr((void *)instruction_pointer(regs)) ?: 1;
> > - if (!event->pending_sigtrap) {
> > - event->pending_sigtrap = pending_id;
> > + if (!event->pending_work) {
> > + event->pending_work = pending_id;
> > local_inc(&event->ctx->nr_pending);
> > - irq_work_queue(&event->pending_irq);
> > + WARN_ON_ONCE(!atomic_long_inc_not_zero(&event->refcount));
> > + task_work_add(current, &event->pending_task, TWA_RESUME);
> > + if (in_nmi())
> > + irq_work_queue(&event->pending_irq);
>
> Some brief code comments here would help avoid having to dig through
> git history to understand this.
Sure.
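
For instance, something along these lines (the wording here is only an
illustration, not the actual follow-up patch):

	task_work_add(current, &event->pending_task, TWA_RESUME);
	/*
	 * The signal is delivered from the task_work on current's return
	 * to userland. In the NMI case a dummy interrupt is required to
	 * ensure the task_work is handled before returning to userland;
	 * irq_work is used for that.
	 */
	if (in_nmi())
		irq_work_queue(&event->pending_irq);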
> > } else if (event->attr.exclude_kernel && valid_sample) {
> > /*
> > * Should not be able to return to user space without
> > - * consuming pending_sigtrap; with exceptions:
> > + * consuming pending_work; with exceptions:
> > *
> > * 1. Where !exclude_kernel, events can overflow again
> > * in the kernel without returning to user space.
> > @@ -13049,6 +13032,13 @@ static void sync_child_event(struct perf_event *child_event)
> > &parent_event->child_total_time_running);
> > }
> >
> > +static bool task_work_cb_match(struct callback_head *cb, void *data)
> > +{
> > + struct perf_event *event = container_of(cb, struct perf_event, pending_task);
> > +
> > + return event == data;
> > +}
> > +
> > static void
> > perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx)
> > {
> > @@ -13088,6 +13078,11 @@ perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx)
> > * Kick perf_poll() for is_event_hup();
> > */
> > perf_event_wakeup(parent_event);
> > + if (event->pending_work &&
> > + task_work_cancel_match(current, task_work_cb_match, event)) {
>
> A brief comment on which case this covers would be good.
Okay.
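
Perhaps along these lines (again only an illustration of the intent,
not the actual wording of a follow-up):

	/*
	 * The event may still have a task_work pending, e.g. if it was
	 * enqueued from NMI during execve() with remove_on_exec. Cancel
	 * it here so free_event() does not complain later, and undo the
	 * reference and nr_pending accounting taken for it.
	 */
	if (event->pending_work &&
	    task_work_cancel_match(current, task_work_cb_match, event)) {
		put_event(event);
		local_dec(&event->ctx->nr_pending);
	}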
> > + put_event(event);
> > + local_dec(&event->ctx->nr_pending);
> > + }
> > free_event(event);
> > put_event(parent_event);
> > return;
Sebastian