A signal is delivered by raising irq_work() which works from any context
including NMI. irq_work() can be delayed if the architecture does not
provide an interrupt vector. In order not to lose a signal, the signal
is injected via task_work during event_sched_out().
Instead of going via irq_work, the signal could be added directly via
task_work. The signal is sent to current and can be enqueued on its
return path to userland instead of triggering irq_work. A dummy IRQ is
required in the NMI case to ensure the task_work is handled before
returning to userland. For this, irq_work is used. An alternative would
be to just raise an interrupt like arch_send_call_function_single_ipi().
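
For orientation, the new overflow path then boils down to the following
(condensed from the diff below; context lines and unrelated hunks are
omitted):

	if (!event->pending_work) {
		event->pending_work = pending_id;
		local_inc(&event->ctx->nr_pending);
		/* Hold a reference until the task_work has run. */
		WARN_ON_ONCE(!atomic_long_inc_not_zero(&event->refcount));
		/* Deliver the SIGTRAP on current's return to userland. */
		task_work_add(current, &event->pending_task, TWA_RESUME);
		/*
		 * In the NMI case a dummy interrupt is required so the
		 * task_work is handled before returning to userland.
		 */
		if (in_nmi())
			irq_work_queue(&event->pending_irq);
	}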
During testing with `remove_on_exec' it became visible that the event
can be enqueued via NMI during execve(). The task_work must not be kept
because free_event() will complain later. Also the new task will not
have a sighandler installed.
Queue signal via task_work. Remove perf_event::pending_sigtrap and
use perf_event::pending_work instead. Raise irq_work in the NMI case
for a dummy interrupt. Remove the task_work if the event is freed.
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/perf_event.h | 3 +--
kernel/events/core.c | 45 +++++++++++++++++---------------------
2 files changed, 21 insertions(+), 27 deletions(-)
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index d2a15c0c6f8a9..24ac6765146c7 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -781,7 +781,6 @@ struct perf_event {
unsigned int pending_wakeup;
unsigned int pending_kill;
unsigned int pending_disable;
- unsigned int pending_sigtrap;
unsigned long pending_addr; /* SIGTRAP */
struct irq_work pending_irq;
struct callback_head pending_task;
@@ -959,7 +958,7 @@ struct perf_event_context {
struct rcu_head rcu_head;
/*
- * Sum (event->pending_sigtrap + event->pending_work)
+ * Sum (event->pending_work + event->pending_work)
*
* The SIGTRAP is targeted at ctx->task, as such it won't do changing
* that until the signal is delivered.
diff --git a/kernel/events/core.c b/kernel/events/core.c
index c7a0274c662c8..e9926baaa1587 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2283,21 +2283,6 @@ event_sched_out(struct perf_event *event, struct perf_event_context *ctx)
state = PERF_EVENT_STATE_OFF;
}
- if (event->pending_sigtrap) {
- bool dec = true;
-
- event->pending_sigtrap = 0;
- if (state != PERF_EVENT_STATE_OFF &&
- !event->pending_work) {
- event->pending_work = 1;
- dec = false;
- WARN_ON_ONCE(!atomic_long_inc_not_zero(&event->refcount));
- task_work_add(current, &event->pending_task, TWA_RESUME);
- }
- if (dec)
- local_dec(&event->ctx->nr_pending);
- }
-
perf_event_set_state(event, state);
if (!is_software_event(event))
@@ -6741,11 +6726,6 @@ static void __perf_pending_irq(struct perf_event *event)
* Yay, we hit home and are in the context of the event.
*/
if (cpu == smp_processor_id()) {
- if (event->pending_sigtrap) {
- event->pending_sigtrap = 0;
- perf_sigtrap(event);
- local_dec(&event->ctx->nr_pending);
- }
if (event->pending_disable) {
event->pending_disable = 0;
perf_event_disable_local(event);
@@ -9592,14 +9572,17 @@ static int __perf_event_overflow(struct perf_event *event,
if (regs)
pending_id = hash32_ptr((void *)instruction_pointer(regs)) ?: 1;
- if (!event->pending_sigtrap) {
- event->pending_sigtrap = pending_id;
+ if (!event->pending_work) {
+ event->pending_work = pending_id;
local_inc(&event->ctx->nr_pending);
- irq_work_queue(&event->pending_irq);
+ WARN_ON_ONCE(!atomic_long_inc_not_zero(&event->refcount));
+ task_work_add(current, &event->pending_task, TWA_RESUME);
+ if (in_nmi())
+ irq_work_queue(&event->pending_irq);
} else if (event->attr.exclude_kernel && valid_sample) {
/*
* Should not be able to return to user space without
- * consuming pending_sigtrap; with exceptions:
+ * consuming pending_work; with exceptions:
*
* 1. Where !exclude_kernel, events can overflow again
* in the kernel without returning to user space.
@@ -9609,7 +9592,7 @@ static int __perf_event_overflow(struct perf_event *event,
* To approximate progress (with false negatives),
* check 32-bit hash of the current IP.
*/
- WARN_ON_ONCE(event->pending_sigtrap != pending_id);
+ WARN_ON_ONCE(event->pending_work != pending_id);
}
event->pending_addr = 0;
@@ -13049,6 +13032,13 @@ static void sync_child_event(struct perf_event *child_event)
&parent_event->child_total_time_running);
}
+static bool task_work_cb_match(struct callback_head *cb, void *data)
+{
+ struct perf_event *event = container_of(cb, struct perf_event, pending_task);
+
+ return event == data;
+}
+
static void
perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx)
{
@@ -13088,6 +13078,11 @@ perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx)
* Kick perf_poll() for is_event_hup();
*/
perf_event_wakeup(parent_event);
+ if (event->pending_work &&
+ task_work_cancel_match(current, task_work_cb_match, event)) {
+ put_event(event);
+ local_dec(&event->ctx->nr_pending);
+ }
free_event(event);
put_event(parent_event);
return;
--
2.43.0
On Tue, 12 Mar 2024 at 19:08, Sebastian Andrzej Siewior
<bigeasy@linutronix.de> wrote:
>
> A signal is delivered by raising irq_work() which works from any context
> including NMI. irq_work() can be delayed if the architecture does not
> provide an interrupt vector. In order not to lose a signal, the signal
> is injected via task_work during event_sched_out().
>
> Instead of going via irq_work, the signal could be added directly via
> task_work. The signal is sent to current and can be enqueued on its
> return path to userland instead of triggering irq_work. A dummy IRQ is
> required in the NMI case to ensure the task_work is handled before
> returning to userland. For this, irq_work is used. An alternative would
> be to just raise an interrupt like arch_send_call_function_single_ipi().
>
> During testing with `remove_on_exec' it became visible that the event
> can be enqueued via NMI during execve(). The task_work must not be kept
> because free_event() will complain later. Also the new task will not
> have a sighandler installed.
>
> Queue signal via task_work. Remove perf_event::pending_sigtrap and
> use perf_event::pending_work instead. Raise irq_work in the NMI case
> for a dummy interrupt. Remove the task_work if the event is freed.
>
> Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
> ---
> include/linux/perf_event.h | 3 +--
> kernel/events/core.c | 45 +++++++++++++++++---------------------
> 2 files changed, 21 insertions(+), 27 deletions(-)
>
> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> index d2a15c0c6f8a9..24ac6765146c7 100644
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -781,7 +781,6 @@ struct perf_event {
> unsigned int pending_wakeup;
> unsigned int pending_kill;
> unsigned int pending_disable;
> - unsigned int pending_sigtrap;
> unsigned long pending_addr; /* SIGTRAP */
> struct irq_work pending_irq;
> struct callback_head pending_task;
> @@ -959,7 +958,7 @@ struct perf_event_context {
> struct rcu_head rcu_head;
>
> /*
> - * Sum (event->pending_sigtrap + event->pending_work)
> + * Sum (event->pending_work + event->pending_work)
> *
> * The SIGTRAP is targeted at ctx->task, as such it won't do changing
> * that until the signal is delivered.
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index c7a0274c662c8..e9926baaa1587 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -2283,21 +2283,6 @@ event_sched_out(struct perf_event *event, struct perf_event_context *ctx)
> state = PERF_EVENT_STATE_OFF;
> }
>
> - if (event->pending_sigtrap) {
> - bool dec = true;
> -
> - event->pending_sigtrap = 0;
> - if (state != PERF_EVENT_STATE_OFF &&
> - !event->pending_work) {
> - event->pending_work = 1;
> - dec = false;
> - WARN_ON_ONCE(!atomic_long_inc_not_zero(&event->refcount));
> - task_work_add(current, &event->pending_task, TWA_RESUME);
> - }
> - if (dec)
> - local_dec(&event->ctx->nr_pending);
> - }
> -
> perf_event_set_state(event, state);
>
> if (!is_software_event(event))
> @@ -6741,11 +6726,6 @@ static void __perf_pending_irq(struct perf_event *event)
> * Yay, we hit home and are in the context of the event.
> */
> if (cpu == smp_processor_id()) {
> - if (event->pending_sigtrap) {
> - event->pending_sigtrap = 0;
> - perf_sigtrap(event);
> - local_dec(&event->ctx->nr_pending);
> - }
> if (event->pending_disable) {
> event->pending_disable = 0;
> perf_event_disable_local(event);
> @@ -9592,14 +9572,17 @@ static int __perf_event_overflow(struct perf_event *event,
>
> if (regs)
> pending_id = hash32_ptr((void *)instruction_pointer(regs)) ?: 1;
> - if (!event->pending_sigtrap) {
> - event->pending_sigtrap = pending_id;
> + if (!event->pending_work) {
> + event->pending_work = pending_id;
> local_inc(&event->ctx->nr_pending);
> - irq_work_queue(&event->pending_irq);
> + WARN_ON_ONCE(!atomic_long_inc_not_zero(&event->refcount));
> + task_work_add(current, &event->pending_task, TWA_RESUME);
> + if (in_nmi())
> + irq_work_queue(&event->pending_irq);
Some brief code comments here would help avoid having to dig through
git history to understand this.
> } else if (event->attr.exclude_kernel && valid_sample) {
> /*
> * Should not be able to return to user space without
> - * consuming pending_sigtrap; with exceptions:
> + * consuming pending_work; with exceptions:
> *
> * 1. Where !exclude_kernel, events can overflow again
> * in the kernel without returning to user space.
> @@ -9609,7 +9592,7 @@ static int __perf_event_overflow(struct perf_event *event,
> * To approximate progress (with false negatives),
> * check 32-bit hash of the current IP.
> */
> - WARN_ON_ONCE(event->pending_sigtrap != pending_id);
> + WARN_ON_ONCE(event->pending_work != pending_id);
> }
>
> event->pending_addr = 0;
> @@ -13049,6 +13032,13 @@ static void sync_child_event(struct perf_event *child_event)
> &parent_event->child_total_time_running);
> }
>
> +static bool task_work_cb_match(struct callback_head *cb, void *data)
> +{
> + struct perf_event *event = container_of(cb, struct perf_event, pending_task);
> +
> + return event == data;
> +}
> +
> static void
> perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx)
> {
> @@ -13088,6 +13078,11 @@ perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx)
> * Kick perf_poll() for is_event_hup();
> */
> perf_event_wakeup(parent_event);
> + if (event->pending_work &&
> + task_work_cancel_match(current, task_work_cb_match, event)) {
A brief comment on which case this covers would be good.
> + put_event(event);
> + local_dec(&event->ctx->nr_pending);
> + }
> free_event(event);
> put_event(parent_event);
> return;
> --
> 2.43.0
>
On 2024-03-13 15:41:18 [+0100], Marco Elver wrote:
> > diff --git a/kernel/events/core.c b/kernel/events/core.c
> > index c7a0274c662c8..e9926baaa1587 100644
> > --- a/kernel/events/core.c
> > +++ b/kernel/events/core.c
> > @@ -9592,14 +9572,17 @@ static int __perf_event_overflow(struct perf_event *event,
> >
> > if (regs)
> > pending_id = hash32_ptr((void *)instruction_pointer(regs)) ?: 1;
> > - if (!event->pending_sigtrap) {
> > - event->pending_sigtrap = pending_id;
> > + if (!event->pending_work) {
> > + event->pending_work = pending_id;
> > local_inc(&event->ctx->nr_pending);
> > - irq_work_queue(&event->pending_irq);
> > + WARN_ON_ONCE(!atomic_long_inc_not_zero(&event->refcount));
> > + task_work_add(current, &event->pending_task, TWA_RESUME);
> > + if (in_nmi())
> > + irq_work_queue(&event->pending_irq);
>
> Some brief code comments here would help avoid having to dig through
> git history to understand this.
Sure.
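
For instance, something along these lines (the wording here is only an
illustration, not the actual follow-up patch):

	task_work_add(current, &event->pending_task, TWA_RESUME);
	/*
	 * The signal is delivered from the task_work on current's return
	 * to userland. In the NMI case a dummy interrupt is required to
	 * ensure the task_work is handled before returning to userland;
	 * irq_work is used for that.
	 */
	if (in_nmi())
		irq_work_queue(&event->pending_irq);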
> > } else if (event->attr.exclude_kernel && valid_sample) {
> > /*
> > * Should not be able to return to user space without
> > - * consuming pending_sigtrap; with exceptions:
> > + * consuming pending_work; with exceptions:
> > *
> > * 1. Where !exclude_kernel, events can overflow again
> > * in the kernel without returning to user space.
> > @@ -13049,6 +13032,13 @@ static void sync_child_event(struct perf_event *child_event)
> > &parent_event->child_total_time_running);
> > }
> >
> > +static bool task_work_cb_match(struct callback_head *cb, void *data)
> > +{
> > + struct perf_event *event = container_of(cb, struct perf_event, pending_task);
> > +
> > + return event == data;
> > +}
> > +
> > static void
> > perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx)
> > {
> > @@ -13088,6 +13078,11 @@ perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx)
> > * Kick perf_poll() for is_event_hup();
> > */
> > perf_event_wakeup(parent_event);
> > + if (event->pending_work &&
> > + task_work_cancel_match(current, task_work_cb_match, event)) {
>
> A brief comment on which case this covers would be good.
Okay.
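
Perhaps along these lines (again only an illustration of the intent,
not the actual wording of a follow-up):

	/*
	 * The event may still have a task_work pending, e.g. if it was
	 * enqueued from NMI during execve() with remove_on_exec. Cancel
	 * it here so free_event() does not complain later, and undo the
	 * reference and nr_pending accounting taken for it.
	 */
	if (event->pending_work &&
	    task_work_cancel_match(current, task_work_cb_match, event)) {
		put_event(event);
		local_dec(&event->ctx->nr_pending);
	}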
> > + put_event(event);
> > + local_dec(&event->ctx->nr_pending);
> > + }
> > free_event(event);
> > put_event(parent_event);
> > return;
Sebastian