[PATCH 01/13] perf: Simplify perf_event_alloc() error path

Peter Zijlstra posted 13 patches 2 years, 1 month ago
[PATCH 01/13] perf: Simplify perf_event_alloc() error path
Posted by Peter Zijlstra 2 years, 1 month ago
The error cleanup sequence in perf_event_alloc() is a subset of the
existing _free_event() function (it must of course be).

Split this out into __free_event() and simplify the error path.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/perf_event.h |    1 
 kernel/events/core.c       |  129 ++++++++++++++++++++++-----------------------
 2 files changed, 66 insertions(+), 64 deletions(-)

--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -634,6 +634,7 @@ struct swevent_hlist {
 #define PERF_ATTACH_ITRACE	0x10
 #define PERF_ATTACH_SCHED_CB	0x20
 #define PERF_ATTACH_CHILD	0x40
+#define PERF_ATTACH_EXCLUSIVE	0x80
 
 struct bpf_prog;
 struct perf_cgroup;
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5094,6 +5094,8 @@ static int exclusive_event_init(struct p
 			return -EBUSY;
 	}
 
+	event->attach_state |= PERF_ATTACH_EXCLUSIVE;
+
 	return 0;
 }
 
@@ -5101,14 +5103,13 @@ static void exclusive_event_destroy(stru
 {
 	struct pmu *pmu = event->pmu;
 
-	if (!is_exclusive_pmu(pmu))
-		return;
-
 	/* see comment in exclusive_event_init() */
 	if (event->attach_state & PERF_ATTACH_TASK)
 		atomic_dec(&pmu->exclusive_cnt);
 	else
 		atomic_inc(&pmu->exclusive_cnt);
+
+	event->attach_state &= ~PERF_ATTACH_EXCLUSIVE;
 }
 
 static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
@@ -5143,38 +5144,22 @@ static bool exclusive_event_installable(
 static void perf_addr_filters_splice(struct perf_event *event,
 				       struct list_head *head);
 
-static void _free_event(struct perf_event *event)
+/* vs perf_event_alloc() error */
+static void __free_event(struct perf_event *event)
 {
-	irq_work_sync(&event->pending_irq);
-
-	unaccount_event(event);
-
-	security_perf_event_free(event);
-
-	if (event->rb) {
-		/*
-		 * Can happen when we close an event with re-directed output.
-		 *
-		 * Since we have a 0 refcount, perf_mmap_close() will skip
-		 * over us; possibly making our ring_buffer_put() the last.
-		 */
-		mutex_lock(&event->mmap_mutex);
-		ring_buffer_attach(event, NULL);
-		mutex_unlock(&event->mmap_mutex);
-	}
-
-	if (is_cgroup_event(event))
-		perf_detach_cgroup(event);
-
 	if (!event->parent) {
 		if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
 			put_callchain_buffers();
 	}
 
-	perf_event_free_bpf_prog(event);
-	perf_addr_filters_splice(event, NULL);
 	kfree(event->addr_filter_ranges);
 
+	if (event->attach_state & PERF_ATTACH_EXCLUSIVE)
+		exclusive_event_destroy(event);
+
+	if (is_cgroup_event(event))
+		perf_detach_cgroup(event);
+
 	if (event->destroy)
 		event->destroy(event);
 
@@ -5185,22 +5170,56 @@ static void _free_event(struct perf_even
 	if (event->hw.target)
 		put_task_struct(event->hw.target);
 
-	if (event->pmu_ctx)
+	if (event->pmu_ctx) {
+		/*
+		 * put_pmu_ctx() needs an event->ctx reference, because of
+		 * epc->ctx.
+		 */
+		WARN_ON_ONCE(!event->ctx);
+		WARN_ON_ONCE(event->pmu_ctx->ctx != event->ctx);
 		put_pmu_ctx(event->pmu_ctx);
+	}
 
 	/*
-	 * perf_event_free_task() relies on put_ctx() being 'last', in particular
-	 * all task references must be cleaned up.
+	 * perf_event_free_task() relies on put_ctx() being 'last', in
+	 * particular all task references must be cleaned up.
 	 */
 	if (event->ctx)
 		put_ctx(event->ctx);
 
-	exclusive_event_destroy(event);
-	module_put(event->pmu->module);
+	if (event->pmu)
+		module_put(event->pmu->module);
 
 	call_rcu(&event->rcu_head, free_event_rcu);
 }
 
+/* vs perf_event_alloc() success */
+static void _free_event(struct perf_event *event)
+{
+	irq_work_sync(&event->pending_irq);
+
+	unaccount_event(event);
+
+	security_perf_event_free(event);
+
+	if (event->rb) {
+		/*
+		 * Can happen when we close an event with re-directed output.
+		 *
+		 * Since we have a 0 refcount, perf_mmap_close() will skip
+		 * over us; possibly making our ring_buffer_put() the last.
+		 */
+		mutex_lock(&event->mmap_mutex);
+		ring_buffer_attach(event, NULL);
+		mutex_unlock(&event->mmap_mutex);
+	}
+
+	perf_event_free_bpf_prog(event);
+	perf_addr_filters_splice(event, NULL);
+
+	__free_event(event);
+}
+
 /*
  * Used to free events which have a known refcount of 1, such as in error paths
  * where the event isn't exposed yet and inherited events.
@@ -11591,8 +11610,10 @@ static int perf_try_init_event(struct pm
 			event->destroy(event);
 	}
 
-	if (ret)
+	if (ret) {
+		event->pmu = NULL;
 		module_put(pmu->module);
+	}
 
 	return ret;
 }
@@ -11918,7 +11939,7 @@ perf_event_alloc(struct perf_event_attr
 	 * See perf_output_read().
 	 */
 	if (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ))
-		goto err_ns;
+		goto err;
 
 	if (!has_branch_stack(event))
 		event->attr.branch_sample_type = 0;
@@ -11926,7 +11947,7 @@ perf_event_alloc(struct perf_event_attr
 	pmu = perf_init_event(event);
 	if (IS_ERR(pmu)) {
 		err = PTR_ERR(pmu);
-		goto err_ns;
+		goto err;
 	}
 
 	/*
@@ -11936,24 +11957,24 @@ perf_event_alloc(struct perf_event_attr
 	 */
 	if (pmu->task_ctx_nr == perf_invalid_context && (task || cgroup_fd != -1)) {
 		err = -EINVAL;
-		goto err_pmu;
+		goto err;
 	}
 
 	if (event->attr.aux_output &&
 	    !(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT)) {
 		err = -EOPNOTSUPP;
-		goto err_pmu;
+		goto err;
 	}
 
 	if (cgroup_fd != -1) {
 		err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
 		if (err)
-			goto err_pmu;
+			goto err;
 	}
 
 	err = exclusive_event_init(event);
 	if (err)
-		goto err_pmu;
+		goto err;
 
 	if (has_addr_filter(event)) {
 		event->addr_filter_ranges = kcalloc(pmu->nr_addr_filters,
@@ -11961,7 +11982,7 @@ perf_event_alloc(struct perf_event_attr
 						    GFP_KERNEL);
 		if (!event->addr_filter_ranges) {
 			err = -ENOMEM;
-			goto err_per_task;
+			goto err;
 		}
 
 		/*
@@ -11986,41 +12007,21 @@ perf_event_alloc(struct perf_event_attr
 		if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
 			err = get_callchain_buffers(attr->sample_max_stack);
 			if (err)
-				goto err_addr_filters;
+				goto err;
 		}
 	}
 
 	err = security_perf_event_alloc(event);
 	if (err)
-		goto err_callchain_buffer;
+		goto err;
 
 	/* symmetric to unaccount_event() in _free_event() */
 	account_event(event);
 
 	return event;
 
-err_callchain_buffer:
-	if (!event->parent) {
-		if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
-			put_callchain_buffers();
-	}
-err_addr_filters:
-	kfree(event->addr_filter_ranges);
-
-err_per_task:
-	exclusive_event_destroy(event);
-
-err_pmu:
-	if (is_cgroup_event(event))
-		perf_detach_cgroup(event);
-	if (event->destroy)
-		event->destroy(event);
-	module_put(pmu->module);
-err_ns:
-	if (event->hw.target)
-		put_task_struct(event->hw.target);
-	call_rcu(&event->rcu_head, free_event_rcu);
-
+err:
+	__free_event(event);
 	return ERR_PTR(err);
 }
Re: [PATCH 01/13] perf: Simplify perf_event_alloc() error path
Posted by Jiri Olsa 2 years, 1 month ago
On Thu, Nov 02, 2023 at 04:09:20PM +0100, Peter Zijlstra wrote:

SNIP

> @@ -11936,24 +11957,24 @@ perf_event_alloc(struct perf_event_attr
>  	 */
>  	if (pmu->task_ctx_nr == perf_invalid_context && (task || cgroup_fd != -1)) {
>  		err = -EINVAL;
> -		goto err_pmu;
> +		goto err;
>  	}
>  
>  	if (event->attr.aux_output &&
>  	    !(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT)) {
>  		err = -EOPNOTSUPP;
> -		goto err_pmu;
> +		goto err;
>  	}
>  
>  	if (cgroup_fd != -1) {
>  		err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
>  		if (err)
> -			goto err_pmu;
> +			goto err;
>  	}
>  
>  	err = exclusive_event_init(event);
>  	if (err)
> -		goto err_pmu;
> +		goto err;
>  
>  	if (has_addr_filter(event)) {
>  		event->addr_filter_ranges = kcalloc(pmu->nr_addr_filters,
> @@ -11961,7 +11982,7 @@ perf_event_alloc(struct perf_event_attr
>  						    GFP_KERNEL);
>  		if (!event->addr_filter_ranges) {
>  			err = -ENOMEM;
> -			goto err_per_task;
> +			goto err;
>  		}
>  
>  		/*
> @@ -11986,41 +12007,21 @@ perf_event_alloc(struct perf_event_attr
>  		if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
>  			err = get_callchain_buffers(attr->sample_max_stack);
>  			if (err)
> -				goto err_addr_filters;
> +				goto err;
>  		}
>  	}
>  
>  	err = security_perf_event_alloc(event);
>  	if (err)
> -		goto err_callchain_buffer;
> +		goto err;
>  
>  	/* symmetric to unaccount_event() in _free_event() */
>  	account_event(event);
>  
>  	return event;
>  
> -err_callchain_buffer:
> -	if (!event->parent) {
> -		if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
> -			put_callchain_buffers();
> -	}

hum, so this is now called all the time via __free_event, but it should
be called only if we passed get_callchain_buffers call.. this could screw
up nr_callchain_events number eventually no?

jirka

> -err_addr_filters:
> -	kfree(event->addr_filter_ranges);
> -
> -err_per_task:
> -	exclusive_event_destroy(event);
> -
> -err_pmu:
> -	if (is_cgroup_event(event))
> -		perf_detach_cgroup(event);
> -	if (event->destroy)
> -		event->destroy(event);
> -	module_put(pmu->module);
> -err_ns:
> -	if (event->hw.target)
> -		put_task_struct(event->hw.target);
> -	call_rcu(&event->rcu_head, free_event_rcu);
> -
> +err:
> +	__free_event(event);
>  	return ERR_PTR(err);
>  }
>  
> 
>
Re: [PATCH 01/13] perf: Simplify perf_event_alloc() error path
Posted by Peter Zijlstra 2 years, 1 month ago
On Fri, Nov 03, 2023 at 01:38:05PM +0100, Jiri Olsa wrote:

> > -err_callchain_buffer:
> > -	if (!event->parent) {
> > -		if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
> > -			put_callchain_buffers();
> > -	}
> 
> hum, so this is now called all the time via __free_event, but it should
> be called only if we passed get_callchain_buffers call.. this could screw
> up nr_callchain_events number eventually no?

Yes, good catch, thanks!

Something like the below should handle that, no?

---
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -628,14 +628,15 @@ struct swevent_hlist {
 	struct rcu_head			rcu_head;
 };
 
-#define PERF_ATTACH_CONTEXT	0x01
-#define PERF_ATTACH_GROUP	0x02
-#define PERF_ATTACH_TASK	0x04
-#define PERF_ATTACH_TASK_DATA	0x08
-#define PERF_ATTACH_ITRACE	0x10
-#define PERF_ATTACH_SCHED_CB	0x20
-#define PERF_ATTACH_CHILD	0x40
-#define PERF_ATTACH_EXCLUSIVE	0x80
+#define PERF_ATTACH_CONTEXT	0x0001
+#define PERF_ATTACH_GROUP	0x0002
+#define PERF_ATTACH_TASK	0x0004
+#define PERF_ATTACH_TASK_DATA	0x0008
+#define PERF_ATTACH_ITRACE	0x0010
+#define PERF_ATTACH_SCHED_CB	0x0020
+#define PERF_ATTACH_CHILD	0x0040
+#define PERF_ATTACH_EXCLUSIVE	0x0080
+#define PERF_ATTACH_CALLCHAIN	0x0100
 
 struct bpf_prog;
 struct perf_cgroup;
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5166,10 +5166,8 @@ static void perf_addr_filters_splice(str
 /* vs perf_event_alloc() error */
 static void __free_event(struct perf_event *event)
 {
-	if (!event->parent) {
-		if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
-			put_callchain_buffers();
-	}
+	if (event->attach_state & PERF_ATTACH_CALLCHAIN)
+		put_callchain_buffers();
 
 	kfree(event->addr_filter_ranges);
 
@@ -12065,6 +12063,7 @@ perf_event_alloc(struct perf_event_attr
 			err = get_callchain_buffers(attr->sample_max_stack);
 			if (err)
 				goto err;
+			event->attach_state |= PERF_ATTACH_CALLCHAIN;
 		}
 	}
Re: [PATCH 01/13] perf: Simplify perf_event_alloc() error path
Posted by Namhyung Kim 2 years, 1 month ago
Hi Jiri and Peter,

On Fri, Nov 3, 2023 at 5:38 AM Jiri Olsa <olsajiri@gmail.com> wrote:
>
> On Thu, Nov 02, 2023 at 04:09:20PM +0100, Peter Zijlstra wrote:
>
> SNIP
>
> > @@ -11936,24 +11957,24 @@ perf_event_alloc(struct perf_event_attr
> >        */
> >       if (pmu->task_ctx_nr == perf_invalid_context && (task || cgroup_fd != -1)) {
> >               err = -EINVAL;
> > -             goto err_pmu;
> > +             goto err;
> >       }
> >
> >       if (event->attr.aux_output &&
> >           !(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT)) {
> >               err = -EOPNOTSUPP;
> > -             goto err_pmu;
> > +             goto err;
> >       }
> >
> >       if (cgroup_fd != -1) {
> >               err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
> >               if (err)
> > -                     goto err_pmu;
> > +                     goto err;
> >       }
> >
> >       err = exclusive_event_init(event);
> >       if (err)
> > -             goto err_pmu;
> > +             goto err;
> >
> >       if (has_addr_filter(event)) {
> >               event->addr_filter_ranges = kcalloc(pmu->nr_addr_filters,
> > @@ -11961,7 +11982,7 @@ perf_event_alloc(struct perf_event_attr
> >                                                   GFP_KERNEL);
> >               if (!event->addr_filter_ranges) {
> >                       err = -ENOMEM;
> > -                     goto err_per_task;
> > +                     goto err;
> >               }
> >
> >               /*
> > @@ -11986,41 +12007,21 @@ perf_event_alloc(struct perf_event_attr
> >               if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
> >                       err = get_callchain_buffers(attr->sample_max_stack);
> >                       if (err)
> > -                             goto err_addr_filters;
> > +                             goto err;
> >               }
> >       }
> >
> >       err = security_perf_event_alloc(event);
> >       if (err)
> > -             goto err_callchain_buffer;
> > +             goto err;
> >
> >       /* symmetric to unaccount_event() in _free_event() */
> >       account_event(event);
> >
> >       return event;
> >
> > -err_callchain_buffer:
> > -     if (!event->parent) {
> > -             if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
> > -                     put_callchain_buffers();
> > -     }
>
> hum, so this is now called all the time via __free_event, but it should
> be called only if we passed get_callchain_buffers call.. this could screw
> up nr_callchain_events number eventually no?

Looks like so.

>
> jirka
>
> > -err_addr_filters:
> > -     kfree(event->addr_filter_ranges);
> > -
> > -err_per_task:
> > -     exclusive_event_destroy(event);
> > -
> > -err_pmu:
> > -     if (is_cgroup_event(event))
> > -             perf_detach_cgroup(event);
> > -     if (event->destroy)
> > -             event->destroy(event);
> > -     module_put(pmu->module);

I'm afraid of this part.  If it failed at perf_init_event(), it might
call event->destroy() already.  I saw you cleared event->pmu
in the failure path.  Maybe we need the same thing for the
event->destroy.

Thanks,
Namhyung


> > -err_ns:
> > -     if (event->hw.target)
> > -             put_task_struct(event->hw.target);
> > -     call_rcu(&event->rcu_head, free_event_rcu);
> > -
> > +err:
> > +     __free_event(event);
> >       return ERR_PTR(err);
> >  }
> >
> >
> >
Re: [PATCH 01/13] perf: Simplify perf_event_alloc() error path
Posted by Peter Zijlstra 2 years, 1 month ago
On Fri, Nov 03, 2023 at 12:50:19PM -0700, Namhyung Kim wrote:

> > > -err_pmu:
> > > -     if (is_cgroup_event(event))
> > > -             perf_detach_cgroup(event);
> > > -     if (event->destroy)
> > > -             event->destroy(event);
> > > -     module_put(pmu->module);
> 
> I'm afraid of this part.  If it failed at perf_init_event(), it might
> call event->destroy() already.  I saw you cleared event->pmu
> in the failure path.  Maybe we need the same thing for the
> event->destroy.

In that case event->destroy will not yet have been set, no?

And once it is set, we should call it.
Re: [PATCH 01/13] perf: Simplify perf_event_alloc() error path
Posted by Namhyung Kim 2 years, 1 month ago
On Wed, Nov 15, 2023 at 1:58 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Fri, Nov 03, 2023 at 12:50:19PM -0700, Namhyung Kim wrote:
>
> > > > -err_pmu:
> > > > -     if (is_cgroup_event(event))
> > > > -             perf_detach_cgroup(event);
> > > > -     if (event->destroy)
> > > > -             event->destroy(event);
> > > > -     module_put(pmu->module);
> >
> > I'm afraid of this part.  If it failed at perf_init_event(), it might
> > call event->destroy() already.  I saw you cleared event->pmu
> > in the failure path.  Maybe we need the same thing for the
> > event->destroy.
>
> In that case event->destroy will not yet have been set, no?

In perf_try_init_event(), it calls event->destroy() if set when it
has EXTENDED_REGS or NO_EXCLUDE capabilities and returns an error.

Thanks,
Namhyung