[patch 4/6] perf/core: Split out ringbuffer allocation

[patch 4/6] perf/core: Split out ringbuffer allocation
Posted by Thomas Gleixner 1 month, 4 weeks ago
The code logic in perf_mmap() is incomprehensible and has been a source of
subtle bugs in the past. It makes it impossible to convert the atomic_t
reference counts to refcount_t.

There is not really much shared between the ringbuffer and AUX buffer
allocation code since the mlock limit calculation and the accounting have
been split out into helper functions.

Move the AUX buffer allocation code out and integrate the call with a
temporary workaround that allows skipping the remaining ringbuffer related
code completely. That workaround will be removed once the ringbuffer
allocation is moved to its own function as well.

No functional change.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/events/core.c |  134 ++++++++++++++++++++++++++++-----------------------
 1 file changed, 75 insertions(+), 59 deletions(-)

--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6970,12 +6970,76 @@ static void perf_mmap_account(struct vm_
 	atomic64_add(extra, &vma->vm_mm->pinned_vm);
 }
 
+static int perf_mmap_aux(struct vm_area_struct *vma, struct perf_event *event,
+			 unsigned long nr_pages)
+{
+	long user_extra = nr_pages, extra = 0;
+	struct perf_buffer *rb = event->rb;
+	u64 aux_offset, aux_size;
+	int ret, rb_flags = 0;
+
+	/*
+	 * AUX area mapping: if rb->aux_nr_pages != 0, it's already
+	 * mapped, all subsequent mappings should have the same size
+	 * and offset. Must be above the normal perf buffer.
+	 */
+	aux_offset = READ_ONCE(rb->user_page->aux_offset);
+	aux_size = READ_ONCE(rb->user_page->aux_size);
+
+	if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
+		return -EINVAL;
+
+	if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
+		return -EINVAL;
+
+	/* Already mapped with a different offset */
+	if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
+		return -EINVAL;
+
+	if (aux_size != nr_pages * PAGE_SIZE)
+		return -EINVAL;
+
+	/* Already mapped with a different size */
+	if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
+		return -EINVAL;
+
+	if (!is_power_of_2(nr_pages))
+		return -EINVAL;
+
+	/* If this succeeds, subsequent failures have to undo it */
+	if (!atomic_inc_not_zero(&rb->mmap_count))
+		return -EINVAL;
+
+	/* If mapped, attach to it */
+	if (rb_has_aux(rb)) {
+		atomic_inc(&rb->aux_mmap_count);
+		return 0;
+	}
+
+	if (!perf_mmap_calc_limits(vma, &user_extra, &extra)) {
+		atomic_dec(&rb->mmap_count);
+		return -EPERM;
+	}
+
+	ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
+			   event->attr.aux_watermark, rb_flags);
+	if (ret) {
+		atomic_dec(&rb->mmap_count);
+		return ret;
+	}
+
+	atomic_set(&rb->aux_mmap_count, 1);
+	rb->aux_mmap_locked = extra;
+	perf_mmap_account(vma, user_extra, extra);
+	atomic_inc(&event->mmap_count);
+	return 0;
+}
+
 static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	struct perf_event *event = file->private_data;
 	unsigned long vma_size, nr_pages;
 	long user_extra = 0, extra = 0;
-	struct mutex *aux_mutex = NULL;
 	struct perf_buffer *rb = NULL;
 	int ret, flags = 0;
 	mapped_f mapped;
@@ -7055,51 +7119,15 @@ static int perf_mmap(struct file *file,
 		}
 
 	} else {
-		/*
-		 * AUX area mapping: if rb->aux_nr_pages != 0, it's already
-		 * mapped, all subsequent mappings should have the same size
-		 * and offset. Must be above the normal perf buffer.
-		 */
-		u64 aux_offset, aux_size;
-
-		rb = event->rb;
-		if (!rb)
-			goto aux_unlock;
-
-		aux_mutex = &rb->aux_mutex;
-		mutex_lock(aux_mutex);
-
-		aux_offset = READ_ONCE(rb->user_page->aux_offset);
-		aux_size = READ_ONCE(rb->user_page->aux_size);
-
-		if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
-			goto aux_unlock;
-
-		if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
-			goto aux_unlock;
-
-		/* already mapped with a different offset */
-		if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
-			goto aux_unlock;
-
-		if (aux_size != nr_pages * PAGE_SIZE)
-			goto aux_unlock;
-
-		/* already mapped with a different size */
-		if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
-			goto aux_unlock;
-
-		if (!is_power_of_2(nr_pages))
-			goto aux_unlock;
-
-		if (!atomic_inc_not_zero(&rb->mmap_count))
-			goto aux_unlock;
-
-		if (rb_has_aux(rb)) {
-			atomic_inc(&rb->aux_mmap_count);
-			ret = 0;
-			goto unlock;
+		if (event->rb) {
+			ret = -EINVAL;
+		} else {
+			scoped_guard(mutex, &event->rb->aux_mutex)
+				ret = perf_mmap_aux(vma, event, nr_pages);
 		}
+		// Temporary workaround to split out AUX handling first
+		mutex_unlock(&event->mmap_mutex);
+		goto out;
 	}
 
 	if (!perf_mmap_calc_limits(vma, &user_extra, &extra)) {
@@ -7132,28 +7160,16 @@ static int perf_mmap(struct file *file,
 		perf_event_init_userpage(event);
 		perf_event_update_userpage(event);
 		ret = 0;
-	} else {
-		ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
-				   event->attr.aux_watermark, flags);
-		if (!ret) {
-			atomic_set(&rb->aux_mmap_count, 1);
-			rb->aux_mmap_locked = extra;
-		}
 	}
-
 unlock:
 	if (!ret) {
 		perf_mmap_account(vma, user_extra, extra);
 		atomic_inc(&event->mmap_count);
-	} else if (rb) {
-		/* AUX allocation failed */
-		atomic_dec(&rb->mmap_count);
 	}
-aux_unlock:
-	if (aux_mutex)
-		mutex_unlock(aux_mutex);
 	mutex_unlock(&event->mmap_mutex);
 
+// Temporary until RB allocation is split out.
+out:
 	if (ret)
 		return ret;
Re: [patch 4/6] perf/core: Split out ringbuffer allocation
Posted by Lorenzo Stoakes 1 month, 4 weeks ago
On Wed, Aug 06, 2025 at 10:12:58PM +0200, Thomas Gleixner wrote:
> The code logic in perf_mmap() is incomprehensible and has been a source of
> subtle bugs in the past. It makes it impossible to convert the atomic_t
> reference counts to refcount_t.
>
> There is not really much shared between the ringbuffer and AUX buffer
> allocation code since the mlock limit calculation and the accounting have
> been split out into helper functions.
>
> Move the AUX buffer allocation code out and integrate the call with a
> temporary workaround that allows skipping the remaining ringbuffer related
> code completely. That workaround will be removed once the ringbuffer
> allocation is moved to its own function as well.
>
> No functional change.
>
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

It's easier if you separate out the move and the refactor, but I am going
through it here just to check the logic:

- Separate out checks into perf_mmap_aux()
- Add some comments
- We use scoped_guard() for the &event->rb->aux_mutex lock
- We actually return -EINVAL instead of stupidly relying on the default ret
  (such a footgun, I've hit issues with that before)
- Instead of proceeding with the rest of the code as before, we temporarily:
   - drop the event->mmap_mutex mutex

And then we explore the other differences below.

There seem to be two bugs, I've noted them inline below. The most serious is an
inverted event->rb check; the logic below assumes that's fixed.

The second is that you don't seem to be doing:

WARN_ON(!rb && event->rb);
if (vma->vm_flags & VM_WRITE)
	flags |= RING_BUFFER_WRITABLE;

in the aux code any more. Maybe the first is irrelevant, but the second surely is?

I noted them inline below anyway.

OK, moving on: perf_mmap_aux() therefore does the stuff in perf_mmap() that would
previously be executed, which would be:

~~~~~~~~~~ 1. ~~~~~~~~~~

-- BEFORE --

	if (!perf_mmap_calc_limits(vma, &user_extra, &extra)) {
		ret = -EPERM;
		goto unlock;
	}
...
unlock:
	if (!ret) {
		...
	} else if (rb) {
		/* AUX allocation failed */
		atomic_dec(&rb->mmap_count);
	}

aux_unlock:
	if (aux_mutex)
		mutex_unlock(aux_mutex);
	mutex_unlock(&event->mmap_mutex);

	if (ret)
		return ret;

-- AFTER --

(We already checked that event->rb is non-NULL)


	if (!perf_mmap_calc_limits(vma, &user_extra, &extra)) {
		atomic_dec(&rb->mmap_count);
		return -EPERM;
	}
	... < scoped_guard takes care of aux_mutex >

		// Temporary workaround to split out AUX handling first
		mutex_unlock(&event->mmap_mutex);
		goto out;
 	}

out:
 	if (ret)
 		return ret;

(seems equivalent)

~~~~~~~~~~ 2. ~~~~~~~~~~

-- BEFORE --

	WARN_ON(!rb && event->rb);

	if (vma->vm_flags & VM_WRITE)
		flags |= RING_BUFFER_WRITABLE;

	...

	} else {
		ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
				   event->attr.aux_watermark, flags);
		if (!ret) {
			atomic_set(&rb->aux_mmap_count, 1);
			rb->aux_mmap_locked = extra;
		}
	}

unlock:
	if (!ret) {
		perf_mmap_account(vma, user_extra, extra);
		atomic_inc(&event->mmap_count);
	} else if (rb) {
		/* AUX allocation failed */
		atomic_dec(&rb->mmap_count);
	}

aux_unlock:
	if (aux_mutex)
		mutex_unlock(aux_mutex);
	mutex_unlock(&event->mmap_mutex);

	if (ret)
		return ret;

	... vm_flags_set() blah blah ...

-- AFTER --

	ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
			   event->attr.aux_watermark, rb_flags);
	if (ret) {
		atomic_dec(&rb->mmap_count);
		return ret;
	}

	atomic_set(&rb->aux_mmap_count, 1);
	rb->aux_mmap_locked = extra;
	perf_mmap_account(vma, user_extra, extra);
	atomic_inc(&event->mmap_count);
	return 0;

	... < scoped_guard takes care of aux_mutex >

		// Temporary workaround to split out AUX handling first
		mutex_unlock(&event->mmap_mutex);
		goto out;
 	}

out:
 	if (ret)
 		return ret;

	... vm_flags_set() blah blah ...


-- rb_alloc_aux() failure path --

BEFORE

WARN_ON(!rb && event->rb);
if (vma->vm_flags & VM_WRITE)
	flags |= RING_BUFFER_WRITABLE;
atomic_dec(&rb->mmap_count);
if (aux_mutex)
	mutex_unlock(aux_mutex);
mutex_unlock(&event->mmap_mutex);
if (ret)
	return ret;

AFTER

atomic_dec(&rb->mmap_count);
.. < scoped_guard takes care of aux_mutex >
mutex_unlock(&event->mmap_mutex);
if (ret)
	return ret;

--  rb_alloc_aux() success path --

BEFORE

WARN_ON(!rb && event->rb);
if (vma->vm_flags & VM_WRITE)
	flags |= RING_BUFFER_WRITABLE;
atomic_set(&rb->aux_mmap_count, 1);
rb->aux_mmap_locked = extra;
perf_mmap_account(vma, user_extra, extra);
atomic_inc(&event->mmap_count);
if (aux_mutex)
	mutex_unlock(aux_mutex);
mutex_unlock(&event->mmap_mutex);
...vm_flags_set() blah blah  ...

AFTER

atomic_set(&rb->aux_mmap_count, 1);
rb->aux_mmap_locked = extra;
.. < scoped_guard takes care of aux_mutex >

perf_mmap_account(vma, user_extra, extra);
atomic_inc(&event->mmap_count);
mutex_unlock(&event->mmap_mutex);
...vm_flags_set() blah blah  ...


DIFFERENCES:

If we get to the rb_alloc_aux() bit, we're missing the:

WARN_ON(!rb && event->rb);
if (vma->vm_flags & VM_WRITE)
	flags |= RING_BUFFER_WRITABLE;

bit for the aux case.

Otherwise, it seems to be equivalent.
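For reference, restoring the dropped flag handling would presumably just mean
something like the below in perf_mmap_aux(), ahead of the rb_alloc_aux() call,
reusing the rb_flags local it already passes down - a sketch only, not tested:

	/* Restore the writable-mapping flag that got dropped in the move */
	if (vma->vm_flags & VM_WRITE)
		rb_flags |= RING_BUFFER_WRITABLE;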

> ---
>  kernel/events/core.c |  134 ++++++++++++++++++++++++++++-----------------------
>  1 file changed, 75 insertions(+), 59 deletions(-)
>
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -6970,12 +6970,76 @@ static void perf_mmap_account(struct vm_
>  	atomic64_add(extra, &vma->vm_mm->pinned_vm);
>  }
>
> +static int perf_mmap_aux(struct vm_area_struct *vma, struct perf_event *event,
> +			 unsigned long nr_pages)
> +{
> +	long user_extra = nr_pages, extra = 0;
> +	struct perf_buffer *rb = event->rb;
> +	u64 aux_offset, aux_size;
> +	int ret, rb_flags = 0;
> +
> +	/*
> +	 * AUX area mapping: if rb->aux_nr_pages != 0, it's already
> +	 * mapped, all subsequent mappings should have the same size
> +	 * and offset. Must be above the normal perf buffer.
> +	 */
> +	aux_offset = READ_ONCE(rb->user_page->aux_offset);
> +	aux_size = READ_ONCE(rb->user_page->aux_size);
> +
> +	if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
> +		return -EINVAL;
> +
> +	if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
> +		return -EINVAL;
> +
> +	/* Already mapped with a different offset */
> +	if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
> +		return -EINVAL;
> +
> +	if (aux_size != nr_pages * PAGE_SIZE)
> +		return -EINVAL;
> +
> +	/* Already mapped with a different size */
> +	if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
> +		return -EINVAL;
> +
> +	if (!is_power_of_2(nr_pages))
> +		return -EINVAL;
> +
> +	/* If this succeeds, subsequent failures have to undo it */
> +	if (!atomic_inc_not_zero(&rb->mmap_count))
> +		return -EINVAL;
> +
> +	/* If mapped, attach to it */
> +	if (rb_has_aux(rb)) {
> +		atomic_inc(&rb->aux_mmap_count);
> +		return 0;
> +	}
> +
> +	if (!perf_mmap_calc_limits(vma, &user_extra, &extra)) {
> +		atomic_dec(&rb->mmap_count);
> +		return -EPERM;
> +	}
> +
> +	ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
> +			   event->attr.aux_watermark, rb_flags);
> +	if (ret) {
> +		atomic_dec(&rb->mmap_count);
> +		return ret;
> +	}
> +
> +	atomic_set(&rb->aux_mmap_count, 1);
> +	rb->aux_mmap_locked = extra;
> +	perf_mmap_account(vma, user_extra, extra);
> +	atomic_inc(&event->mmap_count);
> +	return 0;
> +}
> +
>  static int perf_mmap(struct file *file, struct vm_area_struct *vma)
>  {
>  	struct perf_event *event = file->private_data;
>  	unsigned long vma_size, nr_pages;
>  	long user_extra = 0, extra = 0;
> -	struct mutex *aux_mutex = NULL;
>  	struct perf_buffer *rb = NULL;
>  	int ret, flags = 0;
>  	mapped_f mapped;
> @@ -7055,51 +7119,15 @@ static int perf_mmap(struct file *file,
>  		}
>
>  	} else {
> -		/*
> -		 * AUX area mapping: if rb->aux_nr_pages != 0, it's already
> -		 * mapped, all subsequent mappings should have the same size
> -		 * and offset. Must be above the normal perf buffer.
> -		 */
> -		u64 aux_offset, aux_size;
> -
> -		rb = event->rb;
> -		if (!rb)
> -			goto aux_unlock;
> -
> -		aux_mutex = &rb->aux_mutex;
> -		mutex_lock(aux_mutex);
> -
> -		aux_offset = READ_ONCE(rb->user_page->aux_offset);
> -		aux_size = READ_ONCE(rb->user_page->aux_size);
> -
> -		if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
> -			goto aux_unlock;
> -
> -		if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
> -			goto aux_unlock;
> -
> -		/* already mapped with a different offset */
> -		if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
> -			goto aux_unlock;
> -
> -		if (aux_size != nr_pages * PAGE_SIZE)
> -			goto aux_unlock;
> -
> -		/* already mapped with a different size */
> -		if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
> -			goto aux_unlock;
> -
> -		if (!is_power_of_2(nr_pages))
> -			goto aux_unlock;
> -
> -		if (!atomic_inc_not_zero(&rb->mmap_count))
> -			goto aux_unlock;
> -
> -		if (rb_has_aux(rb)) {
> -			atomic_inc(&rb->aux_mmap_count);
> -			ret = 0;
> -			goto unlock;
> +		if (event->rb) {
> +			ret = -EINVAL;

Shouldn't this be if (!event->rb) ?

> +		} else {

Because here you're dereffing event->rb in a branch where !event->rb?

> +			scoped_guard(mutex, &event->rb->aux_mutex)
> +				ret = perf_mmap_aux(vma, event, nr_pages);
>  		}
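Presumably, with the check fixed, this branch wants to end up as something
like (a sketch only):

		/* Only attempt an AUX mapping if a ringbuffer already exists */
		if (!event->rb) {
			ret = -EINVAL;
		} else {
			scoped_guard(mutex, &event->rb->aux_mutex)
				ret = perf_mmap_aux(vma, event, nr_pages);
		}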
> +		// Temporary workaround to split out AUX handling first
> +		mutex_unlock(&event->mmap_mutex);
> +		goto out;

Noted above, but you're now skipping the:

	WARN_ON(!rb && event->rb);

	if (vma->vm_flags & VM_WRITE)
		flags |= RING_BUFFER_WRITABLE;

bit in the aux case, no? Unless I'm missing something?

>  	}
>
>  	if (!perf_mmap_calc_limits(vma, &user_extra, &extra)) {
> @@ -7132,28 +7160,16 @@ static int perf_mmap(struct file *file,
>  		perf_event_init_userpage(event);
>  		perf_event_update_userpage(event);
>  		ret = 0;
> -	} else {
> -		ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
> -				   event->attr.aux_watermark, flags);
> -		if (!ret) {
> -			atomic_set(&rb->aux_mmap_count, 1);
> -			rb->aux_mmap_locked = extra;
> -		}
>  	}
> -
>  unlock:
>  	if (!ret) {
>  		perf_mmap_account(vma, user_extra, extra);
>  		atomic_inc(&event->mmap_count);
> -	} else if (rb) {
> -		/* AUX allocation failed */
> -		atomic_dec(&rb->mmap_count);
>  	}
> -aux_unlock:
> -	if (aux_mutex)
> -		mutex_unlock(aux_mutex);
>  	mutex_unlock(&event->mmap_mutex);
>
> +// Temporary until RB allocation is split out.
> +out:
>  	if (ret)
>  		return ret;
>
>

Cheers, Lorenzo
Re: [patch 4/6] perf/core: Split out ringbuffer allocation
Posted by Thomas Gleixner 1 month, 3 weeks ago
On Thu, Aug 07 2025 at 16:38, Lorenzo Stoakes wrote:
> On Wed, Aug 06, 2025 at 10:12:58PM +0200, Thomas Gleixner wrote:
> The second is that you don't seem to be doing:
>
> WARN_ON(!rb && event->rb);
> if (vma->vm_flags & VM_WRITE)
> 	flags |= RING_BUFFER_WRITABLE;
>
> in the aux code any more. Maybe the first is irrelevant, but the second
> surely is?

Yeah. The first one is kinda silly. The second one I dropped unintentionally.

> DIFFERENCES:
>
> If we get to the rb_alloc_aux() bit, we're missing the:
>
> WARN_ON(!rb && event->rb);
> if (vma->vm_flags & VM_WRITE)
> 	flags |= RING_BUFFER_WRITABLE;
>
> bit for the aux case.
>
> Otherwise, it seems to be equivalent.

Thanks for taking the time to go through this.

>> -		if (rb_has_aux(rb)) {
>> -			atomic_inc(&rb->aux_mmap_count);
>> -			ret = 0;
>> -			goto unlock;
>> +		if (event->rb) {
>> +			ret = -EINVAL;
>
> Shouldn't this be if (!event->rb) ?
>
>> +		} else {
>
> Because here you're dereffing event->rb in a branch where !event->rb?

Yes. I obviously failed to test this particular patch on its own and that's
fixed up in the next one, which moves the RB allocation out, so it did not
blow up in my face when I tested the whole pile.

Thanks for spotting!

       tglx