From: Haocheng Yu <yuhaocheng035@gmail.com>
Syzkaller reported a refcount_t: addition on 0; use-after-free warning
in perf_mmap.
The issue is caused by a race condition between a failing mmap() setup
and a concurrent mmap() on a dependent event (e.g., using output
redirection).
In perf_mmap(), the ring_buffer (rb) is allocated and assigned to
event->rb with the mmap_mutex held. The mutex is then released to
perform map_range().
If map_range() fails, perf_mmap_close() is called to clean up.
However, since the mutex was dropped, another thread attaching to
this event (via inherited events or output redirection) can acquire
the mutex, observe the valid event->rb pointer, and attempt to
increment its reference count. If the cleanup path has already
dropped the reference count to zero, this results in a
use-after-free or refcount saturation warning.
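To make the interleaving concrete, below is a minimal userspace sketch
of the same bug pattern (an illustration only, not kernel code; struct
buf, shared and attacher are invented stand-ins for the ring buffer,
event->rb and the concurrent mmap() caller):

/*
 * Publish a refcounted buffer under a lock, drop the lock for a
 * fallible step, then drop the last reference on failure while a
 * second thread can still find the published pointer under the lock.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct buf { atomic_int refs; };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct buf *shared;	/* plays the role of event->rb */

static void *attacher(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock);
	if (shared) {
		/* Like a concurrent mmap() on a dependent event: the
		 * pointer looks valid, but the setup thread may already
		 * have dropped the last reference. */
		if (atomic_fetch_add(&shared->refs, 1) == 0)
			fprintf(stderr, "refcount addition on 0\n");
	}
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	struct buf *b = calloc(1, sizeof(*b));
	pthread_t t;

	atomic_store(&b->refs, 1);

	pthread_mutex_lock(&lock);
	shared = b;			/* publish, like ring_buffer_attach() */
	pthread_mutex_unlock(&lock);	/* lock dropped for the fallible step */

	pthread_create(&t, NULL, attacher, NULL);

	/* The fallible step "fails": drop the reference without the lock.
	 * In the kernel the buffer is freed once the count hits zero, so
	 * the attacher races with a dying object; here we only detect it. */
	atomic_fetch_sub(&b->refs, 1);

	pthread_join(t, NULL);
	free(b);
	return 0;
}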
Fix this by extending the scope of mmap_mutex to cover the
map_range() call. This ensures that ring buffer initialization
and mapping (or cleanup on failure) happen atomically, effectively
preventing other threads from accessing a half-initialized or
dying ring buffer.
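In outline, the intended locking after this change looks as follows
(a simplified sketch of the perf_mmap() flow, not the literal kernel
code; surrounding setup and error handling are elided):

	mutex_lock(&event->mmap_mutex);
	/* ... allocate rb and publish it via ring_buffer_attach() ... */

	ret = map_range(event->rb, vma);	/* fallible step, now under the lock */
	if (ret)
		perf_mmap_close_locked(vma, event);	/* must not retake
							 * mmap_mutex, see the
							 * v2 note below */
	mutex_unlock(&event->mmap_mutex);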
v2:
Because extending the guarded region would cause event->mmap_mutex
to be acquired again inside perf_mmap_close(), leading to a
self-deadlock, the original perf_mmap_close() logic was retained
and a perf_mmap_close_locked() variant was added for callers that
already hold the mutex.
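The self-deadlock in question is the usual non-recursive mutex
pattern, shown here as a minimal userspace sketch (kernel mutexes are
likewise non-recursive):

#include <pthread.h>

static pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;

static void cleanup(void)
{
	pthread_mutex_lock(&m);		/* second acquisition by the same
					 * thread deadlocks: the default
					 * mutex is not recursive */
	pthread_mutex_unlock(&m);
}

int main(void)
{
	pthread_mutex_lock(&m);		/* caller already holds the lock... */
	cleanup();			/* ...so this call never returns */
	pthread_mutex_unlock(&m);
	return 0;
}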
v3:
The fix is made smaller by passing a "holds_event_mmap_lock"
parameter to __perf_mmap_close().
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202602020208.m7KIjdzW-lkp@intel.com/
Suggested-by: Ian Rogers <irogers@google.com>
Signed-off-by: Haocheng Yu <yuhaocheng035@gmail.com>
---
kernel/events/core.c | 78 +++++++++++++++++++++++++++-----------------
1 file changed, 48 insertions(+), 30 deletions(-)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 2c35acc2722b..a3228c587de1 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6730,9 +6730,10 @@ static void perf_pmu_output_stop(struct perf_event *event);
* the buffer here, where we still have a VM context. This means we need
* to detach all events redirecting to us.
*/
-static void perf_mmap_close(struct vm_area_struct *vma)
+static void __perf_mmap_close(struct vm_area_struct *vma, struct perf_event *event,
+ bool holds_event_mmap_lock)
{
- struct perf_event *event = vma->vm_file->private_data;
+ struct perf_event *iter_event;
mapped_f unmapped = get_mapped(event, event_unmapped);
struct perf_buffer *rb = ring_buffer_get(event);
struct user_struct *mmap_user = rb->mmap_user;
@@ -6772,11 +6773,14 @@ static void perf_mmap_close(struct vm_area_struct *vma)
if (refcount_dec_and_test(&rb->mmap_count))
detach_rest = true;
- if (!refcount_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
+ if ((!holds_event_mmap_lock &&
+ !refcount_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) ||
+ (holds_event_mmap_lock && !refcount_dec_and_test(&event->mmap_count)))
goto out_put;
ring_buffer_attach(event, NULL);
- mutex_unlock(&event->mmap_mutex);
+ if (!holds_event_mmap_lock)
+ mutex_unlock(&event->mmap_mutex);
/* If there's still other mmap()s of this buffer, we're done. */
if (!detach_rest)
@@ -6789,8 +6793,8 @@ static void perf_mmap_close(struct vm_area_struct *vma)
*/
again:
rcu_read_lock();
- list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
- if (!atomic_long_inc_not_zero(&event->refcount)) {
+ list_for_each_entry_rcu(iter_event, &rb->event_list, rb_entry) {
+ if (!atomic_long_inc_not_zero(&iter_event->refcount)) {
/*
* This event is en-route to free_event() which will
* detach it and remove it from the list.
@@ -6799,7 +6803,8 @@ static void perf_mmap_close(struct vm_area_struct *vma)
}
rcu_read_unlock();
- mutex_lock(&event->mmap_mutex);
+ if (!holds_event_mmap_lock)
+ mutex_lock(&iter_event->mmap_mutex);
/*
* Check we didn't race with perf_event_set_output() which can
* swizzle the rb from under us while we were waiting to
@@ -6810,11 +6815,12 @@ static void perf_mmap_close(struct vm_area_struct *vma)
* still restart the iteration to make sure we're not now
* iterating the wrong list.
*/
- if (event->rb == rb)
- ring_buffer_attach(event, NULL);
+ if (iter_event->rb == rb)
+ ring_buffer_attach(iter_event, NULL);
- mutex_unlock(&event->mmap_mutex);
- put_event(event);
+ if (!holds_event_mmap_lock)
+ mutex_unlock(&iter_event->mmap_mutex);
+ put_event(iter_event);
/*
* Restart the iteration; either we're on the wrong list or
@@ -6842,6 +6848,18 @@ static void perf_mmap_close(struct vm_area_struct *vma)
ring_buffer_put(rb); /* could be last */
}
+static void perf_mmap_close(struct vm_area_struct *vma)
+{
+ struct perf_event *event = vma->vm_file->private_data;
+
+ __perf_mmap_close(vma, event, false);
+}
+
+static void perf_mmap_close_locked(struct vm_area_struct *vma, struct perf_event *event)
+{
+ __perf_mmap_close(vma, event, true);
+}
+
static vm_fault_t perf_mmap_pfn_mkwrite(struct vm_fault *vmf)
{
/* The first page is the user control page, others are read-only. */
@@ -7167,28 +7185,28 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
ret = perf_mmap_aux(vma, event, nr_pages);
if (ret)
return ret;
- }
- /*
- * Since pinned accounting is per vm we cannot allow fork() to copy our
- * vma.
- */
- vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP);
- vma->vm_ops = &perf_mmap_vmops;
+ /*
+ * Since pinned accounting is per vm we cannot allow fork() to copy our
+ * vma.
+ */
+ vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP);
+ vma->vm_ops = &perf_mmap_vmops;
- mapped = get_mapped(event, event_mapped);
- if (mapped)
- mapped(event, vma->vm_mm);
+ mapped = get_mapped(event, event_mapped);
+ if (mapped)
+ mapped(event, vma->vm_mm);
- /*
- * Try to map it into the page table. On fail, invoke
- * perf_mmap_close() to undo the above, as the callsite expects
- * full cleanup in this case and therefore does not invoke
- * vmops::close().
- */
- ret = map_range(event->rb, vma);
- if (ret)
- perf_mmap_close(vma);
+ /*
+ * Try to map it into the page table. On fail, invoke
+ * perf_mmap_close() to undo the above, as the callsite expects
+ * full cleanup in this case and therefore does not invoke
+ * vmops::close().
+ */
+ ret = map_range(event->rb, vma);
+ if (ret)
+ perf_mmap_close_locked(vma, event);
+ }
return ret;
}
base-commit: 7d0a66e4bb9081d75c82ec4957c50034cb0ea449
--
2.51.0
Argh, why is this hidden in this old thread :/
On Wed, Mar 25, 2026 at 06:20:53PM +0800, yuhaocheng035@gmail.com wrote:
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 2c35acc2722b..a3228c587de1 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -6730,9 +6730,10 @@ static void perf_pmu_output_stop(struct perf_event *event);
> * the buffer here, where we still have a VM context. This means we need
> * to detach all events redirecting to us.
> */
> -static void perf_mmap_close(struct vm_area_struct *vma)
> +static void __perf_mmap_close(struct vm_area_struct *vma, struct perf_event *event,
> + bool holds_event_mmap_lock)
> {
> - struct perf_event *event = vma->vm_file->private_data;
> + struct perf_event *iter_event;
> mapped_f unmapped = get_mapped(event, event_unmapped);
> struct perf_buffer *rb = ring_buffer_get(event);
> struct user_struct *mmap_user = rb->mmap_user;
> @@ -6772,11 +6773,14 @@ static void perf_mmap_close(struct vm_area_struct *vma)
> if (refcount_dec_and_test(&rb->mmap_count))
> detach_rest = true;
>
> - if (!refcount_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
> + if ((!holds_event_mmap_lock &&
> + !refcount_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) ||
> + (holds_event_mmap_lock && !refcount_dec_and_test(&event->mmap_count)))
> goto out_put;
*groan*, this is horrible.
Let me have a poke to see if there isn't a saner variant around.
On Wed, 25 Mar 2026 at 23:17, Peter Zijlstra <peterz@infradead.org> wrote:
> Argh, why is this hidden in this old thread :/
>
> On Wed, Mar 25, 2026 at 06:20:53PM +0800, yuhaocheng035@gmail.com wrote:
>
> > diff --git a/kernel/events/core.c b/kernel/events/core.c
> > index 2c35acc2722b..a3228c587de1 100644
> > --- a/kernel/events/core.c
> > +++ b/kernel/events/core.c
> > @@ -6730,9 +6730,10 @@ static void perf_pmu_output_stop(struct perf_event *event);
> > * the buffer here, where we still have a VM context. This means we need
> > * to detach all events redirecting to us.
> > */
> > -static void perf_mmap_close(struct vm_area_struct *vma)
> > +static void __perf_mmap_close(struct vm_area_struct *vma, struct perf_event *event,
> > + bool holds_event_mmap_lock)
> > {
> > - struct perf_event *event = vma->vm_file->private_data;
> > + struct perf_event *iter_event;
> > mapped_f unmapped = get_mapped(event, event_unmapped);
> > struct perf_buffer *rb = ring_buffer_get(event);
> > struct user_struct *mmap_user = rb->mmap_user;
> > @@ -6772,11 +6773,14 @@ static void perf_mmap_close(struct vm_area_struct *vma)
> > if (refcount_dec_and_test(&rb->mmap_count))
> > detach_rest = true;
> >
> > - if (!refcount_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
> > + if ((!holds_event_mmap_lock &&
> > + !refcount_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) ||
> > + (holds_event_mmap_lock && !refcount_dec_and_test(&event->mmap_count)))
> > goto out_put;
>
> *groan*, this is horrible.
>
> Let me have a poke to see if there isn't a saner variant around.
I think it's ok to move perf_mmap_close() outside the mutex lock, like this:
https://lore.kernel.org/all/20260325153240.GK3739106@noisy.programming.kicks-ass.net/T/#m0f82e8ecdfdfce4acd5121bcb799e864cf05ebf9
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 1f5699b339ec..e5ce03ce926d 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7485,9 +7485,12 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
*/
ret = map_range(event->rb, vma);
if (ret)
- perf_mmap_close(vma);
+ goto out_close;
}
+ return 0;
+out_close:
+ perf_mmap_close(vma);
return ret;
}
What do you think?
--
Qing
On Thu, Mar 26, 2026 at 11:18:06AM +0800, Qing Wang wrote:
> On Wed, 25 Mar 2026 at 23:17, Peter Zijlstra <peterz@infradead.org> wrote:
> > Argh, why is this hidden in this old thread :/
> >
> > On Wed, Mar 25, 2026 at 06:20:53PM +0800, yuhaocheng035@gmail.com wrote:
> >
> > > diff --git a/kernel/events/core.c b/kernel/events/core.c
> > > index 2c35acc2722b..a3228c587de1 100644
> > > --- a/kernel/events/core.c
> > > +++ b/kernel/events/core.c
> > > @@ -6730,9 +6730,10 @@ static void perf_pmu_output_stop(struct perf_event *event);
> > > * the buffer here, where we still have a VM context. This means we need
> > > * to detach all events redirecting to us.
> > > */
> > > -static void perf_mmap_close(struct vm_area_struct *vma)
> > > +static void __perf_mmap_close(struct vm_area_struct *vma, struct perf_event *event,
> > > + bool holds_event_mmap_lock)
> > > {
> > > - struct perf_event *event = vma->vm_file->private_data;
> > > + struct perf_event *iter_event;
> > > mapped_f unmapped = get_mapped(event, event_unmapped);
> > > struct perf_buffer *rb = ring_buffer_get(event);
> > > struct user_struct *mmap_user = rb->mmap_user;
> > > @@ -6772,11 +6773,14 @@ static void perf_mmap_close(struct vm_area_struct *vma)
> > > if (refcount_dec_and_test(&rb->mmap_count))
> > > detach_rest = true;
> > >
> > > - if (!refcount_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
> > > + if ((!holds_event_mmap_lock &&
> > > + !refcount_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) ||
> > > + (holds_event_mmap_lock && !refcount_dec_and_test(&event->mmap_count)))
> > > goto out_put;
> >
> > *groan*, this is horrible.
> >
> > Let me have a poke to see if there isn't a saner variant around.
>
> I think it's ok to move perf_mmap_close() outside the mutex lock, like this:
>
> https://lore.kernel.org/all/20260325153240.GK3739106@noisy.programming.kicks-ass.net/T/#m0f82e8ecdfdfce4acd5121bcb799e864cf05ebf9
>
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 1f5699b339ec..e5ce03ce926d 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -7485,9 +7485,12 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
> */
> ret = map_range(event->rb, vma);
> if (ret)
> - perf_mmap_close(vma);
> + goto out_close;
> }
> + return 0;
>
> +out_close:
> + perf_mmap_close(vma);
> return ret;
> }
>
> What do you think?
Well, that will just re-introduce the original problem. As you were told
there.
What about something like this?
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 1f5699b339ec..0bb1d8b83bc9 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7010,6 +7010,7 @@ static void perf_mmap_open(struct vm_area_struct *vma)
}
static void perf_pmu_output_stop(struct perf_event *event);
+static void perf_mmap_unaccount(struct vm_area_struct *vma, struct perf_buffer *rb);
/*
* A buffer can be mmap()ed multiple times; either directly through the same
@@ -7025,8 +7026,6 @@ static void perf_mmap_close(struct vm_area_struct *vma)
mapped_f unmapped = get_mapped(event, event_unmapped);
struct perf_buffer *rb = ring_buffer_get(event);
struct user_struct *mmap_user = rb->mmap_user;
- int mmap_locked = rb->mmap_locked;
- unsigned long size = perf_data_size(rb);
bool detach_rest = false;
/* FIXIES vs perf_pmu_unregister() */
@@ -7121,11 +7120,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)
* Aside from that, this buffer is 'fully' detached and unmapped,
* undo the VM accounting.
*/
-
- atomic_long_sub((size >> PAGE_SHIFT) + 1 - mmap_locked,
- &mmap_user->locked_vm);
- atomic64_sub(mmap_locked, &vma->vm_mm->pinned_vm);
- free_uid(mmap_user);
+ perf_mmap_unaccount(vma, rb);
out_put:
ring_buffer_put(rb); /* could be last */
@@ -7265,6 +7260,15 @@ static void perf_mmap_account(struct vm_area_struct *vma, long user_extra, long
atomic64_add(extra, &vma->vm_mm->pinned_vm);
}
+static void perf_mmap_unaccount(struct vm_area_struct *vma, struct perf_buffer *rb)
+{
+ struct user_struct *user = rb->mmap_user;
+
+ atomic_long_sub((perf_data_size(rb) >> PAGE_SHIFT) + 1 - rb->mmap_locked,
+ &user->locked_vm);
+ atomic64_sub(rb->mmap_locked, &vma->vm_mm->pinned_vm);
+}
+
static int perf_mmap_rb(struct vm_area_struct *vma, struct perf_event *event,
unsigned long nr_pages)
{
@@ -7327,8 +7331,6 @@ static int perf_mmap_rb(struct vm_area_struct *vma, struct perf_event *event,
if (!rb)
return -ENOMEM;
- refcount_set(&rb->mmap_count, 1);
- rb->mmap_user = get_current_user();
rb->mmap_locked = extra;
ring_buffer_attach(event, rb);
@@ -7484,10 +7486,43 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
* vmops::close().
*/
ret = map_range(event->rb, vma);
- if (ret)
- perf_mmap_close(vma);
+ if (likely(!ret))
+ return 0;
+
+ /* Error path */
+
+ /*
+ * If this is the first mmap(), then event->mmap_count should
+ * be stable at 1. It is only modified by:
+ * perf_mmap_{open,close}() and perf_mmap().
+ *
+ * The former are not possible because this mmap() hasn't been
+ * successful yet, and the latter is serialized by
+ * event->mmap_mutex which we still hold (note that mmap_lock
+ * is not strictly sufficient here, because the event fd can
+ * be passed to another process through trivial means like
+ * fork(), leading to concurrent mmap() from different mm).
+ *
+ * Make sure to remove event->rb before releasing
+ * event->mmap_mutex, such that any concurrent mmap() will not
+ * attempt use this failed buffer.
+ */
+ if (refcount_read(&event->mmap_count) == 1) {
+ /*
+ * Minimal perf_mmap_close(); there can't be AUX or
+ * other events on account of this being the first.
+ */
+ mapped = get_mapped(event, event_unmapped);
+ if (mapped)
+ mapped(event, vma->vm_mm);
+ perf_mmap_unaccount(vma, event->rb);
+ ring_buffer_attach(event, NULL); /* drops last rb->refcount */
+ refcount_set(&event->mmap_count, 0);
+ return ret;
+ }
}
+ perf_mmap_close(vma);
return ret;
}
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index d9cc57083091..c03c4f2eea57 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -67,6 +67,7 @@ static inline void rb_free_rcu(struct rcu_head *rcu_head)
struct perf_buffer *rb;
rb = container_of(rcu_head, struct perf_buffer, rcu_head);
+ free_uid(rb->mmap_user);
rb_free(rb);
}
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 3e7de2661417..9fe92161715e 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -340,6 +340,8 @@ ring_buffer_init(struct perf_buffer *rb, long watermark, int flags)
rb->paused = 1;
mutex_init(&rb->aux_mutex);
+ rb->mmap_user = get_current_user();
+ refcount_set(&rb->mmap_count, 1);
}
void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags)
From: Haocheng Yu <yuhaocheng035@gmail.com>
Syzkaller reported a refcount_t: addition on 0; use-after-free warning
in perf_mmap.
The issue is caused by a race condition between a failing mmap() setup
and a concurrent mmap() on a dependent event (e.g., using output
redirection).
In perf_mmap(), the ring_buffer (rb) is allocated and assigned to
event->rb with the mmap_mutex held. The mutex is then released to
perform map_range().
If map_range() fails, perf_mmap_close() is called to clean up.
However, since the mutex was dropped, another thread attaching to
this event (via inherited events or output redirection) can acquire
the mutex, observe the valid event->rb pointer, and attempt to
increment its reference count. If the cleanup path has already
dropped the reference count to zero, this results in a
use-after-free or refcount saturation warning.
Fix this by extending the scope of mmap_mutex to cover the
map_range() call. This ensures that ring buffer initialization
and mapping (or cleanup on failure) happen atomically, effectively
preventing other threads from accessing a half-initialized or
dying ring buffer.
v2:
Because extending the guarded region would cause event->mmap_mutex
to be acquired again inside perf_mmap_close(), leading to a
self-deadlock, the original perf_mmap_close() logic was retained
and a perf_mmap_close_locked() variant was added for callers that
already hold the mutex.
v3:
The fix is made smaller by passing a "holds_event_mmap_lock"
parameter to __perf_mmap_close().
v4:
The problem is solved in a cleaner way, following Peter Zijlstra's
suggestion: a failing first mmap() is cleaned up inline while
mmap_mutex is still held, and the rb->mmap_user / rb->mmap_count
initialization moves into ring_buffer_init().
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202602020208.m7KIjdzW-lkp@intel.com/
Reviewed-by: Ian Rogers <irogers@google.com>
Reviewed-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Haocheng Yu <yuhaocheng035@gmail.com>
---
kernel/events/core.c | 59 +++++++++++++++++++++++++++++--------
kernel/events/internal.h | 1 +
kernel/events/ring_buffer.c | 2 ++
3 files changed, 49 insertions(+), 13 deletions(-)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 22a0f405585b..d3f978402b1e 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7010,7 +7010,7 @@ static void perf_mmap_open(struct vm_area_struct *vma)
}
static void perf_pmu_output_stop(struct perf_event *event);
-
+static void perf_mmap_unaccount(struct vm_area_struct *vma, struct perf_buffer *rb);
/*
* A buffer can be mmap()ed multiple times; either directly through the same
* event, or through other events by use of perf_event_set_output().
@@ -7025,8 +7025,6 @@ static void perf_mmap_close(struct vm_area_struct *vma)
mapped_f unmapped = get_mapped(event, event_unmapped);
struct perf_buffer *rb = ring_buffer_get(event);
struct user_struct *mmap_user = rb->mmap_user;
- int mmap_locked = rb->mmap_locked;
- unsigned long size = perf_data_size(rb);
bool detach_rest = false;
/* FIXIES vs perf_pmu_unregister() */
@@ -7121,11 +7119,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)
* Aside from that, this buffer is 'fully' detached and unmapped,
* undo the VM accounting.
*/
-
- atomic_long_sub((size >> PAGE_SHIFT) + 1 - mmap_locked,
- &mmap_user->locked_vm);
- atomic64_sub(mmap_locked, &vma->vm_mm->pinned_vm);
- free_uid(mmap_user);
+ perf_mmap_unaccount(vma, rb);
out_put:
ring_buffer_put(rb); /* could be last */
@@ -7265,6 +7259,15 @@ static void perf_mmap_account(struct vm_area_struct *vma, long user_extra, long
atomic64_add(extra, &vma->vm_mm->pinned_vm);
}
+static void perf_mmap_unaccount(struct vm_area_struct *vma, struct perf_buffer *rb)
+{
+ struct user_struct *user = rb->mmap_user;
+
+ atomic_long_sub((perf_data_size(rb) >> PAGE_SHIFT) + 1 - rb->mmap_locked,
+ &user->locked_vm);
+ atomic64_sub(rb->mmap_locked, &vma->vm_mm->pinned_vm);
+}
+
static int perf_mmap_rb(struct vm_area_struct *vma, struct perf_event *event,
unsigned long nr_pages)
{
@@ -7327,8 +7330,6 @@ static int perf_mmap_rb(struct vm_area_struct *vma, struct perf_event *event,
if (!rb)
return -ENOMEM;
- refcount_set(&rb->mmap_count, 1);
- rb->mmap_user = get_current_user();
rb->mmap_locked = extra;
ring_buffer_attach(event, rb);
@@ -7484,10 +7485,42 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
* vmops::close().
*/
ret = map_range(event->rb, vma);
- if (ret)
- perf_mmap_close(vma);
- }
+ if (likely(!ret))
+ return 0;
+
+ /* Error path */
+ /*
+ * If this is the first mmap(), then event->mmap_count should
+ * be stable at 1. It is only modified by:
+ * perf_mmap_{open,close}() and perf_mmap().
+ *
+ * The former are not possible because this mmap() hasn't been
+ * successful yet, and the latter is serialized by
+ * event->mmap_mutex which we still hold (note that mmap_lock
+ * is not strictly sufficient here, because the event fd can
+ * be passed to another process through trivial means like
+ * fork(), leading to concurrent mmap() from different mm).
+ *
+ * Make sure to remove event->rb before releasing
+ * event->mmap_mutex, such that any concurrent mmap() will not
+ * attempt use this failed buffer.
+ */
+ if (refcount_read(&event->mmap_count) == 1) {
+ /*
+ * Minimal perf_mmap_close(); there can't be AUX or
+ * other events on account of this being the first.
+ */
+ mapped = get_mapped(event, event_unmapped);
+ if (mapped)
+ mapped(event, vma->vm_mm);
+ perf_mmap_unaccount(vma, event->rb);
+ ring_buffer_attach(event, NULL); /* drops last rb->refcount */
+ refcount_set(&event->mmap_count, 0);
+ return ret;
+ }
+ }
+ perf_mmap_close(vma);
return ret;
}
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index d9cc57083091..c03c4f2eea57 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -67,6 +67,7 @@ static inline void rb_free_rcu(struct rcu_head *rcu_head)
struct perf_buffer *rb;
rb = container_of(rcu_head, struct perf_buffer, rcu_head);
+ free_uid(rb->mmap_user);
rb_free(rb);
}
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 3e7de2661417..9fe92161715e 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -340,6 +340,8 @@ ring_buffer_init(struct perf_buffer *rb, long watermark, int flags)
rb->paused = 1;
mutex_init(&rb->aux_mutex);
+ rb->mmap_user = get_current_user();
+ refcount_set(&rb->mmap_count, 1);
}
void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags)
base-commit: 77de62ad3de3967818c3dbe656b7336ebee461d2
--
2.51.0
On Fri, Mar 27, 2026 at 08:29:52PM +0800, yuhaocheng035@gmail.com wrote:
> From: Haocheng Yu <yuhaocheng035@gmail.com>
>
> Syzkaller reported a refcount_t: addition on 0; use-after-free warning
> in perf_mmap.
>
> The issue is caused by a race condition between a failing mmap() setup
> and a concurrent mmap() on a dependent event (e.g., using output
> redirection).
>
> In perf_mmap(), the ring_buffer (rb) is allocated and assigned to
> event->rb with the mmap_mutex held. The mutex is then released to
> perform map_range().
>
> If map_range() fails, perf_mmap_close() is called to clean up.
> However, since the mutex was dropped, another thread attaching to
> this event (via inherited events or output redirection) can acquire
> the mutex, observe the valid event->rb pointer, and attempt to
> increment its reference count. If the cleanup path has already
> dropped the reference count to zero, this results in a
> use-after-free or refcount saturation warning.
>
> Fix this by extending the scope of mmap_mutex to cover the
> map_range() call. This ensures that ring buffer initialization
> and mapping (or cleanup on failure) happen atomically, effectively
> preventing other threads from accessing a half-initialized or
> dying ring buffer.
>
> v2:
> Because extending the guarded region would cause event->mmap_mutex
> to be acquired again inside perf_mmap_close(), leading to a
> self-deadlock, the original perf_mmap_close() logic was retained
> and a perf_mmap_close_locked() variant was added for callers that
> already hold the mutex.
>
> v3:
> The fix is made smaller by passing a "holds_event_mmap_lock"
> parameter to __perf_mmap_close().
>
> v4:
> The problem is solved in a cleaner way, following Peter Zijlstra's
> suggestion: a failing first mmap() is cleaned up inline while
> mmap_mutex is still held, and the rb->mmap_user / rb->mmap_count
> initialization moves into ring_buffer_init().
>
> Reported-by: kernel test robot <lkp@intel.com>
> Closes: https://lore.kernel.org/oe-kbuild-all/202602020208.m7KIjdzW-lkp@intel.com/
> Reviewed-by: Ian Rogers <irogers@google.com>
> Reviewed-by: Peter Zijlstra <peterz@infradead.org>
> Signed-off-by: Haocheng Yu <yuhaocheng035@gmail.com>

You can't claim this as your patch. I was the one who wrote it -- yesterday.
Your solution looks much better.
I tried to incorporate it and submitted it as v4. If you are already
handling it, please ignore my patch.
Thanks,
Haocheng
> From: Haocheng Yu <yuhaocheng035@gmail.com>
>
> Syzkaller reported a refcount_t: addition on 0; use-after-free warning
> in perf_mmap.
>
> The issue is caused by a race condition between a failing mmap() setup
> and a concurrent mmap() on a dependent event (e.g., using output
> redirection).
>
> In perf_mmap(), the ring_buffer (rb) is allocated and assigned to
> event->rb with the mmap_mutex held. The mutex is then released to
> perform map_range().
>
> If map_range() fails, perf_mmap_close() is called to clean up.
> However, since the mutex was dropped, another thread attaching to
> this event (via inherited events or output redirection) can acquire
> the mutex, observe the valid event->rb pointer, and attempt to
> increment its reference count. If the cleanup path has already
> dropped the reference count to zero, this results in a
> use-after-free or refcount saturation warning.
>
> Fix this by extending the scope of mmap_mutex to cover the
> map_range() call. This ensures that ring buffer initialization
> and mapping (or cleanup on failure) happen atomically, effectively
> preventing other threads from accessing a half-initialized or
> dying ring buffer.
>
> v2:
> Because extending the guarded region would cause event->mmap_mutex
> to be acquired again inside perf_mmap_close(), leading to a
> self-deadlock, the original perf_mmap_close() logic was retained
> and a perf_mmap_close_locked() variant was added for callers that
> already hold the mutex.
>
> v3:
> The fix is made smaller by passing a "holds_event_mmap_lock"
> parameter to __perf_mmap_close().
>
> v4:
> The problem is solved in a cleaner way, following Peter Zijlstra's
> suggestion: a failing first mmap() is cleaned up inline while
> mmap_mutex is still held, and the rb->mmap_user / rb->mmap_count
> initialization moves into ring_buffer_init().
>
> Reported-by: kernel test robot <lkp@intel.com>
> Closes: https://lore.kernel.org/oe-kbuild-all/202602020208.m7KIjdzW-lkp@intel.com/
> Reviewed-by: Ian Rogers <irogers@google.com>
> Reviewed-by: Peter Zijlstra <peterz@infradead.org>
> Signed-off-by: Haocheng Yu <yuhaocheng035@gmail.com>
> ---
> kernel/events/core.c | 59 +++++++++++++++++++++++++++++--------
> kernel/events/internal.h | 1 +
> kernel/events/ring_buffer.c | 2 ++
> 3 files changed, 49 insertions(+), 13 deletions(-)
>
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 22a0f405585b..d3f978402b1e 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -7010,7 +7010,7 @@ static void perf_mmap_open(struct vm_area_struct *vma)
> }
>
> static void perf_pmu_output_stop(struct perf_event *event);
> -
> +static void perf_mmap_unaccount(struct vm_area_struct *vma, struct perf_buffer *rb);
> /*
> * A buffer can be mmap()ed multiple times; either directly through the same
> * event, or through other events by use of perf_event_set_output().
> @@ -7025,8 +7025,6 @@ static void perf_mmap_close(struct vm_area_struct *vma)
> mapped_f unmapped = get_mapped(event, event_unmapped);
> struct perf_buffer *rb = ring_buffer_get(event);
> struct user_struct *mmap_user = rb->mmap_user;
> - int mmap_locked = rb->mmap_locked;
> - unsigned long size = perf_data_size(rb);
> bool detach_rest = false;
>
> /* FIXIES vs perf_pmu_unregister() */
> @@ -7121,11 +7119,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)
> * Aside from that, this buffer is 'fully' detached and unmapped,
> * undo the VM accounting.
> */
> -
> - atomic_long_sub((size >> PAGE_SHIFT) + 1 - mmap_locked,
> - &mmap_user->locked_vm);
> - atomic64_sub(mmap_locked, &vma->vm_mm->pinned_vm);
> - free_uid(mmap_user);
> + perf_mmap_unaccount(vma, rb);
>
> out_put:
> ring_buffer_put(rb); /* could be last */
> @@ -7265,6 +7259,15 @@ static void perf_mmap_account(struct vm_area_struct *vma, long user_extra, long
> atomic64_add(extra, &vma->vm_mm->pinned_vm);
> }
>
> +static void perf_mmap_unaccount(struct vm_area_struct *vma, struct perf_buffer *rb)
> +{
> + struct user_struct *user = rb->mmap_user;
> +
> + atomic_long_sub((perf_data_size(rb) >> PAGE_SHIFT) + 1 - rb->mmap_locked,
> + &user->locked_vm);
> + atomic64_sub(rb->mmap_locked, &vma->vm_mm->pinned_vm);
> +}
> +
> static int perf_mmap_rb(struct vm_area_struct *vma, struct perf_event *event,
> unsigned long nr_pages)
> {
> @@ -7327,8 +7330,6 @@ static int perf_mmap_rb(struct vm_area_struct *vma, struct perf_event *event,
> if (!rb)
> return -ENOMEM;
>
> - refcount_set(&rb->mmap_count, 1);
> - rb->mmap_user = get_current_user();
> rb->mmap_locked = extra;
>
> ring_buffer_attach(event, rb);
> @@ -7484,10 +7485,42 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
> * vmops::close().
> */
> ret = map_range(event->rb, vma);
> - if (ret)
> - perf_mmap_close(vma);
> - }
> + if (likely(!ret))
> + return 0;
> +
> + /* Error path */
>
> + /*
> + * If this is the first mmap(), then event->mmap_count should
> + * be stable at 1. It is only modified by:
> + * perf_mmap_{open,close}() and perf_mmap().
> + *
> + * The former are not possible because this mmap() hasn't been
> + * successful yet, and the latter is serialized by
> + * event->mmap_mutex which we still hold (note that mmap_lock
> + * is not strictly sufficient here, because the event fd can
> + * be passed to another process through trivial means like
> + * fork(), leading to concurrent mmap() from different mm).
> + *
> + * Make sure to remove event->rb before releasing
> + * event->mmap_mutex, such that any concurrent mmap() will not
> + * attempt use this failed buffer.
> + */
> + if (refcount_read(&event->mmap_count) == 1) {
> + /*
> + * Minimal perf_mmap_close(); there can't be AUX or
> + * other events on account of this being the first.
> + */
> + mapped = get_mapped(event, event_unmapped);
> + if (mapped)
> + mapped(event, vma->vm_mm);
> + perf_mmap_unaccount(vma, event->rb);
> + ring_buffer_attach(event, NULL); /* drops last rb->refcount */
> + refcount_set(&event->mmap_count, 0);
> + return ret;
> + }
> + }
> + perf_mmap_close(vma);
> return ret;
> }
>
> diff --git a/kernel/events/internal.h b/kernel/events/internal.h
> index d9cc57083091..c03c4f2eea57 100644
> --- a/kernel/events/internal.h
> +++ b/kernel/events/internal.h
> @@ -67,6 +67,7 @@ static inline void rb_free_rcu(struct rcu_head *rcu_head)
> struct perf_buffer *rb;
>
> rb = container_of(rcu_head, struct perf_buffer, rcu_head);
> + free_uid(rb->mmap_user);
> rb_free(rb);
> }
>
> diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
> index 3e7de2661417..9fe92161715e 100644
> --- a/kernel/events/ring_buffer.c
> +++ b/kernel/events/ring_buffer.c
> @@ -340,6 +340,8 @@ ring_buffer_init(struct perf_buffer *rb, long watermark, int flags)
> rb->paused = 1;
>
> mutex_init(&rb->aux_mutex);
> + rb->mmap_user = get_current_user();
> + refcount_set(&rb->mmap_count, 1);
> }
>
> void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags)
>
> base-commit: 77de62ad3de3967818c3dbe656b7336ebee461d2
> --
> 2.51.0
>
On Wed, Mar 25, 2026 at 04:17:35PM +0100, Peter Zijlstra wrote:
>
> Argh, why is this hidden in this old thread :/
>
> On Wed, Mar 25, 2026 at 06:20:53PM +0800, yuhaocheng035@gmail.com wrote:
>
> > diff --git a/kernel/events/core.c b/kernel/events/core.c
> > index 2c35acc2722b..a3228c587de1 100644
> > --- a/kernel/events/core.c
> > +++ b/kernel/events/core.c
> > @@ -6730,9 +6730,10 @@ static void perf_pmu_output_stop(struct perf_event *event);
> > * the buffer here, where we still have a VM context. This means we need
> > * to detach all events redirecting to us.
> > */
> > -static void perf_mmap_close(struct vm_area_struct *vma)
> > +static void __perf_mmap_close(struct vm_area_struct *vma, struct perf_event *event,
> > + bool holds_event_mmap_lock)
> > {
> > - struct perf_event *event = vma->vm_file->private_data;
> > + struct perf_event *iter_event;
> > mapped_f unmapped = get_mapped(event, event_unmapped);
> > struct perf_buffer *rb = ring_buffer_get(event);
> > struct user_struct *mmap_user = rb->mmap_user;
> > @@ -6772,11 +6773,14 @@ static void perf_mmap_close(struct vm_area_struct *vma)
> > if (refcount_dec_and_test(&rb->mmap_count))
> > detach_rest = true;
> >
> > - if (!refcount_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
> > + if ((!holds_event_mmap_lock &&
> > + !refcount_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) ||
> > + (holds_event_mmap_lock && !refcount_dec_and_test(&event->mmap_count)))
> > goto out_put;
>
> *groan*, this is horrible.
>
> Let me have a poke to see if there isn't a saner variant around.
Also, I just realized this patch doesn't even apply; it is against a
tree without 77de62ad3de3 ("perf/core: Fix refcount bug and potential
UAF in perf_mmap").
On Wed, Mar 25, 2026 at 3:21 AM <yuhaocheng035@gmail.com> wrote:
>
> From: Haocheng Yu <yuhaocheng035@gmail.com>
>
> Syzkaller reported a refcount_t: addition on 0; use-after-free warning
> in perf_mmap.
>
> The issue is caused by a race condition between a failing mmap() setup
> and a concurrent mmap() on a dependent event (e.g., using output
> redirection).
>
> In perf_mmap(), the ring_buffer (rb) is allocated and assigned to
> event->rb with the mmap_mutex held. The mutex is then released to
> perform map_range().
>
> If map_range() fails, perf_mmap_close() is called to clean up.
> However, since the mutex was dropped, another thread attaching to
> this event (via inherited events or output redirection) can acquire
> the mutex, observe the valid event->rb pointer, and attempt to
> increment its reference count. If the cleanup path has already
> dropped the reference count to zero, this results in a
> use-after-free or refcount saturation warning.
>
> Fix this by extending the scope of mmap_mutex to cover the
> map_range() call. This ensures that ring buffer initialization
> and mapping (or cleanup on failure) happen atomically, effectively
> preventing other threads from accessing a half-initialized or
> dying ring buffer.
>
> v2:
> Because extending the guarded region would cause event->mmap_mutex
> to be acquired again inside perf_mmap_close(), leading to a
> self-deadlock, the original perf_mmap_close() logic was retained
> and a perf_mmap_close_locked() variant was added for callers that
> already hold the mutex.
>
> v3:
> The fix is made smaller by passing a "holds_event_mmap_lock"
> parameter to __perf_mmap_close().
>
> Reported-by: kernel test robot <lkp@intel.com>
> Closes: https://lore.kernel.org/oe-kbuild-all/202602020208.m7KIjdzW-lkp@intel.com/
> Suggested-by: Ian Rogers <irogers@google.com>
> Signed-off-by: Haocheng Yu <yuhaocheng035@gmail.com>
Reviewed-by: Ian Rogers <irogers@google.com>
Thanks,
Ian
> ---
> kernel/events/core.c | 78 +++++++++++++++++++++++++++-----------------
> 1 file changed, 48 insertions(+), 30 deletions(-)
>
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 2c35acc2722b..a3228c587de1 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -6730,9 +6730,10 @@ static void perf_pmu_output_stop(struct perf_event *event);
> * the buffer here, where we still have a VM context. This means we need
> * to detach all events redirecting to us.
> */
> -static void perf_mmap_close(struct vm_area_struct *vma)
> +static void __perf_mmap_close(struct vm_area_struct *vma, struct perf_event *event,
> + bool holds_event_mmap_lock)
> {
> - struct perf_event *event = vma->vm_file->private_data;
> + struct perf_event *iter_event;
> mapped_f unmapped = get_mapped(event, event_unmapped);
> struct perf_buffer *rb = ring_buffer_get(event);
> struct user_struct *mmap_user = rb->mmap_user;
> @@ -6772,11 +6773,14 @@ static void perf_mmap_close(struct vm_area_struct *vma)
> if (refcount_dec_and_test(&rb->mmap_count))
> detach_rest = true;
>
> - if (!refcount_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
> + if ((!holds_event_mmap_lock &&
> + !refcount_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) ||
> + (holds_event_mmap_lock && !refcount_dec_and_test(&event->mmap_count)))
> goto out_put;
>
> ring_buffer_attach(event, NULL);
> - mutex_unlock(&event->mmap_mutex);
> + if (!holds_event_mmap_lock)
> + mutex_unlock(&event->mmap_mutex);
>
> /* If there's still other mmap()s of this buffer, we're done. */
> if (!detach_rest)
> @@ -6789,8 +6793,8 @@ static void perf_mmap_close(struct vm_area_struct *vma)
> */
> again:
> rcu_read_lock();
> - list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
> - if (!atomic_long_inc_not_zero(&event->refcount)) {
> + list_for_each_entry_rcu(iter_event, &rb->event_list, rb_entry) {
> + if (!atomic_long_inc_not_zero(&iter_event->refcount)) {
> /*
> * This event is en-route to free_event() which will
> * detach it and remove it from the list.
> @@ -6799,7 +6803,8 @@ static void perf_mmap_close(struct vm_area_struct *vma)
> }
> rcu_read_unlock();
>
> - mutex_lock(&event->mmap_mutex);
> + if (!holds_event_mmap_lock)
> + mutex_lock(&iter_event->mmap_mutex);
> /*
> * Check we didn't race with perf_event_set_output() which can
> * swizzle the rb from under us while we were waiting to
> @@ -6810,11 +6815,12 @@ static void perf_mmap_close(struct vm_area_struct *vma)
> * still restart the iteration to make sure we're not now
> * iterating the wrong list.
> */
> - if (event->rb == rb)
> - ring_buffer_attach(event, NULL);
> + if (iter_event->rb == rb)
> + ring_buffer_attach(iter_event, NULL);
>
> - mutex_unlock(&event->mmap_mutex);
> - put_event(event);
> + if (!holds_event_mmap_lock)
> + mutex_unlock(&iter_event->mmap_mutex);
> + put_event(iter_event);
>
> /*
> * Restart the iteration; either we're on the wrong list or
> @@ -6842,6 +6848,18 @@ static void perf_mmap_close(struct vm_area_struct *vma)
> ring_buffer_put(rb); /* could be last */
> }
>
> +static void perf_mmap_close(struct vm_area_struct *vma)
> +{
> + struct perf_event *event = vma->vm_file->private_data;
> +
> + __perf_mmap_close(vma, event, false);
> +}
> +
> +static void perf_mmap_close_locked(struct vm_area_struct *vma, struct perf_event *event)
> +{
> + __perf_mmap_close(vma, event, true);
> +}
> +
> static vm_fault_t perf_mmap_pfn_mkwrite(struct vm_fault *vmf)
> {
> /* The first page is the user control page, others are read-only. */
> @@ -7167,28 +7185,28 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
> ret = perf_mmap_aux(vma, event, nr_pages);
> if (ret)
> return ret;
> - }
>
> - /*
> - * Since pinned accounting is per vm we cannot allow fork() to copy our
> - * vma.
> - */
> - vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP);
> - vma->vm_ops = &perf_mmap_vmops;
> + /*
> + * Since pinned accounting is per vm we cannot allow fork() to copy our
> + * vma.
> + */
> + vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP);
> + vma->vm_ops = &perf_mmap_vmops;
>
> - mapped = get_mapped(event, event_mapped);
> - if (mapped)
> - mapped(event, vma->vm_mm);
> + mapped = get_mapped(event, event_mapped);
> + if (mapped)
> + mapped(event, vma->vm_mm);
>
> - /*
> - * Try to map it into the page table. On fail, invoke
> - * perf_mmap_close() to undo the above, as the callsite expects
> - * full cleanup in this case and therefore does not invoke
> - * vmops::close().
> - */
> - ret = map_range(event->rb, vma);
> - if (ret)
> - perf_mmap_close(vma);
> + /*
> + * Try to map it into the page table. On fail, invoke
> + * perf_mmap_close() to undo the above, as the callsite expects
> + * full cleanup in this case and therefore does not invoke
> + * vmops::close().
> + */
> + ret = map_range(event->rb, vma);
> + if (ret)
> + perf_mmap_close_locked(vma, event);
> + }
>
> return ret;
> }
>
> base-commit: 7d0a66e4bb9081d75c82ec4957c50034cb0ea449
> --
> 2.51.0
>