kernel/events/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)
In order to fix the race condition between exit_mmap and
perf_output_sample below, forbidding to copy the user stack
of an exiting process.
Internal error: synchronous external abort: ffffffff96000010 [#1]
PREEMPT SMP
CPU: 3 PID: 2651 Comm: binder:2649_1 Tainted: G W OE
5.15.149-android13-8-00008-gbe074b05e5af-ab12096863 #1
Hardware name: Spreadtrum UMS9230 1H10 SoC (DT)
pstate: 204000c5 (nzCv daIF +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
pc : __arch_copy_from_user+0x180/0x218
lr : arch_perf_out_copy_user+0xb0/0x17c
sp : ffffffc00801baf0
x29: ffffffc00801baf0 x28: ffffffc00801bbf8 x27: ffffffc00801bbe8
x26: 0000000000000000 x25: 0000000000001000 x24: 000000000000feb8
x23: 00000000000005f0 x22: ffffff80613c8000 x21: ffffff8143102a10
x20: 0000007c239643c0 x19: 00000000000005f0 x18: ffffffc00801d058
x17: ffffffc16e677000 x16: ffffffc008018000 x15: 0000007c239643c0
x14: 0000000000000002 x13: 0000000000000003 x12: ffffffc008000000
x11: ffffff8000090000 x10: ffffffc008090000 x9 : 0000007fffffffff
x8 : 0000007c239643c0 x7 : 000000000000feb8 x6 : ffffff8143102a10
x5 : ffffff8143103000 x4 : 0000000000000000 x3 : ffffff8093cad140
x2 : 0000000000000570 x1 : 0000007c239643c0 x0 : ffffff8143102a10
Call trace:
__arch_copy_from_user+0x180/0x218
perf_output_sample+0x14e4/0x1904
perf_event_output_forward+0x90/0x130
__perf_event_overflow+0xc8/0x17c
perf_swevent_hrtimer+0x124/0x290
__run_hrtimer+0x134/0x4a0
hrtimer_interrupt+0x2e4/0x560
arch_timer_handler_phys+0x5c/0xa0
handle_percpu_devid_irq+0xc0/0x374
handle_domain_irq+0xd8/0x160
gic_handle_irq.34215+0x58/0x26c
call_on_irq_stack+0x3c/0x70
do_interrupt_handler+0x44/0xa0
el1_interrupt+0x34/0x64
el1h_64_irq_handler+0x1c/0x2c
el1h_64_irq+0x7c/0x80
release_pages+0xac/0x9b4
tlb_finish_mmu+0xb0/0x238
exit_mmap+0x1b8/0x538
__mmput+0x40/0x274
mmput+0x40/0x134
exit_mm+0x3bc/0x72c
do_exit+0x294/0x1160
do_group_exit+0xc8/0x174
get_signal+0x830/0x95c
do_signal+0x9c/0x2a8
do_notify_resume+0x98/0x1ac
el0_svc+0x5c/0x84
el0t_64_sync_handler+0x88/0xec
el0t_64_sync+0x1b8/0x1bc
Signed-off-by: Baisheng Gao <baisheng.gao@unisoc.com>
---
kernel/events/core.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index e93c19565914..9c9b571b812d 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7867,7 +7867,8 @@ void perf_output_sample(struct perf_output_handle *handle,
}
}
- if (sample_type & PERF_SAMPLE_STACK_USER) {
+ if (sample_type & PERF_SAMPLE_STACK_USER &&
+ !(current->flags & PF_EXITING)) {
perf_output_sample_ustack(handle,
data->stack_user_size,
data->regs_user.regs);
--
2.34.1
________________________________
This email (including its attachments) is intended only for the person or entity to which it is addressed and may contain information that is privileged, confidential or otherwise protected from disclosure. Unauthorized use, dissemination, distribution or copying of this email or the information herein or taking any action in reliance on the contents of this email or the information herein, by anyone other than the intended recipient, or an employee or agent responsible for delivering the message to the intended recipient, is strictly prohibited. If you are not the intended recipient, please do not read, copy, use or disclose any part of this e-mail to others. Please notify the sender immediately and permanently delete this e-mail and any attachments if you received it in error. Internet communications cannot be guaranteed to be timely, secure, error-free or virus-free. The sender does not accept liability for any errors or omissions.
本邮件及其附件具有保密性质,受法律保护不得泄露,仅发送给本邮件所指特定收件人。严禁非经授权使用、宣传、发布或复制本邮件或其内容。若非该特定收件人,请勿阅读、复制、 使用或披露本邮件的任何内容。若误收本邮件,请从系统中永久性删除本邮件及所有附件,并以回复邮件的方式即刻告知发件人。无法保证互联网通信及时、安全、无误或防毒。发件人对任何错漏均不承担责任。
On Thu, Apr 24, 2025 at 10:54:29AM +0800, Baisheng Gao wrote:
> In order to fix the race condition between exit_mmap and
> perf_output_sample below, forbidding to copy the user stack
> of an exiting process.
>
> Internal error: synchronous external abort: ffffffff96000010 [#1]
> PREEMPT SMP
That ESR value (0x96000010, which got sign-extended somewhere), is a
synchronous external abort, not on translation table walk.
That means that some memory access was translated to some PA where the
endpoint responded with an error (and it strongly implies you're poking
MMIO or a PA terminated early by the interconnect).
> CPU: 3 PID: 2651 Comm: binder:2649_1 Tainted: G W OE
> 5.15.149-android13-8-00008-gbe074b05e5af-ab12096863 #1
> Hardware name: Spreadtrum UMS9230 1H10 SoC (DT)
> pstate: 204000c5 (nzCv daIF +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
> pc : __arch_copy_from_user+0x180/0x218
> lr : arch_perf_out_copy_user+0xb0/0x17c
> sp : ffffffc00801baf0
> x29: ffffffc00801baf0 x28: ffffffc00801bbf8 x27: ffffffc00801bbe8
> x26: 0000000000000000 x25: 0000000000001000 x24: 000000000000feb8
> x23: 00000000000005f0 x22: ffffff80613c8000 x21: ffffff8143102a10
> x20: 0000007c239643c0 x19: 00000000000005f0 x18: ffffffc00801d058
> x17: ffffffc16e677000 x16: ffffffc008018000 x15: 0000007c239643c0
> x14: 0000000000000002 x13: 0000000000000003 x12: ffffffc008000000
> x11: ffffff8000090000 x10: ffffffc008090000 x9 : 0000007fffffffff
> x8 : 0000007c239643c0 x7 : 000000000000feb8 x6 : ffffff8143102a10
> x5 : ffffff8143103000 x4 : 0000000000000000 x3 : ffffff8093cad140
> x2 : 0000000000000570 x1 : 0000007c239643c0 x0 : ffffff8143102a10
> Call trace:
> __arch_copy_from_user+0x180/0x218
> perf_output_sample+0x14e4/0x1904
> perf_event_output_forward+0x90/0x130
> __perf_event_overflow+0xc8/0x17c
> perf_swevent_hrtimer+0x124/0x290
> __run_hrtimer+0x134/0x4a0
> hrtimer_interrupt+0x2e4/0x560
> arch_timer_handler_phys+0x5c/0xa0
> handle_percpu_devid_irq+0xc0/0x374
> handle_domain_irq+0xd8/0x160
> gic_handle_irq.34215+0x58/0x26c
> call_on_irq_stack+0x3c/0x70
> do_interrupt_handler+0x44/0xa0
> el1_interrupt+0x34/0x64
> el1h_64_irq_handler+0x1c/0x2c
> el1h_64_irq+0x7c/0x80
> release_pages+0xac/0x9b4
> tlb_finish_mmu+0xb0/0x238
> exit_mmap+0x1b8/0x538
> __mmput+0x40/0x274
> mmput+0x40/0x134
> exit_mm+0x3bc/0x72c
The mmput() here happens after current->mm was set to NULL and the task
entered lazy TLB mode.
AFAICT, either:
* Something has gone wrong such that the CPU is still using the
translation tables of the exiting task, and those are in some
inconsistent state.
* Due to lazymm, the active_mm belongs to some other tas, and perf is
sampling from that other task's VA space (which does not correlate at
all with the pt_regs). That other task happens to have some MMIO
mapped.
... either of which is a pretty horrid bug.
Loooking at 5.15.149 and current HEAD (5abc7438f1e9), do_exit() calls
exit_mm() before perf_event_exit_task(), so it looks
like perf could sample from another task's mm.
Yuck.
Peter, does the above sound plausible to you?
Mark.
> do_exit+0x294/0x1160
> do_group_exit+0xc8/0x174
> get_signal+0x830/0x95c
> do_signal+0x9c/0x2a8
> do_notify_resume+0x98/0x1ac
> el0_svc+0x5c/0x84
> el0t_64_sync_handler+0x88/0xec
> el0t_64_sync+0x1b8/0x1bc
>
> Signed-off-by: Baisheng Gao <baisheng.gao@unisoc.com>
> ---
> kernel/events/core.c | 3 ++-
> 1 file changed, 2 insertions(+), 1 deletion(-)
>
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index e93c19565914..9c9b571b812d 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -7867,7 +7867,8 @@ void perf_output_sample(struct perf_output_handle *handle,
> }
> }
>
> - if (sample_type & PERF_SAMPLE_STACK_USER) {
> + if (sample_type & PERF_SAMPLE_STACK_USER &&
> + !(current->flags & PF_EXITING)) {
> perf_output_sample_ustack(handle,
> data->stack_user_size,
> data->regs_user.regs);
> --
> 2.34.1
>
> ________________________________
> This email (including its attachments) is intended only for the person or entity to which it is addressed and may contain information that is privileged, confidential or otherwise protected from disclosure. Unauthorized use, dissemination, distribution or copying of this email or the information herein or taking any action in reliance on the contents of this email or the information herein, by anyone other than the intended recipient, or an employee or agent responsible for delivering the message to the intended recipient, is strictly prohibited. If you are not the intended recipient, please do not read, copy, use or disclose any part of this e-mail to others. Please notify the sender immediately and permanently delete this e-mail and any attachments if you received it in error. Internet communications cannot be guaranteed to be timely, secure, error-free or virus-free. The sender does not accept liability for any errors or omissions.
> 本邮件及其附件具有保密性质,受法律保护不得泄露,仅发送给本邮件所指特定收件人。严禁非经授权使用、宣传、发布或复制本邮件或其内容。若非该特定收件人,请勿阅读、复制、 使用或披露本邮件的任何内容。若误收本邮件,请从系统中永久性删除本邮件及所有附件,并以回复邮件的方式即刻告知发件人。无法保证互联网通信及时、安全、无误或防毒。发件人对任何错漏均不承担责任。
On Wed, Jun 04, 2025 at 03:05:43PM +0100, Mark Rutland wrote: > Loooking at 5.15.149 and current HEAD (5abc7438f1e9), do_exit() calls > exit_mm() before perf_event_exit_task(), so it looks > like perf could sample from another task's mm. > > Yuck. > > Peter, does the above sound plausible to you? Yuck indeed. And yeah, we should probably re-arrange things there. Something like so? --- diff --git a/kernel/exit.c b/kernel/exit.c index 38645039dd8f..3407c16fc5a3 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -944,6 +944,15 @@ void __noreturn do_exit(long code) taskstats_exit(tsk, group_dead); trace_sched_process_exit(tsk, group_dead); + /* + * Since samping can touch ->mm, make sure to stop everything before we + * tear it down. + * + * Also flushes inherited counters to the parent - before the parent + * gets woken up by child-exit notifications. + */ + perf_event_exit_task(tsk); + exit_mm(); if (group_dead) @@ -959,14 +968,6 @@ void __noreturn do_exit(long code) exit_task_work(tsk); exit_thread(tsk); - /* - * Flush inherited counters to the parent - before the parent - * gets woken up by child-exit notifications. - * - * because of cgroup mode, must be called before cgroup_exit() - */ - perf_event_exit_task(tsk); - sched_autogroup_exit_task(tsk); cgroup_exit(tsk);
On Wed, Jun 04, 2025 at 04:24:37PM +0200, Peter Zijlstra wrote: > On Wed, Jun 04, 2025 at 03:05:43PM +0100, Mark Rutland wrote: > > > Loooking at 5.15.149 and current HEAD (5abc7438f1e9), do_exit() calls > > exit_mm() before perf_event_exit_task(), so it looks > > like perf could sample from another task's mm. > > > > Yuck. > > > > Peter, does the above sound plausible to you? > > Yuck indeed. And yeah, we should probably re-arrange things there. > > Something like so? That should plumb the hole for task-bound events, yep. I think we might need something in the perf core for cpu-bound events, assuming those can also potentially make samples. From a quick scan of perf_event_sample_format: PERF_SAMPLE_IP // safe PERF_SAMPLE_TID // safe PERF_SAMPLE_TIME // safe PERF_SAMPLE_ADDR // ??? PERF_SAMPLE_READ // ??? PERF_SAMPLE_CALLCHAIN // may access mm PERF_SAMPLE_ID // safe PERF_SAMPLE_CPU // safe PERF_SAMPLE_PERIOD // safe PERF_SAMPLE_STREAM_ID // ??? PERF_SAMPLE_RAW // ??? PERF_SAMPLE_BRANCH_STACK // safe PERF_SAMPLE_REGS_USER // safe PERF_SAMPLE_STACK_USER // may access mm PERF_SAMPLE_WEIGHT // ??? PERF_SAMPLE_DATA_SRC // ??? PERF_SAMPLE_IDENTIFIER // safe PERF_SAMPLE_TRANSACTION // ??? PERF_SAMPLE_REGS_INTR // safe PERF_SAMPLE_PHYS_ADDR // safe; handles mm==NULL && addr < TASK_SIZE PERF_SAMPLE_AUX // ??? PERF_SAMPLE_CGROUP // safe PERF_SAMPLE_DATA_PAGE_SIZE // partial; doesn't check addr < TASK_SIZE PERF_SAMPLE_CODE_PAGE_SIZE // partial; doesn't check addr < TASK_SIZE PERF_SAMPLE_WEIGHT_STRUCT // ??? ... I think all the dodgy cases use mm somehow, so maybe the perf core should check for current->mm? > > --- > diff --git a/kernel/exit.c b/kernel/exit.c > index 38645039dd8f..3407c16fc5a3 100644 > --- a/kernel/exit.c > +++ b/kernel/exit.c > @@ -944,6 +944,15 @@ void __noreturn do_exit(long code) > taskstats_exit(tsk, group_dead); > trace_sched_process_exit(tsk, group_dead); > > + /* > + * Since samping can touch ->mm, make sure to stop everything before we Typo: s/samping/sampling/ > + * tear it down. > + * > + * Also flushes inherited counters to the parent - before the parent > + * gets woken up by child-exit notifications. > + */ > + perf_event_exit_task(tsk); > + > exit_mm(); > > if (group_dead) > @@ -959,14 +968,6 @@ void __noreturn do_exit(long code) > exit_task_work(tsk); > exit_thread(tsk); > > - /* > - * Flush inherited counters to the parent - before the parent > - * gets woken up by child-exit notifications. > - * > - * because of cgroup mode, must be called before cgroup_exit() > - */ > - perf_event_exit_task(tsk); > - > sched_autogroup_exit_task(tsk); > cgroup_exit(tsk); > Otherwise, that looks good to me! Mark.
On Wed, Jun 04, 2025 at 03:55:01PM +0100, Mark Rutland wrote: > I think we might need something in the perf core for cpu-bound events, assuming > those can also potentially make samples. > > From a quick scan of perf_event_sample_format: > > PERF_SAMPLE_IP // safe > PERF_SAMPLE_TID // safe > PERF_SAMPLE_TIME // safe > PERF_SAMPLE_ADDR // ??? Safe, set by driver, or 0. > PERF_SAMPLE_READ // ??? This is basically read(2) on a fd, but in sample format. Only the count values. This good. > PERF_SAMPLE_CALLCHAIN // may access mm Right. > PERF_SAMPLE_ID // safe > PERF_SAMPLE_CPU // safe > PERF_SAMPLE_PERIOD // safe > PERF_SAMPLE_STREAM_ID // ??? safe > PERF_SAMPLE_RAW // ??? safe, this is random data returned by the driver > PERF_SAMPLE_BRANCH_STACK // safe > PERF_SAMPLE_REGS_USER // safe > PERF_SAMPLE_STACK_USER // may access mm > PERF_SAMPLE_WEIGHT // ??? > PERF_SAMPLE_DATA_SRC // ??? Both should be safe, driver sets them. > PERF_SAMPLE_IDENTIFIER // safe > PERF_SAMPLE_TRANSACTION // ??? Safe, another random thing the driver can set. This was for transactional memory stuff. > PERF_SAMPLE_REGS_INTR // safe > PERF_SAMPLE_PHYS_ADDR // safe; handles mm==NULL && addr < TASK_SIZE > PERF_SAMPLE_AUX // ??? Safe, should be driver, PT for Intel, or that CoreSight for ARM. > PERF_SAMPLE_CGROUP // safe > PERF_SAMPLE_DATA_PAGE_SIZE // partial; doesn't check addr < TASK_SIZE > PERF_SAMPLE_CODE_PAGE_SIZE // partial; doesn't check addr < TASK_SIZE But does use init_mm when !mm, perf_get_page_size(). > PERF_SAMPLE_WEIGHT_STRUCT // ??? Safe, driver bits again. > > ... I think all the dodgy cases use mm somehow, so maybe the perf core > should check for current->mm? This then... I suppose. --- diff --git a/kernel/events/core.c b/kernel/events/core.c index f34c99f8ce8f..49944e4ec3e7 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7439,6 +7439,10 @@ perf_sample_ustack_size(u16 stack_size, u16 header_size, if (!regs) return 0; + /* No mm, no stack, no dump. */ + if (!current->mm) + return 0; + /* * Check if we fit in with the requested stack size into the: * - TASK_SIZE @@ -8153,6 +8157,9 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs) if (!kernel && !user) return &__empty_callchain; + if (!current->mm) + user = false; + callchain = get_perf_callchain(regs, 0, kernel, user, max_stack, crosstask, true); return callchain ?: &__empty_callchain;
On Wed, Jun 04, 2025 at 05:32:19PM +0200, Peter Zijlstra wrote: > On Wed, Jun 04, 2025 at 03:55:01PM +0100, Mark Rutland wrote: > > > I think we might need something in the perf core for cpu-bound events, assuming > > those can also potentially make samples. > > > > From a quick scan of perf_event_sample_format: > > PERF_SAMPLE_DATA_PAGE_SIZE // partial; doesn't check addr < TASK_SIZE > > PERF_SAMPLE_CODE_PAGE_SIZE // partial; doesn't check addr < TASK_SIZE > > But does use init_mm when !mm, perf_get_page_size(). Yeah; think there might be a distinct issue (at least on arm64) where it's possible to probe the depth of the kernel page tables, but that might only be a problem on arm64 due to the separate TTBR0/TTBR1 tables for the low/high halves. I'll go take another look; that needn't block the rest of this. [...] > > PERF_SAMPLE_WEIGHT_STRUCT // ??? > Safe, driver bits again. Thanks for digging through the rest of these! > > ... I think all the dodgy cases use mm somehow, so maybe the perf core > > should check for current->mm? > > This then... I suppose. That looks good to me! Mark. > --- > diff --git a/kernel/events/core.c b/kernel/events/core.c > index f34c99f8ce8f..49944e4ec3e7 100644 > --- a/kernel/events/core.c > +++ b/kernel/events/core.c > @@ -7439,6 +7439,10 @@ perf_sample_ustack_size(u16 stack_size, u16 header_size, > if (!regs) > return 0; > > + /* No mm, no stack, no dump. */ > + if (!current->mm) > + return 0; > + > /* > * Check if we fit in with the requested stack size into the: > * - TASK_SIZE > @@ -8153,6 +8157,9 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs) > if (!kernel && !user) > return &__empty_callchain; > > + if (!current->mm) > + user = false; > + > callchain = get_perf_callchain(regs, 0, kernel, user, > max_stack, crosstask, true); > return callchain ?: &__empty_callchain;
On Wed, Jun 04, 2025 at 05:08:23PM +0100, Mark Rutland wrote:
> That looks good to me!
I now haz the below patch.
---
Subject: perf: Fix sample vs do_exit()
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu Jun 5 12:31:45 CEST 2025
Baisheng Gao reported an ARM64 crash, which Mark decoded as being a
synchronous external abort -- most likely due to trying to access
MMIO in bad ways.
The crash further shows perf trying to do a user stack sample while in
exit_mmap()'s tlb_finish_mmu() -- i.e. while tearing down the address
space it is trying to access.
It turns out that we stop perf after we tear down the userspace mm; a
receipie for disaster, since perf likes to access userspace for
various reasons.
Flip this order by moving up where we stop perf in do_exit().
Additionally, harden PERF_SAMPLE_CALLCHAIN and PERF_SAMPLE_STACK_USER
to abort when the current task does not have an mm (exit_mm() makes
sure to set current->mm = NULL; before commencing with the actual
teardown). Such that CPU wide events don't trip on this same problem.
Fixes: c5ebcedb566e ("perf: Add ability to attach user stack dump to sample")
Reported-by: Baisheng Gao <baisheng.gao@unisoc.com>
Suggested-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
kernel/events/core.c | 7 +++++++
kernel/exit.c | 17 +++++++++--------
2 files changed, 16 insertions(+), 8 deletions(-)
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7439,6 +7439,10 @@ perf_sample_ustack_size(u16 stack_size,
if (!regs)
return 0;
+ /* No mm, no stack, no dump. */
+ if (!current->mm)
+ return 0;
+
/*
* Check if we fit in with the requested stack size into the:
* - TASK_SIZE
@@ -8150,6 +8154,9 @@ perf_callchain(struct perf_event *event,
const u32 max_stack = event->attr.sample_max_stack;
struct perf_callchain_entry *callchain;
+ if (!current->mm)
+ user = false;
+
if (!kernel && !user)
return &__empty_callchain;
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -944,6 +944,15 @@ void __noreturn do_exit(long code)
taskstats_exit(tsk, group_dead);
trace_sched_process_exit(tsk, group_dead);
+ /*
+ * Since sampling can touch ->mm, make sure to stop everything before we
+ * tear it down.
+ *
+ * Also flushes inherited counters to the parent - before the parent
+ * gets woken up by child-exit notifications.
+ */
+ perf_event_exit_task(tsk);
+
exit_mm();
if (group_dead)
@@ -959,14 +968,6 @@ void __noreturn do_exit(long code)
exit_task_work(tsk);
exit_thread(tsk);
- /*
- * Flush inherited counters to the parent - before the parent
- * gets woken up by child-exit notifications.
- *
- * because of cgroup mode, must be called before cgroup_exit()
- */
- perf_event_exit_task(tsk);
-
sched_autogroup_exit_task(tsk);
cgroup_exit(tsk);
The following commit has been merged into the perf/urgent branch of tip:
Commit-ID: 4f6fc782128355931527cefe3eb45338abd8ab39
Gitweb: https://git.kernel.org/tip/4f6fc782128355931527cefe3eb45338abd8ab39
Author: Peter Zijlstra <peterz@infradead.org>
AuthorDate: Thu, 05 Jun 2025 12:31:45 +02:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Thu, 05 Jun 2025 14:37:51 +02:00
perf: Fix sample vs do_exit()
Baisheng Gao reported an ARM64 crash, which Mark decoded as being a
synchronous external abort -- most likely due to trying to access
MMIO in bad ways.
The crash further shows perf trying to do a user stack sample while in
exit_mmap()'s tlb_finish_mmu() -- i.e. while tearing down the address
space it is trying to access.
It turns out that we stop perf after we tear down the userspace mm; a
receipie for disaster, since perf likes to access userspace for
various reasons.
Flip this order by moving up where we stop perf in do_exit().
Additionally, harden PERF_SAMPLE_CALLCHAIN and PERF_SAMPLE_STACK_USER
to abort when the current task does not have an mm (exit_mm() makes
sure to set current->mm = NULL; before commencing with the actual
teardown). Such that CPU wide events don't trip on this same problem.
Fixes: c5ebcedb566e ("perf: Add ability to attach user stack dump to sample")
Reported-by: Baisheng Gao <baisheng.gao@unisoc.com>
Suggested-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20250605110815.GQ39944@noisy.programming.kicks-ass.net
---
kernel/events/core.c | 7 +++++++
kernel/exit.c | 17 +++++++++--------
2 files changed, 16 insertions(+), 8 deletions(-)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index f34c99f..26dec1d 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7439,6 +7439,10 @@ perf_sample_ustack_size(u16 stack_size, u16 header_size,
if (!regs)
return 0;
+ /* No mm, no stack, no dump. */
+ if (!current->mm)
+ return 0;
+
/*
* Check if we fit in with the requested stack size into the:
* - TASK_SIZE
@@ -8150,6 +8154,9 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
const u32 max_stack = event->attr.sample_max_stack;
struct perf_callchain_entry *callchain;
+ if (!current->mm)
+ user = false;
+
if (!kernel && !user)
return &__empty_callchain;
diff --git a/kernel/exit.c b/kernel/exit.c
index 3864503..1883150 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -944,6 +944,15 @@ void __noreturn do_exit(long code)
taskstats_exit(tsk, group_dead);
trace_sched_process_exit(tsk, group_dead);
+ /*
+ * Since sampling can touch ->mm, make sure to stop everything before we
+ * tear it down.
+ *
+ * Also flushes inherited counters to the parent - before the parent
+ * gets woken up by child-exit notifications.
+ */
+ perf_event_exit_task(tsk);
+
exit_mm();
if (group_dead)
@@ -959,14 +968,6 @@ void __noreturn do_exit(long code)
exit_task_work(tsk);
exit_thread(tsk);
- /*
- * Flush inherited counters to the parent - before the parent
- * gets woken up by child-exit notifications.
- *
- * because of cgroup mode, must be called before cgroup_exit()
- */
- perf_event_exit_task(tsk);
-
sched_autogroup_exit_task(tsk);
cgroup_exit(tsk);
On Thu, Apr 24, 2025 at 10:54:29AM +0800, Baisheng Gao wrote: > In order to fix the race condition between exit_mmap and > perf_output_sample below, forbidding to copy the user stack > of an exiting process. You cannot say "the race" and not describe the race. A stack dump is not a description of a race.
© 2016 - 2026 Red Hat, Inc.