As Alexei suggested, the return value from get_perf_callchain() may be
reused if another task preempts and requests the stack after the BPF
program has switched to migrate disable.
Reported-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Tao Chen <chen.dylane@linux.dev>
---
kernel/bpf/stackmap.c | 14 +++++---------
1 file changed, 5 insertions(+), 9 deletions(-)
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 2e182a3ac4c..07892320906 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -314,8 +314,10 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
 	if (max_depth > sysctl_perf_event_max_stack)
 		max_depth = sysctl_perf_event_max_stack;
 
+	preempt_disable();
 	trace = get_perf_callchain(regs, 0, kernel, user, max_depth,
 				   false, false);
+	preempt_enable();
 
 	if (unlikely(!trace))
 		/* couldn't fetch the stack trace */
@@ -443,9 +445,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
 	if (sysctl_perf_event_max_stack < max_depth)
 		max_depth = sysctl_perf_event_max_stack;
 
-	if (may_fault)
-		rcu_read_lock(); /* need RCU for perf's callchain below */
-
+	preempt_disable();
 	if (trace_in)
 		trace = trace_in;
 	else if (kernel && task)
@@ -455,8 +455,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
 				   crosstask, false);
 
 	if (unlikely(!trace) || trace->nr < skip) {
-		if (may_fault)
-			rcu_read_unlock();
+		preempt_enable();
 		goto err_fault;
 	}
 
@@ -474,10 +473,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
 	} else {
 		memcpy(buf, ips, copy_len);
 	}
-
-	/* trace/ips should not be dereferenced after this point */
-	if (may_fault)
-		rcu_read_unlock();
+	preempt_enable();
 
 	if (user_build_id)
 		stack_map_get_build_id_offset(buf, trace_nr, user, may_fault);
--
2.48.1
On Mon, Sep 22, 2025 at 12:54 AM Tao Chen <chen.dylane@linux.dev> wrote:
>
> As Alexei suggested, the return value from get_perf_callchain() may be
> reused if another task preempts and requests the stack after the BPF
> program has switched to migrate disable.
>
> Reported-by: Alexei Starovoitov <ast@kernel.org>
> Signed-off-by: Tao Chen <chen.dylane@linux.dev>
> ---
>  kernel/bpf/stackmap.c | 14 +++++---------
>  1 file changed, 5 insertions(+), 9 deletions(-)
>
> diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
> index 2e182a3ac4c..07892320906 100644
> --- a/kernel/bpf/stackmap.c
> +++ b/kernel/bpf/stackmap.c
> @@ -314,8 +314,10 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
> 	if (max_depth > sysctl_perf_event_max_stack)
> 		max_depth = sysctl_perf_event_max_stack;
>
> +	preempt_disable();
> 	trace = get_perf_callchain(regs, 0, kernel, user, max_depth,
> 				   false, false);
> +	preempt_enable();

This is obviously wrong.
As soon as preemption is enabled, trace can be overwritten.
guard(preempt)();
can fix it, but the length of the preempt disabled section
will be quite big.
The way get_perf_callchain() api is written I don't see
another option though. Unless we refactor it similar
to bpf_try_get_buffers().

pw-bot: cr
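For illustration, a minimal, untested sketch of what the guard(preempt)() scope Alexei
describes could look like in bpf_get_stackid(), assuming the current kernel/bpf/stackmap.c
structure quoted above; preemption stays disabled until the function returns, i.e. until
__bpf_get_stackid() has finished consuming the per-CPU callchain entry:

/*
 * Untested sketch only. The guard keeps preemption disabled from before
 * get_perf_callchain() until return, so the per-CPU perf_callchain_entry
 * cannot be reused by a task that preempts us on this CPU while
 * __bpf_get_stackid() is still reading it.
 */
#include <linux/cleanup.h>	/* guard() */
#include <linux/preempt.h>	/* "preempt" lock guard */
#include <linux/perf_event.h>	/* get_perf_callchain() */

BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
	   u64, flags)
{
	u32 max_depth = map->value_size / stack_map_data_size(map);
	bool user = flags & BPF_F_USER_STACK;
	struct perf_callchain_entry *trace;
	bool kernel = !user;

	/* ... flag validation and max_depth clamping as in the current code ... */

	guard(preempt)();	/* preemption re-enabled only on return */

	trace = get_perf_callchain(regs, 0, kernel, user, max_depth,
				   false, false);
	if (unlikely(!trace))
		/* couldn't fetch the stack trace */
		return -EFAULT;

	return __bpf_get_stackid(map, trace, flags);
}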
On 2025/9/23 10:53, Alexei Starovoitov wrote:
> On Mon, Sep 22, 2025 at 12:54 AM Tao Chen <chen.dylane@linux.dev> wrote:
>>
>> As Alexei suggested, the return value from get_perf_callchain() may be
>> reused if another task preempts and requests the stack after BPF program
>> switched to migrate disable.
>>
>> Reported-by: Alexei Starovoitov <ast@kernel.org>
>> Signed-off-by: Tao Chen <chen.dylane@linux.dev>
>> ---
>> kernel/bpf/stackmap.c | 14 +++++---------
>> 1 file changed, 5 insertions(+), 9 deletions(-)
>>
>> diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
>> index 2e182a3ac4c..07892320906 100644
>> --- a/kernel/bpf/stackmap.c
>> +++ b/kernel/bpf/stackmap.c
>> @@ -314,8 +314,10 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
>> if (max_depth > sysctl_perf_event_max_stack)
>> max_depth = sysctl_perf_event_max_stack;
>>
>> + preempt_disable();
>> trace = get_perf_callchain(regs, 0, kernel, user, max_depth,
>> false, false);
>> + preempt_enable();
>
> This is obviously wrong.
> As soon as preemption is enabled, trace can be overwritten.
> guard(preempt)();
> can fix it, but the length of the preempt disabled section
> will be quite big.
> The way get_perf_callchain() api is written I don't see
> another option though. Unless we refactor it similar
> to bpf_try_get_buffers().
>
> pw-bot: cr
Hi Alexei,
I tried to understand what you meant and looked at the implementation of
get_perf_callchain().

There is only one perf_callchain_entry per CPU right now:
  callchain_cpus_entries (RCU global variable)
          ↓
  struct callchain_cpus_entries {
          struct perf_callchain_entry *cpu_entries[];
  }          |
             |-> perf_callchain_entry0   cpu0
                 perf_callchain_entry1   cpu1
                 …
                 perf_callchain_entryn   cpun
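For reference, a simplified sketch of the lookup this diagram describes (not the exact
kernel/events/callchain.c code, which also sizes the entries dynamically and keeps one
entry per recursion context); it shows why every caller on a CPU ends up sharing the
same buffer:

#include <linux/perf_event.h>
#include <linux/rcupdate.h>
#include <linux/smp.h>

struct callchain_cpus_entries {
	struct rcu_head			rcu_head;
	struct perf_callchain_entry	*cpu_entries[];
};

static struct callchain_cpus_entries __rcu *callchain_cpus_entries;

/*
 * Simplified: hand back this CPU's (single) entry. A task that preempts
 * a BPF program on the same CPU and also requests a stack writes into
 * the very same buffer the BPF program may still be reading.
 */
static struct perf_callchain_entry *sketch_get_callchain_entry(void)
{
	struct callchain_cpus_entries *entries;

	entries = rcu_dereference(callchain_cpus_entries);
	if (!entries)
		return NULL;
	return entries->cpu_entries[smp_processor_id()];
}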
If we want to implement it like bpf_try_get_buffers(), we should
allocate a perf_callchain_entry array on every CPU, right?
  callchain_cpus_entries (RCU global variable)
          ↓
  struct callchain_cpus_entries {
          struct perf_callchain_entry *cpu_entries[];
  }          |
             |-> perf_callchain_entry0[N]   cpu0
                 perf_callchain_entry1[N]   cpu1
                 …
                 perf_callchain_entryn[N]   cpun
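For illustration only, a rough sketch of that per-CPU array if it borrowed the nest-level
pattern of bpf_try_get_buffers(); none of these names exist in the kernel today, and
fixing the storage at PERF_MAX_STACK_DEPTH frames is a simplification:

#include <linux/percpu.h>
#include <linux/perf_event.h>

#define MAX_CALLCHAIN_NEST_LEVEL 3	/* assumption, mirroring bpf_try_get_buffers() */

/* Same layout as struct perf_callchain_entry, but with fixed storage. */
struct bpf_callchain_entry {
	u64 nr;
	u64 ip[PERF_MAX_STACK_DEPTH];
};

static DEFINE_PER_CPU(struct bpf_callchain_entry,
		      bpf_callchain_entries[MAX_CALLCHAIN_NEST_LEVEL]);
static DEFINE_PER_CPU(int, bpf_callchain_nest_level);

/*
 * Hand out the next free per-CPU entry. A task that preempts us on this
 * CPU bumps the nest level and gets its own slot instead of overwriting
 * ours. Callers are expected to run with migration disabled (as BPF
 * programs do), so get/put pair up on the same CPU.
 */
static int bpf_try_get_callchain_entry(struct perf_callchain_entry **entry)
{
	int nest_level = this_cpu_inc_return(bpf_callchain_nest_level);

	if (WARN_ON_ONCE(nest_level > MAX_CALLCHAIN_NEST_LEVEL)) {
		this_cpu_dec(bpf_callchain_nest_level);
		return -EBUSY;
	}
	*entry = (struct perf_callchain_entry *)
			this_cpu_ptr(&bpf_callchain_entries[nest_level - 1]);
	return 0;
}

static void bpf_put_callchain_entry(void)
{
	if (WARN_ON_ONCE(this_cpu_read(bpf_callchain_nest_level) == 0))
		return;
	this_cpu_dec(bpf_callchain_nest_level);
}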
--
Best Regards
Tao Chen
On Thu, Sep 25, 2025 at 10:45 AM Tao Chen <chen.dylane@linux.dev> wrote:
>
> On 2025/9/23 10:53, Alexei Starovoitov wrote:
> [...]
>
> Hi Alexei,
>
> I tried to understand what you meant and looked at the implementation of
> get_perf_callchain().
>
> There is only one perf_callchain_entry per CPU right now.
>
> [...]
>
> If we want to implement it like bpf_try_get_buffers(), we should
> allocate a perf_callchain_entry array on every CPU, right?
>
Either allow a few entries per CPU (bpf_try_get_buffers allows up to 3
buffers per CPU), or extend get_perf_callchain() to accept
perf_callchain_entry from outside, and then we can do that in a
BPF-specific way.
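For illustration, one possible shape of that second option: a hypothetical
get_perf_callchain_in() variant that fills a caller-supplied entry, combined with the
per-CPU slot helpers sketched earlier in the thread. Neither the prototype nor the call
site below exists in the kernel; names and parameters are made up.

#include <linux/minmax.h>
#include <linux/perf_event.h>
#include <linux/string.h>

/* Hypothetical variant: like get_perf_callchain(), but writes into a
 * caller-provided entry instead of the per-CPU one owned by
 * kernel/events/callchain.c. */
struct perf_callchain_entry *
get_perf_callchain_in(struct perf_callchain_entry *entry,
		      struct pt_regs *regs, u32 init_nr, bool kernel,
		      bool user, u32 max_stack, bool crosstask, bool add_mark);

/* stackmap.c could then own the buffer for the whole copy-out while
 * leaving preemption enabled. */
static long bpf_get_stack_sketch(struct pt_regs *regs, void *buf, u32 size)
{
	struct perf_callchain_entry *entry, *trace;
	u32 copy_len;

	if (bpf_try_get_callchain_entry(&entry))
		return -EBUSY;

	trace = get_perf_callchain_in(entry, regs, 0, true, false,
				      PERF_MAX_STACK_DEPTH, false, false);
	if (unlikely(!trace)) {
		bpf_put_callchain_entry();
		return -EFAULT;
	}

	/* The entry is private to us until bpf_put_callchain_entry(), so a
	 * task preempting us on this CPU cannot overwrite trace->ip[]. */
	copy_len = min_t(u32, trace->nr * sizeof(u64), size);
	memcpy(buf, trace->ip, copy_len);

	bpf_put_callchain_entry();
	return copy_len;
}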