As Alexei suggested, the buffer returned by get_perf_callchain() may be
reused if another task preempts and requests a stack trace, now that BPF
programs run under migrate_disable() rather than preempt_disable().
Reported-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Tao Chen <chen.dylane@linux.dev>
---
kernel/bpf/stackmap.c | 14 +++++---------
1 file changed, 5 insertions(+), 9 deletions(-)
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 2e182a3ac4c..07892320906 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -314,8 +314,10 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
 	if (max_depth > sysctl_perf_event_max_stack)
 		max_depth = sysctl_perf_event_max_stack;
 
+	preempt_disable();
 	trace = get_perf_callchain(regs, 0, kernel, user, max_depth,
 				   false, false);
+	preempt_enable();
 
 	if (unlikely(!trace))
 		/* couldn't fetch the stack trace */
@@ -443,9 +445,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
 	if (sysctl_perf_event_max_stack < max_depth)
 		max_depth = sysctl_perf_event_max_stack;
 
-	if (may_fault)
-		rcu_read_lock(); /* need RCU for perf's callchain below */
-
+	preempt_disable();
 	if (trace_in)
 		trace = trace_in;
 	else if (kernel && task)
@@ -455,8 +455,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
 					   crosstask, false);
 
 	if (unlikely(!trace) || trace->nr < skip) {
-		if (may_fault)
-			rcu_read_unlock();
+		preempt_enable();
 		goto err_fault;
 	}
 
@@ -474,10 +473,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
 	} else {
 		memcpy(buf, ips, copy_len);
 	}
-
-	/* trace/ips should not be dereferenced after this point */
-	if (may_fault)
-		rcu_read_unlock();
+	preempt_enable();
 
 	if (user_build_id)
 		stack_map_get_build_id_offset(buf, trace_nr, user, may_fault);
--
2.48.1
On Mon, Sep 22, 2025 at 12:54 AM Tao Chen <chen.dylane@linux.dev> wrote:
>
> As Alexei suggested, the buffer returned by get_perf_callchain() may be
> reused if another task preempts and requests a stack trace, now that BPF
> programs run under migrate_disable() rather than preempt_disable().
>
> Reported-by: Alexei Starovoitov <ast@kernel.org>
> Signed-off-by: Tao Chen <chen.dylane@linux.dev>
> ---
>  kernel/bpf/stackmap.c | 14 +++++---------
>  1 file changed, 5 insertions(+), 9 deletions(-)
>
> diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
> index 2e182a3ac4c..07892320906 100644
> --- a/kernel/bpf/stackmap.c
> +++ b/kernel/bpf/stackmap.c
> @@ -314,8 +314,10 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
>  	if (max_depth > sysctl_perf_event_max_stack)
>  		max_depth = sysctl_perf_event_max_stack;
>
> +	preempt_disable();
>  	trace = get_perf_callchain(regs, 0, kernel, user, max_depth,
>  				   false, false);
> +	preempt_enable();

This is obviously wrong. As soon as preemption is enabled, trace can be
overwritten.

guard(preempt)();

can fix it, but the length of the preempt-disabled section will be quite
big. The way the get_perf_callchain() API is written, I don't see another
option, though, unless we refactor it similarly to bpf_try_get_buffers().

pw-bot: cr
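For reference, a minimal sketch of the guard(preempt)() variant described
above, applied to the bpf_get_stackid() hunk. This assumes the kernel's
scoped guard(preempt)() helper and is only an illustration; everything
that reads the returned trace has to stay inside the guarded scope:

	/*
	 * Sketch: preemption stays disabled from the moment the per-CPU
	 * callchain entry is filled until its contents are consumed, so a
	 * preempting task cannot refill the same entry underneath us.
	 */
	guard(preempt)();

	trace = get_perf_callchain(regs, 0, kernel, user, max_depth,
				   false, false);
	if (unlikely(!trace))
		/* couldn't fetch the stack trace */
		return -EFAULT;

	/* still preempt-disabled: hash the ips and update the map */
	return __bpf_get_stackid(map, trace, flags);

This also illustrates the drawback noted above: the preempt-off section
now covers the whole map update in __bpf_get_stackid() as well.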
On 2025/9/23 10:53, Alexei Starovoitov wrote:
> On Mon, Sep 22, 2025 at 12:54 AM Tao Chen <chen.dylane@linux.dev> wrote:
>>
>> [...]
>>
>> +	preempt_disable();
>>  	trace = get_perf_callchain(regs, 0, kernel, user, max_depth,
>>  				   false, false);
>> +	preempt_enable();
>
> This is obviously wrong. As soon as preemption is enabled, trace can be
> overwritten.
>
> guard(preempt)();
>
> can fix it, but the length of the preempt-disabled section will be quite
> big. The way the get_perf_callchain() API is written, I don't see another
> option, though, unless we refactor it similarly to bpf_try_get_buffers().
>
> pw-bot: cr

Hi Alexei,

I tried to understand what you meant and looked at the implementation of
get_perf_callchain().

There is only one perf_callchain_entry per CPU right now:

callchain_cpus_entries (RCU-protected global variable)
        ↓
struct callchain_cpus_entries {
        struct perf_callchain_entry *cpu_entries[];
}
        |-> perf_callchain_entry0    cpu0
            perf_callchain_entry1    cpu1
            …
            perf_callchain_entryn    cpun

If we want to implement it like bpf_try_get_buffers(), we should allocate
an array of perf_callchain_entry on every CPU, right?

callchain_cpus_entries (RCU-protected global variable)
        ↓
struct callchain_cpus_entries {
        struct perf_callchain_entry *cpu_entries[];
}
        |-> perf_callchain_entry0[N]    cpu0
            perf_callchain_entry1[N]    cpu1
            …
            perf_callchain_entryn[N]    cpun

-- 
Best Regards
Tao Chen
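For what it's worth, a rough sketch of how a BPF-side pool modeled on the
bpf_try_get_buffers() pattern might look. Every identifier below is made
up for illustration; nothing like this exists in the tree:

	/*
	 * Illustrative only: a small per-CPU pool of callchain entries plus
	 * a nesting counter, mirroring the bpf_try_get_buffers() approach.
	 * A program that fires on the same CPU while another user still
	 * holds entries[0] gets entries[1], and so on.
	 */
	#define BPF_CALLCHAIN_NEST_MAX	3

	struct bpf_callchain_bufs {
		/* each entry is allocated at init with room for the maximum
		 * stack depth, since perf_callchain_entry ends in a flexible
		 * u64 ip[] array
		 */
		struct perf_callchain_entry *entries[BPF_CALLCHAIN_NEST_MAX];
	};

	static DEFINE_PER_CPU(struct bpf_callchain_bufs, bpf_callchain_bufs);
	static DEFINE_PER_CPU(int, bpf_callchain_nest_level);

	static struct perf_callchain_entry *bpf_get_callchain_buf(void)
	{
		int level;

		preempt_disable();
		level = this_cpu_inc_return(bpf_callchain_nest_level);
		if (WARN_ON_ONCE(level > BPF_CALLCHAIN_NEST_MAX)) {
			this_cpu_dec(bpf_callchain_nest_level);
			preempt_enable();
			return NULL;
		}
		return this_cpu_ptr(&bpf_callchain_bufs)->entries[level - 1];
	}

	static void bpf_put_callchain_buf(void)
	{
		this_cpu_dec(bpf_callchain_nest_level);
		preempt_enable();
	}

Whether preemption really has to stay disabled between get and put, or
whether migrate_disable() plus the nesting counter would be enough, is
part of the design question being discussed in this thread.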
On Thu, Sep 25, 2025 at 10:45 AM Tao Chen <chen.dylane@linux.dev> wrote:
>
> On 2025/9/23 10:53, Alexei Starovoitov wrote:
> > [...]
> > The way the get_perf_callchain() API is written, I don't see another
> > option, though, unless we refactor it similarly to bpf_try_get_buffers().
>
> Hi Alexei,
>
> I tried to understand what you meant and looked at the implementation of
> get_perf_callchain().
>
> There is only one perf_callchain_entry per CPU right now:
>
> [...]
>
> If we want to implement it like bpf_try_get_buffers(), we should allocate
> an array of perf_callchain_entry on every CPU, right?
>
> [...]

Either allow a few entries per CPU (bpf_try_get_buffers() allows up to 3
buffers per CPU), or extend get_perf_callchain() to accept a
perf_callchain_entry from outside, and then we can do that in a
BPF-specific way.

> -- 
> Best Regards
> Tao Chen
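The second option might look something like the following prototype. It
is purely hypothetical; today's get_perf_callchain() owns its per-CPU
entry and takes no such argument:

	/*
	 * Hypothetical variant: fill a caller-supplied entry instead of
	 * perf's per-CPU one, so BPF can manage the buffer's lifetime
	 * itself.
	 */
	struct perf_callchain_entry *
	get_perf_callchain_into(struct perf_callchain_entry *entry,
				struct pt_regs *regs, u32 init_nr,
				bool kernel, bool user, u32 max_stack,
				bool crosstask, bool add_mark);

__bpf_get_stack() could then pass an entry obtained from a BPF-managed
per-CPU pool (as in the earlier sketch), keeping the lifetime of the
trace entirely under BPF's control.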