As Alexei noted, the entry returned by get_perf_callchain() may be reused
if a task is preempted after the BPF program has entered the
migrate-disabled section. Drawing on the per-cpu design of
bpf_perf_callchain_entries, a stack-allocated bpf_perf_callchain_entry
is used here.
Signed-off-by: Tao Chen <chen.dylane@linux.dev>
---
kernel/bpf/stackmap.c | 19 +++++++++++--------
1 file changed, 11 insertions(+), 8 deletions(-)
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 94e46b7f340..acd72c021c0 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -31,6 +31,11 @@ struct bpf_stack_map {
struct stack_map_bucket *buckets[] __counted_by(n_buckets);
};
+struct bpf_perf_callchain_entry {
+ u64 nr;
+ u64 ip[PERF_MAX_STACK_DEPTH];
+};
+
static inline bool stack_map_use_build_id(struct bpf_map *map)
{
return (map->map_flags & BPF_F_STACK_BUILD_ID);
@@ -305,6 +310,7 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
bool user = flags & BPF_F_USER_STACK;
struct perf_callchain_entry *trace;
bool kernel = !user;
+ struct bpf_perf_callchain_entry entry = { 0 };
if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
@@ -314,12 +320,8 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
if (max_depth > sysctl_perf_event_max_stack)
max_depth = sysctl_perf_event_max_stack;
- trace = get_perf_callchain(regs, NULL, kernel, user, max_depth,
- false, false);
-
- if (unlikely(!trace))
- /* couldn't fetch the stack trace */
- return -EFAULT;
+ trace = get_perf_callchain(regs, (struct perf_callchain_entry *)&entry,
+ kernel, user, max_depth, false, false);
return __bpf_get_stackid(map, trace, flags);
}
@@ -412,6 +414,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
bool user = flags & BPF_F_USER_STACK;
struct perf_callchain_entry *trace;
+ struct bpf_perf_callchain_entry entry = { 0 };
bool kernel = !user;
int err = -EINVAL;
u64 *ips;
@@ -451,8 +454,8 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
else if (kernel && task)
trace = get_callchain_entry_for_task(task, max_depth);
else
- trace = get_perf_callchain(regs, NULL, kernel, user, max_depth,
- crosstask, false);
+ trace = get_perf_callchain(regs, (struct perf_callchain_entry *)&entry,
+ kernel, user, max_depth, crosstask, false);
if (unlikely(!trace) || trace->nr < skip) {
if (may_fault)
--
2.48.1
On Tue, Oct 14, 2025 at 06:01:28PM +0800, Tao Chen wrote:
> As Alexei noted, get_perf_callchain() return values may be reused
> if a task is preempted after the BPF program enters migrate disable
> mode. Drawing on the per-cpu design of bpf_perf_callchain_entries,
> stack-allocated memory of bpf_perf_callchain_entry is used here.
>
> Signed-off-by: Tao Chen <chen.dylane@linux.dev>
> ---
> kernel/bpf/stackmap.c | 19 +++++++++++--------
> 1 file changed, 11 insertions(+), 8 deletions(-)
>
> diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
> index 94e46b7f340..acd72c021c0 100644
> --- a/kernel/bpf/stackmap.c
> +++ b/kernel/bpf/stackmap.c
> @@ -31,6 +31,11 @@ struct bpf_stack_map {
> struct stack_map_bucket *buckets[] __counted_by(n_buckets);
> };
>
> +struct bpf_perf_callchain_entry {
> + u64 nr;
> + u64 ip[PERF_MAX_STACK_DEPTH];
> +};
> +
> static inline bool stack_map_use_build_id(struct bpf_map *map)
> {
> return (map->map_flags & BPF_F_STACK_BUILD_ID);
> @@ -305,6 +310,7 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
> bool user = flags & BPF_F_USER_STACK;
> struct perf_callchain_entry *trace;
> bool kernel = !user;
> + struct bpf_perf_callchain_entry entry = { 0 };
so IIUC having entries on stack we do not need to do preempt_disable
you had in the previous version, right?
I saw Andrii's justification to have this on the stack, I think it's
fine, but does it have to be initialized? it seems that only used
entries are copied to map
jirka
>
> if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
> BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
> @@ -314,12 +320,8 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
> if (max_depth > sysctl_perf_event_max_stack)
> max_depth = sysctl_perf_event_max_stack;
>
> - trace = get_perf_callchain(regs, NULL, kernel, user, max_depth,
> - false, false);
> -
> - if (unlikely(!trace))
> - /* couldn't fetch the stack trace */
> - return -EFAULT;
> + trace = get_perf_callchain(regs, (struct perf_callchain_entry *)&entry,
> + kernel, user, max_depth, false, false);
>
> return __bpf_get_stackid(map, trace, flags);
> }
> @@ -412,6 +414,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
> u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
> bool user = flags & BPF_F_USER_STACK;
> struct perf_callchain_entry *trace;
> + struct bpf_perf_callchain_entry entry = { 0 };
> bool kernel = !user;
> int err = -EINVAL;
> u64 *ips;
> @@ -451,8 +454,8 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
> else if (kernel && task)
> trace = get_callchain_entry_for_task(task, max_depth);
> else
> - trace = get_perf_callchain(regs, NULL, kernel, user, max_depth,
> - crosstask, false);
> + trace = get_perf_callchain(regs, (struct perf_callchain_entry *)&entry,
> + kernel, user, max_depth, crosstask, false);
>
> if (unlikely(!trace) || trace->nr < skip) {
> if (may_fault)
> --
> 2.48.1
>
On Tue, Oct 14, 2025 at 5:14 AM Jiri Olsa <olsajiri@gmail.com> wrote:
>
> On Tue, Oct 14, 2025 at 06:01:28PM +0800, Tao Chen wrote:
> > As Alexei noted, get_perf_callchain() return values may be reused
> > if a task is preempted after the BPF program enters migrate disable
> > mode. Drawing on the per-cpu design of bpf_perf_callchain_entries,
> > stack-allocated memory of bpf_perf_callchain_entry is used here.
> >
> > Signed-off-by: Tao Chen <chen.dylane@linux.dev>
> > ---
> > kernel/bpf/stackmap.c | 19 +++++++++++--------
> > 1 file changed, 11 insertions(+), 8 deletions(-)
> >
> > diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
> > index 94e46b7f340..acd72c021c0 100644
> > --- a/kernel/bpf/stackmap.c
> > +++ b/kernel/bpf/stackmap.c
> > @@ -31,6 +31,11 @@ struct bpf_stack_map {
> > struct stack_map_bucket *buckets[] __counted_by(n_buckets);
> > };
> >
> > +struct bpf_perf_callchain_entry {
> > + u64 nr;
> > + u64 ip[PERF_MAX_STACK_DEPTH];
> > +};
> > +
> > static inline bool stack_map_use_build_id(struct bpf_map *map)
> > {
> > return (map->map_flags & BPF_F_STACK_BUILD_ID);
> > @@ -305,6 +310,7 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
> > bool user = flags & BPF_F_USER_STACK;
> > struct perf_callchain_entry *trace;
> > bool kernel = !user;
> > + struct bpf_perf_callchain_entry entry = { 0 };
>
> so IIUC having entries on stack we do not need to do preempt_disable
> you had in the previous version, right?
>
> I saw Andrii's justification to have this on the stack, I think it's
> fine, but does it have to be initialized? it seems that only used
> entries are copied to map
No. We're not adding 1k stack consumption.
pw-bot: cr
On Tue, Oct 14, 2025 at 8:02 AM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Tue, Oct 14, 2025 at 5:14 AM Jiri Olsa <olsajiri@gmail.com> wrote:
> >
> > On Tue, Oct 14, 2025 at 06:01:28PM +0800, Tao Chen wrote:
> > > As Alexei noted, get_perf_callchain() return values may be reused
> > > if a task is preempted after the BPF program enters migrate disable
> > > mode. Drawing on the per-cpu design of bpf_perf_callchain_entries,
> > > stack-allocated memory of bpf_perf_callchain_entry is used here.
> > >
> > > Signed-off-by: Tao Chen <chen.dylane@linux.dev>
> > > ---
> > > kernel/bpf/stackmap.c | 19 +++++++++++--------
> > > 1 file changed, 11 insertions(+), 8 deletions(-)
> > >
> > > diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
> > > index 94e46b7f340..acd72c021c0 100644
> > > --- a/kernel/bpf/stackmap.c
> > > +++ b/kernel/bpf/stackmap.c
> > > @@ -31,6 +31,11 @@ struct bpf_stack_map {
> > > struct stack_map_bucket *buckets[] __counted_by(n_buckets);
> > > };
> > >
> > > +struct bpf_perf_callchain_entry {
> > > + u64 nr;
> > > + u64 ip[PERF_MAX_STACK_DEPTH];
> > > +};
> > > +
we shouldn't introduce another type, there is perf_callchain_entry in
linux/perf_event.h, what's the problem with using that?
> > > static inline bool stack_map_use_build_id(struct bpf_map *map)
> > > {
> > > return (map->map_flags & BPF_F_STACK_BUILD_ID);
> > > @@ -305,6 +310,7 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
> > > bool user = flags & BPF_F_USER_STACK;
> > > struct perf_callchain_entry *trace;
> > > bool kernel = !user;
> > > + struct bpf_perf_callchain_entry entry = { 0 };
> >
> > so IIUC having entries on stack we do not need to do preempt_disable
> > you had in the previous version, right?
> >
> > I saw Andrii's justification to have this on the stack, I think it's
> > fine, but does it have to be initialized? it seems that only used
> > entries are copied to map
>
> No. We're not adding 1k stack consumption.
Right, and I thought we concluded as much last time, so it's a bit
surprising to see this in this patch.
Tao, you should go with 3 entries per CPU used in a stack-like
fashion. And then passing that entry into get_perf_callchain() (to
avoid one extra copy).
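Roughly something like this (completely untested sketch, all names made
up; it just mirrors the nesting trick bpf_event_output() already uses,
relying on migration being disabled so the CPU can't change under us):

struct bpf_perf_callchain_entry {
	u64 nr;
	u64 ip[PERF_MAX_STACK_DEPTH];
};

struct bpf_nested_callchain_entries {
	struct bpf_perf_callchain_entry entries[3];
};
static DEFINE_PER_CPU(struct bpf_nested_callchain_entries, bpf_callchain_entries);
static DEFINE_PER_CPU(int, bpf_callchain_nest_level);

static struct bpf_perf_callchain_entry *bpf_get_callchain_entry(void)
{
	int nest_level = this_cpu_inc_return(bpf_callchain_nest_level);

	/* at most 3 levels of nesting on this CPU (task plus preempting progs) */
	if (WARN_ON_ONCE(nest_level > 3)) {
		this_cpu_dec(bpf_callchain_nest_level);
		return NULL;
	}
	return this_cpu_ptr(&bpf_callchain_entries.entries[nest_level - 1]);
}

static void bpf_put_callchain_entry(void)
{
	this_cpu_dec(bpf_callchain_nest_level);
}

and then the chosen entry gets cast to struct perf_callchain_entry * and
passed into get_perf_callchain(), so the unwinder fills it in place.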
>
> pw-bot: cr
On 2025/10/17 04:39, Andrii Nakryiko wrote:
> On Tue, Oct 14, 2025 at 8:02 AM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
>>
>> On Tue, Oct 14, 2025 at 5:14 AM Jiri Olsa <olsajiri@gmail.com> wrote:
>>>
>>> On Tue, Oct 14, 2025 at 06:01:28PM +0800, Tao Chen wrote:
>>>> As Alexei noted, get_perf_callchain() return values may be reused
>>>> if a task is preempted after the BPF program enters migrate disable
>>>> mode. Drawing on the per-cpu design of bpf_perf_callchain_entries,
>>>> stack-allocated memory of bpf_perf_callchain_entry is used here.
>>>>
>>>> Signed-off-by: Tao Chen <chen.dylane@linux.dev>
>>>> ---
>>>> kernel/bpf/stackmap.c | 19 +++++++++++--------
>>>> 1 file changed, 11 insertions(+), 8 deletions(-)
>>>>
>>>> diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
>>>> index 94e46b7f340..acd72c021c0 100644
>>>> --- a/kernel/bpf/stackmap.c
>>>> +++ b/kernel/bpf/stackmap.c
>>>> @@ -31,6 +31,11 @@ struct bpf_stack_map {
>>>> struct stack_map_bucket *buckets[] __counted_by(n_buckets);
>>>> };
>>>>
>>>> +struct bpf_perf_callchain_entry {
>>>> + u64 nr;
>>>> + u64 ip[PERF_MAX_STACK_DEPTH];
>>>> +};
>>>> +
>
> we shouldn't introduce another type, there is perf_callchain_entry in
> linux/perf_event.h, what's the problem with using that?
perf_callchain_entry uses flexible array, DEFINE_PER_CPU seems do not
create buffer for this, for ease of use, the size of the ip array has
been explicitly defined.
struct perf_callchain_entry {
	u64	nr;
	u64	ip[];	/* /proc/sys/kernel/perf_event_max_stack */
};
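perf sizes each entry at runtime instead, roughly like this (paraphrasing
perf_callchain_entry__sizeof() in kernel/events/callchain.c), so there is
no compile-time size DEFINE_PER_CPU could use:

static inline size_t perf_callchain_entry__sizeof(void)
{
	/* room for the sysctl-sized stack plus the context markers */
	return (sizeof(struct perf_callchain_entry) +
		sizeof(__u64) * (sysctl_perf_event_max_stack +
				 sysctl_perf_event_max_contexts_per_stack));
}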
>
>>>> static inline bool stack_map_use_build_id(struct bpf_map *map)
>>>> {
>>>> return (map->map_flags & BPF_F_STACK_BUILD_ID);
>>>> @@ -305,6 +310,7 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
>>>> bool user = flags & BPF_F_USER_STACK;
>>>> struct perf_callchain_entry *trace;
>>>> bool kernel = !user;
>>>> + struct bpf_perf_callchain_entry entry = { 0 };
>>>
>>> so IIUC having entries on stack we do not need to do preempt_disable
>>> you had in the previous version, right?
>>>
>>> I saw Andrii's justification to have this on the stack, I think it's
>>> fine, but does it have to be initialized? it seems that only used
>>> entries are copied to map
>>
>> No. We're not adding 1k stack consumption.
>
> Right, and I thought we concluded as much last time, so it's a bit
> surprising to see this in this patch.
>
Ok, I feel like I'm missing some context from our previous exchange.
> Tao, you should go with 3 entries per CPU used in a stack-like
> fashion. And then passing that entry into get_perf_callchain() (to
> avoid one extra copy).
>
Got it. That is clearer; I will change it in v3.
>>
>> pw-bot: cr
--
Best Regards
Tao Chen
On Sat, Oct 18, 2025 at 12:51 AM Tao Chen <chen.dylane@linux.dev> wrote:
>
> On 2025/10/17 04:39, Andrii Nakryiko wrote:
> > On Tue, Oct 14, 2025 at 8:02 AM Alexei Starovoitov
> > <alexei.starovoitov@gmail.com> wrote:
> >>
> >> On Tue, Oct 14, 2025 at 5:14 AM Jiri Olsa <olsajiri@gmail.com> wrote:
> >>>
> >>> On Tue, Oct 14, 2025 at 06:01:28PM +0800, Tao Chen wrote:
> >>>> As Alexei noted, get_perf_callchain() return values may be reused
> >>>> if a task is preempted after the BPF program enters migrate disable
> >>>> mode. Drawing on the per-cpu design of bpf_perf_callchain_entries,
> >>>> stack-allocated memory of bpf_perf_callchain_entry is used here.
> >>>>
> >>>> Signed-off-by: Tao Chen <chen.dylane@linux.dev>
> >>>> ---
> >>>> kernel/bpf/stackmap.c | 19 +++++++++++--------
> >>>> 1 file changed, 11 insertions(+), 8 deletions(-)
> >>>>
> >>>> diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
> >>>> index 94e46b7f340..acd72c021c0 100644
> >>>> --- a/kernel/bpf/stackmap.c
> >>>> +++ b/kernel/bpf/stackmap.c
> >>>> @@ -31,6 +31,11 @@ struct bpf_stack_map {
> >>>> struct stack_map_bucket *buckets[] __counted_by(n_buckets);
> >>>> };
> >>>>
> >>>> +struct bpf_perf_callchain_entry {
> >>>> + u64 nr;
> >>>> + u64 ip[PERF_MAX_STACK_DEPTH];
> >>>> +};
> >>>> +
> >
> > we shouldn't introduce another type, there is perf_callchain_entry in
> > linux/perf_event.h, what's the problem with using that?
>
> perf_callchain_entry uses flexible array, DEFINE_PER_CPU seems do not
> create buffer for this, for ease of use, the size of the ip array has
> been explicitly defined.
>
> struct perf_callchain_entry {
> u64 nr;
> u64 ip[]; /*
> /proc/sys/kernel/perf_event_max_stack */
> };
>
Ok, fair enough, but instead of casting between perf_callchain_entry
and bpf_perf_callchain_entry, why not put perf_callchain_entry inside
bpf_perf_callchain_entry as a first struct and pass a pointer to it.
That looks a bit more appropriate? Though I'm not sure if compiler
will complain about that flex array...
But on a related note, I looked briefly at how perf gets those
perf_callchain_entries, and it does seem like it also has a small
stack of entries, so maybe we don't really need to invent anything
here. See PERF_NR_CONTEXTS and how that's used.
If instead of disabling preemption we disable migration, then I think
we should be good with relying on perf's callchain management, or am I
missing something?
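Roughly (untested fragment, just to illustrate the idea; it assumes the
BPF prog already runs with migration disabled):

	int rctx;
	struct perf_callchain_entry *entry;

	entry = get_callchain_entry(&rctx);	/* one of the per-CPU PERF_NR_CONTEXTS slots */
	if (unlikely(!entry))
		return -EFAULT;

	/* fill entry via get_perf_callchain()-style helpers, build the stackid/stack */

	put_callchain_entry(rctx);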
> >
> >>>> static inline bool stack_map_use_build_id(struct bpf_map *map)
> >>>> {
> >>>> return (map->map_flags & BPF_F_STACK_BUILD_ID);
> >>>> @@ -305,6 +310,7 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
> >>>> bool user = flags & BPF_F_USER_STACK;
> >>>> struct perf_callchain_entry *trace;
> >>>> bool kernel = !user;
> >>>> + struct bpf_perf_callchain_entry entry = { 0 };
> >>>
> >>> so IIUC having entries on stack we do not need to do preempt_disable
> >>> you had in the previous version, right?
> >>>
> >>> I saw Andrii's justification to have this on the stack, I think it's
> >>> fine, but does it have to be initialized? it seems that only used
> >>> entries are copied to map
> >>
> >> No. We're not adding 1k stack consumption.
> >
> > Right, and I thought we concluded as much last time, so it's a bit
> > surprising to see this in this patch.
> >
>
> Ok, I feel like I'm missing some context from our previous exchange.
>
> > Tao, you should go with 3 entries per CPU used in a stack-like
> > fashion. And then passing that entry into get_perf_callchain() (to
> > avoid one extra copy).
> >
>
> Got it. It is more clearer, will change it in v3.
>
> >>
> >> pw-bot: cr
>
>
> --
> Best Regards
> Tao Chen
On 2025/10/22 00:37, Andrii Nakryiko wrote:
> On Sat, Oct 18, 2025 at 12:51 AM Tao Chen <chen.dylane@linux.dev> wrote:
>>
>> On 2025/10/17 04:39, Andrii Nakryiko wrote:
>>> On Tue, Oct 14, 2025 at 8:02 AM Alexei Starovoitov
>>> <alexei.starovoitov@gmail.com> wrote:
>>>>
>>>> On Tue, Oct 14, 2025 at 5:14 AM Jiri Olsa <olsajiri@gmail.com> wrote:
>>>>>
>>>>> On Tue, Oct 14, 2025 at 06:01:28PM +0800, Tao Chen wrote:
>>>>>> As Alexei noted, get_perf_callchain() return values may be reused
>>>>>> if a task is preempted after the BPF program enters migrate disable
>>>>>> mode. Drawing on the per-cpu design of bpf_perf_callchain_entries,
>>>>>> stack-allocated memory of bpf_perf_callchain_entry is used here.
>>>>>>
>>>>>> Signed-off-by: Tao Chen <chen.dylane@linux.dev>
>>>>>> ---
>>>>>> kernel/bpf/stackmap.c | 19 +++++++++++--------
>>>>>> 1 file changed, 11 insertions(+), 8 deletions(-)
>>>>>>
>>>>>> diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
>>>>>> index 94e46b7f340..acd72c021c0 100644
>>>>>> --- a/kernel/bpf/stackmap.c
>>>>>> +++ b/kernel/bpf/stackmap.c
>>>>>> @@ -31,6 +31,11 @@ struct bpf_stack_map {
>>>>>> struct stack_map_bucket *buckets[] __counted_by(n_buckets);
>>>>>> };
>>>>>>
>>>>>> +struct bpf_perf_callchain_entry {
>>>>>> + u64 nr;
>>>>>> + u64 ip[PERF_MAX_STACK_DEPTH];
>>>>>> +};
>>>>>> +
>>>
>>> we shouldn't introduce another type, there is perf_callchain_entry in
>>> linux/perf_event.h, what's the problem with using that?
>>
>> perf_callchain_entry uses flexible array, DEFINE_PER_CPU seems do not
>> create buffer for this, for ease of use, the size of the ip array has
>> been explicitly defined.
>>
>> struct perf_callchain_entry {
>> u64 nr;
>> u64 ip[]; /*
>> /proc/sys/kernel/perf_event_max_stack */
>> };
>>
>
> Ok, fair enough, but instead of casting between perf_callchain_entry
> and bpf_perf_callchain_entry, why not put perf_callchain_entry inside
> bpf_perf_callchain_entry as a first struct and pass a pointer to it.
> That looks a bit more appropriate? Though I'm not sure if compiler
> will complain about that flex array...
>
> But on related note, I looked briefly at how perf gets those
> perf_callchain_entries, and it does seem like it also has a small
> stack of entries, so maybe we don't really need to invent anything
> here. See PERF_NR_CONTEXTS and how that's used.
>
It seems so. The implementation of get_callchain_entry() and
put_callchain_entry() is essentially simple stack-style management:
struct perf_callchain_entry *get_callchain_entry(int *rctx)
{
	int cpu;
	struct callchain_cpus_entries *entries;

	*rctx = get_recursion_context(this_cpu_ptr(callchain_recursion));
	if (*rctx == -1)
		return NULL;

	entries = rcu_dereference(callchain_cpus_entries);
	if (!entries) {
		put_recursion_context(this_cpu_ptr(callchain_recursion), *rctx);
		return NULL;
	}

	cpu = smp_processor_id();

	return (((void *)entries->cpu_entries[cpu]) +
		(*rctx * perf_callchain_entry__sizeof()));
}

void
put_callchain_entry(int rctx)
{
	put_recursion_context(this_cpu_ptr(callchain_recursion), rctx);
}
> If instead of disabling preemption we disable migration, then I think
> we should be good with relying on perf's callchain management, or am I
> missing something?
>
Peter proposed refactoring the get_perf_callchain() interface in v3. I
think we can still use the perf callchain after that; with the
refactored API, the entry can no longer be overwritten when the task is
preempted.
BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
	   u64, flags)
{
	...
	entry = bpf_get_perf_callchain(&rctx, regs, kernel, user, max_depth);
	ret = __bpf_get_stackid(map, entry, flags);
	bpf_put_perf_callchain(rctx);
	...
}

A rough concrete implementation is as follows:
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index fd1d91017b9..1c7573f085f 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -67,6 +67,7 @@ struct perf_callchain_entry_ctx {
u32 nr;
short contexts;
bool contexts_maxed;
+ bool add_mark;
};
typedef unsigned long (*perf_copy_f)(void *dst, const void *src,
@@ -1718,9 +1719,18 @@ DECLARE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry);
extern void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs);
extern void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs);
+
+extern void __init_perf_callchain_ctx(struct perf_callchain_entry_ctx *ctx,
+ struct perf_callchain_entry *entry,
+ u32 max_stack, bool add_mark);
+
+extern void __get_perf_callchain_kernel(struct perf_callchain_entry_ctx *ctx, struct pt_regs *regs);
+extern void __get_perf_callchain_user(struct perf_callchain_entry_ctx *ctx, struct pt_regs *regs);
+
+
extern struct perf_callchain_entry *
get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
- u32 max_stack, bool crosstask, bool add_mark);
+ u32 max_stack, bool crosstask);
extern int get_callchain_buffers(int max_stack);
extern void put_callchain_buffers(void);
extern struct perf_callchain_entry *get_callchain_entry(int *rctx);
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 4d53cdd1374..3f2543e5244 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -297,14 +297,38 @@ static long __bpf_get_stackid(struct bpf_map *map,
return id;
}
+static struct perf_callchain_entry *
+bpf_get_perf_callchain(int *rctx, struct pt_regs *regs, bool kernel,
+ bool user, int max_stack)
+{
+ struct perf_callchain_entry_ctx ctx;
+ struct perf_callchain_entry *entry;
+
+ entry = get_callchain_entry(rctx);
+ if (unlikely(!entry))
+ return NULL;
+
+ __init_perf_callchain_ctx(&ctx, entry, max_stack, false);
+ if (kernel)
+ __get_perf_callchain_kernel(&ctx, regs);
+ if (user)
+ __get_perf_callchain_user(&ctx, regs);
+ return entry;
+}
+
+static void bpf_put_perf_callchain(int rctx)
+{
+ put_callchain_entry(rctx);
+}
+
BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
u64, flags)
{
u32 max_depth = map->value_size / stack_map_data_size(map);
u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
bool user = flags & BPF_F_USER_STACK;
- struct perf_callchain_entry *trace;
+ struct perf_callchain_entry *entry;
bool kernel = !user;
+ int rctx, ret;
if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
@@ -314,14 +338,14 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
if (max_depth > sysctl_perf_event_max_stack)
max_depth = sysctl_perf_event_max_stack;
- trace = get_perf_callchain(regs, kernel, user, max_depth,
- false, false);
-
- if (unlikely(!trace))
- /* couldn't fetch the stack trace */
+ entry = bpf_get_perf_callchain(&rctx, regs, kernel, user, max_depth);
+ if (unlikely(!entry))
return -EFAULT;
- return __bpf_get_stackid(map, trace, flags);
+ ret = __bpf_get_stackid(map, entry, flags);
+ bpf_put_perf_callchain(rctx);
+
+ return ret;
}
const struct bpf_func_proto bpf_get_stackid_proto = {
@@ -452,7 +476,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
trace = get_callchain_entry_for_task(task, max_depth);
else
trace = get_perf_callchain(regs, kernel, user, max_depth,
- crosstask, false);
+ crosstask);
if (unlikely(!trace) || trace->nr < skip) {
if (may_fault)
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 808c0d7a31f..2c36e490625 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -216,13 +216,54 @@ static void fixup_uretprobe_trampoline_entries(struct perf_callchain_entry *entr
#endif
}
+void __init_perf_callchain_ctx(struct perf_callchain_entry_ctx *ctx,
+ struct perf_callchain_entry *entry,
+ u32 max_stack, bool add_mark)
+
+{
+ ctx->entry = entry;
+ ctx->max_stack = max_stack;
+ ctx->nr = entry->nr = 0;
+ ctx->contexts = 0;
+ ctx->contexts_maxed = false;
+ ctx->add_mark = add_mark;
+}
+
+void __get_perf_callchain_kernel(struct perf_callchain_entry_ctx *ctx, struct pt_regs *regs)
+{
+ if (user_mode(regs))
+ return;
+
+ if (ctx->add_mark)
+ perf_callchain_store_context(ctx, PERF_CONTEXT_KERNEL);
+ perf_callchain_kernel(ctx, regs);
+}
+
+void __get_perf_callchain_user(struct perf_callchain_entry_ctx *ctx, struct pt_regs *regs)
+{
+ int start_entry_idx;
+
+ if (!user_mode(regs)) {
+ if (current->flags & (PF_KTHREAD | PF_USER_WORKER))
+ return;
+ regs = task_pt_regs(current);
+ }
+
+ if (ctx->add_mark)
+ perf_callchain_store_context(ctx, PERF_CONTEXT_USER);
+
+ start_entry_idx = ctx->nr;
+ perf_callchain_user(ctx, regs);
+ fixup_uretprobe_trampoline_entries(ctx->entry, start_entry_idx);
+}
+
struct perf_callchain_entry *
get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
- u32 max_stack, bool crosstask, bool add_mark)
+ u32 max_stack, bool crosstask)
{
struct perf_callchain_entry *entry;
struct perf_callchain_entry_ctx ctx;
- int rctx, start_entry_idx;
+ int rctx;
/* crosstask is not supported for user stacks */
if (crosstask && user && !kernel)
@@ -232,34 +273,14 @@ get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
if (!entry)
return NULL;
- ctx.entry = entry;
- ctx.max_stack = max_stack;
- ctx.nr = entry->nr = 0;
- ctx.contexts = 0;
- ctx.contexts_maxed = false;
+ __init_perf_callchain_ctx(&ctx, entry, max_stack, true);
- if (kernel && !user_mode(regs)) {
- if (add_mark)
- perf_callchain_store_context(&ctx, PERF_CONTEXT_KERNEL);
- perf_callchain_kernel(&ctx, regs);
- }
-
- if (user && !crosstask) {
- if (!user_mode(regs)) {
- if (current->flags & (PF_KTHREAD | PF_USER_WORKER))
- goto exit_put;
- regs = task_pt_regs(current);
- }
+ if (kernel)
+ __get_perf_callchain_kernel(&ctx, regs);
- if (add_mark)
- perf_callchain_store_context(&ctx, PERF_CONTEXT_USER);
-
- start_entry_idx = entry->nr;
- perf_callchain_user(&ctx, regs);
- fixup_uretprobe_trampoline_entries(entry, start_entry_idx);
- }
+ if (user && !crosstask)
+ __get_perf_callchain_user(&ctx, regs);
-exit_put:
put_callchain_entry(rctx);
return entry;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 7541f6f85fc..eb0f110593d 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -8218,7 +8218,7 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
return &__empty_callchain;
callchain = get_perf_callchain(regs, kernel, user,
- max_stack, crosstask, true);
+ max_stack, crosstask);
return callchain ?: &__empty_callchain;
}
>>>
>>>>>> static inline bool stack_map_use_build_id(struct bpf_map *map)
>>>>>> {
>>>>>> return (map->map_flags & BPF_F_STACK_BUILD_ID);
>>>>>> @@ -305,6 +310,7 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
>>>>>> bool user = flags & BPF_F_USER_STACK;
>>>>>> struct perf_callchain_entry *trace;
>>>>>> bool kernel = !user;
>>>>>> + struct bpf_perf_callchain_entry entry = { 0 };
>>>>>
>>>>> so IIUC having entries on stack we do not need to do preempt_disable
>>>>> you had in the previous version, right?
>>>>>
>>>>> I saw Andrii's justification to have this on the stack, I think it's
>>>>> fine, but does it have to be initialized? it seems that only used
>>>>> entries are copied to map
>>>>
>>>> No. We're not adding 1k stack consumption.
>>>
>>> Right, and I thought we concluded as much last time, so it's a bit
>>> surprising to see this in this patch.
>>>
>>
>> Ok, I feel like I'm missing some context from our previous exchange.
>>
>>> Tao, you should go with 3 entries per CPU used in a stack-like
>>> fashion. And then passing that entry into get_perf_callchain() (to
>>> avoid one extra copy).
>>>
>>
>> Got it. It is more clearer, will change it in v3.
>>
>>>>
>>>> pw-bot: cr
>>
>>
>> --
>> Best Regards
>> Tao Chen
--
Best Regards
Tao Chen
On 2025/10/14 20:14, Jiri Olsa wrote:
>> + struct bpf_perf_callchain_entry entry = { 0 };
> so IIUC having entries on stack we do not need to do preempt_disable
> you had in the previous version, right?
>
Yes, I think so; preempt_disable seems unnecessary.
> I saw Andrii's justification to have this on the stack, I think it's
> fine, but does it have to be initialized? it seems that only used
> entries are copied to map
That makes sense. Removing it definitely looks better.
--
Best Regards
Tao Chen