[PATCH bpf-next v8 2/3] perf: Refactor get_perf_callchain

Tao Chen posted 3 patches 1 week, 5 days ago
[PATCH bpf-next v8 2/3] perf: Refactor get_perf_callchain
Posted by Tao Chen 1 week, 5 days ago
In the BPF stack map we want to ensure that the callchain buffer
will not be overwritten by preempting tasks, while also keeping the
preempt-disabled interval short. Based on the suggestions from Peter
and Andrii, export a new API, __get_perf_callchain(); the intended
usage from the BPF side is as follows:

preempt_disable()
entry = get_callchain_entry()
preempt_enable()
__get_perf_callchain(entry)
put_callchain_entry(entry)
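
Expressed as a rough C sketch (purely illustrative: the function name,
flag values and error codes below are made up, only the three API calls
come from this patch):

/* hypothetical BPF-side caller of the split API */
static int example_get_callchain(struct pt_regs *regs, u32 max_stack)
{
	struct perf_callchain_entry *entry;
	int err;

	preempt_disable();
	entry = get_callchain_entry();	/* reserve the per-context buffer */
	preempt_enable();
	if (!entry)
		return -EBUSY;

	/* unwind outside the preempt-disabled region */
	err = __get_perf_callchain(entry, regs, true, true, max_stack,
				   false, false, 0);
	if (err)
		goto out;

	/* ... copy entry->ip[0 .. entry->nr - 1] into the stack map ... */
out:
	put_callchain_entry(entry);	/* release the buffer */
	return err;
}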

Suggested-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Tao Chen <chen.dylane@linux.dev>
---
 include/linux/perf_event.h |  5 +++++
 kernel/events/callchain.c  | 34 ++++++++++++++++++++++------------
 2 files changed, 27 insertions(+), 12 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index f0489843ebc..7132fa97bb1 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1722,6 +1722,11 @@ extern void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct
 extern struct perf_callchain_entry *
 get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
 		   u32 max_stack, bool crosstask, bool add_mark, u64 defer_cookie);
+
+extern int __get_perf_callchain(struct perf_callchain_entry *entry, struct pt_regs *regs,
+				bool kernel, bool user, u32 max_stack, bool crosstask, bool add_mark,
+				u64 defer_cookie);
+
 extern int get_callchain_buffers(int max_stack);
 extern void put_callchain_buffers(void);
 extern struct perf_callchain_entry *get_callchain_entry(void);
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 6cdbc5937b1..f9789d10fa4 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -221,21 +221,15 @@ static void fixup_uretprobe_trampoline_entries(struct perf_callchain_entry *entr
 #endif
 }
 
-struct perf_callchain_entry *
-get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
-		   u32 max_stack, bool crosstask, bool add_mark, u64 defer_cookie)
+int __get_perf_callchain(struct perf_callchain_entry *entry, struct pt_regs *regs, bool kernel,
+			 bool user, u32 max_stack, bool crosstask, bool add_mark, u64 defer_cookie)
 {
-	struct perf_callchain_entry *entry;
 	struct perf_callchain_entry_ctx ctx;
 	int start_entry_idx;
 
 	/* crosstask is not supported for user stacks */
 	if (crosstask && user && !kernel)
-		return NULL;
-
-	entry = get_callchain_entry();
-	if (!entry)
-		return NULL;
+		return -EINVAL;
 
 	ctx.entry		= entry;
 	ctx.max_stack		= max_stack;
@@ -252,7 +246,7 @@ get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
 	if (user && !crosstask) {
 		if (!user_mode(regs)) {
 			if (current->flags & (PF_KTHREAD | PF_USER_WORKER))
-				goto exit_put;
+				return 0;
 			regs = task_pt_regs(current);
 		}
 
@@ -265,7 +259,7 @@ get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
 			 */
 			perf_callchain_store_context(&ctx, PERF_CONTEXT_USER_DEFERRED);
 			perf_callchain_store_context(&ctx, defer_cookie);
-			goto exit_put;
+			return 0;
 		}
 
 		if (add_mark)
@@ -275,9 +269,25 @@ get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
 		perf_callchain_user(&ctx, regs);
 		fixup_uretprobe_trampoline_entries(entry, start_entry_idx);
 	}
+	return 0;
+}
+
+struct perf_callchain_entry *
+get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
+		   u32 max_stack, bool crosstask, bool add_mark, u64 defer_cookie)
+{
+	struct perf_callchain_entry *entry;
+	int ret;
+
+	entry = get_callchain_entry();
+	if (!entry)
+		return NULL;
 
-exit_put:
+	ret = __get_perf_callchain(entry, regs, kernel, user, max_stack, crosstask, add_mark,
+				   defer_cookie);
 	put_callchain_entry(entry);
+	if (ret)
+		entry = NULL;
 
 	return entry;
 }
-- 
2.48.1
Re: [PATCH bpf-next v8 2/3] perf: Refactor get_perf_callchain
Posted by Andrii Nakryiko 3 days, 13 hours ago
On Sun, Jan 25, 2026 at 11:45 PM Tao Chen <chen.dylane@linux.dev> wrote:
>
> From BPF stack map, we want to ensure that the callchain buffer
> will not be overwritten by other preemptive tasks and we also aim
> to reduce the preempt disable interval, Based on the suggestions from Peter
> and Andrrii, export new API __get_perf_callchain and the usage scenarios
> are as follows from BPF side:
>
> preempt_disable()
> entry = get_callchain_entry()
> preempt_enable()
> __get_perf_callchain(entry)
> put_callchain_entry(entry)
>
> Suggested-by: Andrii Nakryiko <andrii@kernel.org>
> Signed-off-by: Tao Chen <chen.dylane@linux.dev>
> ---
>  include/linux/perf_event.h |  5 +++++
>  kernel/events/callchain.c  | 34 ++++++++++++++++++++++------------
>  2 files changed, 27 insertions(+), 12 deletions(-)
>

Looking at the whole __bpf_get_stack() logic again, why didn't we just
do something like this:

diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index da3d328f5c15..80364561611c 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -460,8 +460,8 @@ static long __bpf_get_stack(struct pt_regs *regs,
struct task_struct *task,

        max_depth = stack_map_calculate_max_depth(size, elem_size, flags);

-       if (may_fault)
-               rcu_read_lock(); /* need RCU for perf's callchain below */
+       if (!trace_in)
+               preempt_disable();

        if (trace_in) {
                trace = trace_in;
@@ -474,8 +474,8 @@ static long __bpf_get_stack(struct pt_regs *regs,
struct task_struct *task,
        }

        if (unlikely(!trace) || trace->nr < skip) {
-               if (may_fault)
-                       rcu_read_unlock();
+               if (!trace_in)
+                       preempt_enable();
                goto err_fault;
        }

@@ -494,8 +494,8 @@ static long __bpf_get_stack(struct pt_regs *regs,
struct task_struct *task,
        }

        /* trace/ips should not be dereferenced after this point */
-       if (may_fault)
-               rcu_read_unlock();
+       if (!trace_in)
+               preempt_enable();

        if (user_build_id)
                stack_map_get_build_id_offset(buf, trace_nr, user, may_fault);


?

Build ID parsing happens after we have copied the data from perf's
callchain_entry into user-provided memory, so the raw callchain
retrieval can be done with preemption disabled, as it's supposed to be
brief. The build ID parsing part, which indeed might fault and be much
slower, is done well after that (we even have a comment there saying
that trace/ips should not be touched).

Am I missing something?

[...]
Re: [PATCH bpf-next v8 2/3] perf: Refactor get_perf_callchain
Posted by Tao Chen 2 days, 8 hours ago
On 2026/2/4 09:08, Andrii Nakryiko wrote:
> On Sun, Jan 25, 2026 at 11:45 PM Tao Chen <chen.dylane@linux.dev> wrote:
>>
>>  From BPF stack map, we want to ensure that the callchain buffer
>> will not be overwritten by other preemptive tasks and we also aim
>> to reduce the preempt disable interval, Based on the suggestions from Peter
>> and Andrrii, export new API __get_perf_callchain and the usage scenarios
>> are as follows from BPF side:
>>
>> preempt_disable()
>> entry = get_callchain_entry()
>> preempt_enable()
>> __get_perf_callchain(entry)
>> put_callchain_entry(entry)
>>
>> Suggested-by: Andrii Nakryiko <andrii@kernel.org>
>> Signed-off-by: Tao Chen <chen.dylane@linux.dev>
>> ---
>>   include/linux/perf_event.h |  5 +++++
>>   kernel/events/callchain.c  | 34 ++++++++++++++++++++++------------
>>   2 files changed, 27 insertions(+), 12 deletions(-)
>>
> 
> Looking at the whole __bpf_get_stack() logic again, why didn't we just
> do something like this:
> 
> diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
> index da3d328f5c15..80364561611c 100644
> --- a/kernel/bpf/stackmap.c
> +++ b/kernel/bpf/stackmap.c
> @@ -460,8 +460,8 @@ static long __bpf_get_stack(struct pt_regs *regs,
> struct task_struct *task,
> 
>          max_depth = stack_map_calculate_max_depth(size, elem_size, flags);
> 
> -       if (may_fault)
> -               rcu_read_lock(); /* need RCU for perf's callchain below */
> +       if (!trace_in)
> +               preempt_disable();
> 
>          if (trace_in) {
>                  trace = trace_in;
> @@ -474,8 +474,8 @@ static long __bpf_get_stack(struct pt_regs *regs,
> struct task_struct *task,
>          }
> 
>          if (unlikely(!trace) || trace->nr < skip) {
> -               if (may_fault)
> -                       rcu_read_unlock();
> +               if (!trace_in)
> +                       preempt_enable();
>                  goto err_fault;
>          }
> 
> @@ -494,8 +494,8 @@ static long __bpf_get_stack(struct pt_regs *regs,
> struct task_struct *task,
>          }
> 
>          /* trace/ips should not be dereferenced after this point */
> -       if (may_fault)
> -               rcu_read_unlock();
> +       if (!trace_in)
> +               preempt_enable();
> 
>          if (user_build_id)
>                  stack_map_get_build_id_offset(buf, trace_nr, user, may_fault);
> 
> 
> ?
> 
> Build ID parsing is happening after we copied data from perf's
> callchain_entry into user-provided memory, so raw callchain retrieval
> can be done with preemption disabled, as it's supposed to be brief.
> Build ID parsing part which indeed might fault and be much slower will
> be done well after that (we even have a comment there saying that
> trace/ips should not be touched).
> 
> Am I missing something?

Yes, that looks good for bpf_get_stack(), and I proposed a similar
change previously. But Alexei suggested that if we do this for
bpf_get_stack(), we should also shrink the preemption-disabled section
in bpf_get_stackid(). Maybe we can change bpf_get_stack() first?

https://lore.kernel.org/bpf/20250922075333.1452803-1-chen.dylane@linux.dev/

> 
> [...]


-- 
Best Regards
Tao Chen
Re: [PATCH bpf-next v8 2/3] perf: Refactor get_perf_callchain
Posted by Andrii Nakryiko 1 day, 21 hours ago
On Wed, Feb 4, 2026 at 10:16 PM Tao Chen <chen.dylane@linux.dev> wrote:
>
> On 2026/2/4 09:08, Andrii Nakryiko wrote:
> > On Sun, Jan 25, 2026 at 11:45 PM Tao Chen <chen.dylane@linux.dev> wrote:
> >>
> >>  From BPF stack map, we want to ensure that the callchain buffer
> >> will not be overwritten by other preemptive tasks and we also aim
> >> to reduce the preempt disable interval, Based on the suggestions from Peter
> >> and Andrrii, export new API __get_perf_callchain and the usage scenarios
> >> are as follows from BPF side:
> >>
> >> preempt_disable()
> >> entry = get_callchain_entry()
> >> preempt_enable()
> >> __get_perf_callchain(entry)
> >> put_callchain_entry(entry)
> >>
> >> Suggested-by: Andrii Nakryiko <andrii@kernel.org>
> >> Signed-off-by: Tao Chen <chen.dylane@linux.dev>
> >> ---
> >>   include/linux/perf_event.h |  5 +++++
> >>   kernel/events/callchain.c  | 34 ++++++++++++++++++++++------------
> >>   2 files changed, 27 insertions(+), 12 deletions(-)
> >>
> >
> > Looking at the whole __bpf_get_stack() logic again, why didn't we just
> > do something like this:
> >
> > diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
> > index da3d328f5c15..80364561611c 100644
> > --- a/kernel/bpf/stackmap.c
> > +++ b/kernel/bpf/stackmap.c
> > @@ -460,8 +460,8 @@ static long __bpf_get_stack(struct pt_regs *regs,
> > struct task_struct *task,
> >
> >          max_depth = stack_map_calculate_max_depth(size, elem_size, flags);
> >
> > -       if (may_fault)
> > -               rcu_read_lock(); /* need RCU for perf's callchain below */
> > +       if (!trace_in)
> > +               preempt_disable();
> >
> >          if (trace_in) {
> >                  trace = trace_in;
> > @@ -474,8 +474,8 @@ static long __bpf_get_stack(struct pt_regs *regs,
> > struct task_struct *task,
> >          }
> >
> >          if (unlikely(!trace) || trace->nr < skip) {
> > -               if (may_fault)
> > -                       rcu_read_unlock();
> > +               if (!trace_in)
> > +                       preempt_enable();
> >                  goto err_fault;
> >          }
> >
> > @@ -494,8 +494,8 @@ static long __bpf_get_stack(struct pt_regs *regs,
> > struct task_struct *task,
> >          }
> >
> >          /* trace/ips should not be dereferenced after this point */
> > -       if (may_fault)
> > -               rcu_read_unlock();
> > +       if (!trace_in)
> > +               preempt_enable();
> >
> >          if (user_build_id)
> >                  stack_map_get_build_id_offset(buf, trace_nr, user, may_fault);
> >
> >
> > ?
> >
> > Build ID parsing is happening after we copied data from perf's
> > callchain_entry into user-provided memory, so raw callchain retrieval
> > can be done with preemption disabled, as it's supposed to be brief.
> > Build ID parsing part which indeed might fault and be much slower will
> > be done well after that (we even have a comment there saying that
> > trace/ips should not be touched).
> >
> > Am I missing something?
>
> Yes it looks good for bpf_get_stack, and I also proposed a similar
> change previously. But Alexei suggested that we should reduce the
> preemption-disabled section protected in bpf_get_stackid if we do like
> bpf_get_stack. Maybe we can change it first for bpf_get_stack?
>
> https://lore.kernel.org/bpf/20250922075333.1452803-1-chen.dylane@linux.dev/

This is broken because we are still using trace after you re-enabled
preemption. We need to keep preemption disabled until we copy captured
stack trace data from ips into our own memory.
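
Roughly, the ordering has to be (illustrative sketch, not actual
stackmap.c code):

	preempt_disable();
	trace = get_perf_callchain(regs, kernel, user, max_depth,
				   false, false, 0);
	if (trace && trace->nr > skip) {
		trace_nr = trace->nr - skip;
		/* copy while the per-context entry is still protected */
		memcpy(buf, trace->ip + skip, trace_nr * sizeof(u64));
	}
	preempt_enable();

	/* trace must not be dereferenced past this point; build-id
	 * parsing (which may fault/sleep) only operates on buf */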

>
> >
> > [...]
>
>
> --
> Best Regards
> Tao Chen
Re: [PATCH bpf-next v8 2/3] perf: Refactor get_perf_callchain
Posted by Tao Chen 1 day, 5 hours ago
On 2026/2/6 01:34, Andrii Nakryiko wrote:
> On Wed, Feb 4, 2026 at 10:16 PM Tao Chen <chen.dylane@linux.dev> wrote:
>>
>> On 2026/2/4 09:08, Andrii Nakryiko wrote:
>>> On Sun, Jan 25, 2026 at 11:45 PM Tao Chen <chen.dylane@linux.dev> wrote:
>>>>
>>>>   From BPF stack map, we want to ensure that the callchain buffer
>>>> will not be overwritten by other preemptive tasks and we also aim
>>>> to reduce the preempt disable interval, Based on the suggestions from Peter
>>>> and Andrrii, export new API __get_perf_callchain and the usage scenarios
>>>> are as follows from BPF side:
>>>>
>>>> preempt_disable()
>>>> entry = get_callchain_entry()
>>>> preempt_enable()
>>>> __get_perf_callchain(entry)
>>>> put_callchain_entry(entry)
>>>>
>>>> Suggested-by: Andrii Nakryiko <andrii@kernel.org>
>>>> Signed-off-by: Tao Chen <chen.dylane@linux.dev>
>>>> ---
>>>>    include/linux/perf_event.h |  5 +++++
>>>>    kernel/events/callchain.c  | 34 ++++++++++++++++++++++------------
>>>>    2 files changed, 27 insertions(+), 12 deletions(-)
>>>>
>>>
>>> Looking at the whole __bpf_get_stack() logic again, why didn't we just
>>> do something like this:
>>>
>>> diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
>>> index da3d328f5c15..80364561611c 100644
>>> --- a/kernel/bpf/stackmap.c
>>> +++ b/kernel/bpf/stackmap.c
>>> @@ -460,8 +460,8 @@ static long __bpf_get_stack(struct pt_regs *regs,
>>> struct task_struct *task,
>>>
>>>           max_depth = stack_map_calculate_max_depth(size, elem_size, flags);
>>>
>>> -       if (may_fault)
>>> -               rcu_read_lock(); /* need RCU for perf's callchain below */
>>> +       if (!trace_in)
>>> +               preempt_disable();
>>>
>>>           if (trace_in) {
>>>                   trace = trace_in;
>>> @@ -474,8 +474,8 @@ static long __bpf_get_stack(struct pt_regs *regs,
>>> struct task_struct *task,
>>>           }
>>>
>>>           if (unlikely(!trace) || trace->nr < skip) {
>>> -               if (may_fault)
>>> -                       rcu_read_unlock();
>>> +               if (!trace_in)
>>> +                       preempt_enable();
>>>                   goto err_fault;
>>>           }
>>>
>>> @@ -494,8 +494,8 @@ static long __bpf_get_stack(struct pt_regs *regs,
>>> struct task_struct *task,
>>>           }
>>>
>>>           /* trace/ips should not be dereferenced after this point */
>>> -       if (may_fault)
>>> -               rcu_read_unlock();
>>> +       if (!trace_in)
>>> +               preempt_enable();
>>>
>>>           if (user_build_id)
>>>                   stack_map_get_build_id_offset(buf, trace_nr, user, may_fault);
>>>
>>>
>>> ?
>>>
>>> Build ID parsing is happening after we copied data from perf's
>>> callchain_entry into user-provided memory, so raw callchain retrieval
>>> can be done with preemption disabled, as it's supposed to be brief.
>>> Build ID parsing part which indeed might fault and be much slower will
>>> be done well after that (we even have a comment there saying that
>>> trace/ips should not be touched).
>>>

Hi Andrii,

Building on your work, I have also added preempt_disable() to
bpf_get_stackid() and tried to reduce the length of the
preempt-disabled section. Please review, thanks.

https://lore.kernel.org/bpf/20260206090653.1336687-1-chen.dylane@linux.dev

>>> Am I missing something?
>>
>> Yes it looks good for bpf_get_stack, and I also proposed a similar
>> change previously. But Alexei suggested that we should reduce the
>> preemption-disabled section protected in bpf_get_stackid if we do like
>> bpf_get_stack. Maybe we can change it first for bpf_get_stack?
>>
>> https://lore.kernel.org/bpf/20250922075333.1452803-1-chen.dylane@linux.dev/
> 
> This is broken because we are still using trace after you re-enabled
> preemption. We need to keep preemption disabled until we copy captured
> stack trace data from ips into our own memory.
> 
>>
>>>
>>> [...]
>>
>>
>> --
>> Best Regards
>> Tao Chen
> 


-- 
Best Regards
Tao Chen
Re: [PATCH bpf-next v8 2/3] perf: Refactor get_perf_callchain
Posted by Peter Zijlstra 1 week, 3 days ago
On Mon, Jan 26, 2026 at 03:43:30PM +0800, Tao Chen wrote:
> From BPF stack map, we want to ensure that the callchain buffer
> will not be overwritten by other preemptive tasks and we also aim
> to reduce the preempt disable interval, Based on the suggestions from Peter
> and Andrrii, export new API __get_perf_callchain and the usage scenarios
> are as follows from BPF side:
> 
> preempt_disable()
> entry = get_callchain_entry()
> preempt_enable()
> __get_perf_callchain(entry)
> put_callchain_entry(entry)

That makes no sense, this means any other task on that CPU is getting
screwed over.

Why are you worried about the preempt_disable() here? If this were an
interrupt context we'd still do that unwind -- but then with IRQs
disabled.
Re: [PATCH bpf-next v8 2/3] perf: Refactor get_perf_callchain
Posted by Andrii Nakryiko 1 week, 2 days ago
On Wed, Jan 28, 2026 at 1:10 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Mon, Jan 26, 2026 at 03:43:30PM +0800, Tao Chen wrote:
> > From BPF stack map, we want to ensure that the callchain buffer
> > will not be overwritten by other preemptive tasks and we also aim
> > to reduce the preempt disable interval, Based on the suggestions from Peter
> > and Andrrii, export new API __get_perf_callchain and the usage scenarios
> > are as follows from BPF side:
> >
> > preempt_disable()
> > entry = get_callchain_entry()
> > preempt_enable()
> > __get_perf_callchain(entry)
> > put_callchain_entry(entry)
>
> That makes no sense, this means any other task on that CPU is getting
> screwed over.

Yes, unfortunately, but unless we dynamically allocate a new entry each
time and/or keep a cached per-current entry, there isn't much choice we
have here, no?

Maybe that's what we have to do, honestly, because the
get_perf_callchain() usage we have right now from sleepable BPF isn't
great, with or without the changes in this patch set.

>
> Why are you worried about the preempt_disable() here? If this were an
> interrupt context we'd still do that unwind -- but then with IRQs
> disabled.

Because __bpf_get_stack() from kernel/bpf/stackmap.c can be called
from a sleepable/faultable context, and we may also do rather expensive
build ID resolution (sleepable or not; that only changes whether the
build ID parsing logic waits for file-backed pages to be paged in).
Re: [PATCH bpf-next v8 2/3] perf: Refactor get_perf_callchain
Posted by Peter Zijlstra 1 week, 1 day ago
On Wed, Jan 28, 2026 at 11:12:09AM -0800, Andrii Nakryiko wrote:
> On Wed, Jan 28, 2026 at 1:10 AM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > On Mon, Jan 26, 2026 at 03:43:30PM +0800, Tao Chen wrote:
> > > From BPF stack map, we want to ensure that the callchain buffer
> > > will not be overwritten by other preemptive tasks and we also aim
> > > to reduce the preempt disable interval, Based on the suggestions from Peter
> > > and Andrrii, export new API __get_perf_callchain and the usage scenarios
> > > are as follows from BPF side:
> > >
> > > preempt_disable()
> > > entry = get_callchain_entry()
> > > preempt_enable()
> > > __get_perf_callchain(entry)
> > > put_callchain_entry(entry)
> >
> > That makes no sense, this means any other task on that CPU is getting
> > screwed over.
> 
> Yes, unfortunately, but unless we dynamically allocate new entry each
> time and/or keep per-current entry cached there isn't much choice we
> have here, no?
> 
> Maybe that's what we have to do, honestly, because
> get_perf_callchain() usage we have right now from sleepable BPF isn't
> great no matter with or without changes in this patch set.

All of the perf stuff is based on the fact that if we do it from IRQ/NMI
context, we can certainly do it with preemption disabled.

Bending the interface this way is just horrible.

> > Why are you worried about the preempt_disable() here? If this were an
> > interrupt context we'd still do that unwind -- but then with IRQs
> > disabled.
> 
> Because __bpf_get_stack() from kernel/bpf/stackmap.c can be called
> from sleepable/faultable context and also we can do a rather expensive
> build ID resolution (either in sleepable or not, which only changes if
> build ID parsing logic waits for file backed pages to be paged in or
> not).

<rant>
So stack_map_get_build_id_offset() is a piece of crap -- and I've always
said it was. And I hate that any of that ever got merged -- it's the
pinnacle of bad engineering and simply refusing to do the right thing in
the interest of hack now, fix never :/
</rant>

Anyway, as you well know, we now *do* have lockless vma lookups and
should be able to do this buildid thing much saner.

Also, there appears to be no buildid caching whatsoever, surely that
would help some.

(and I'm not sure I've ever understood why the buildid crap needs to be
in this path in any case)
Re: [PATCH bpf-next v8 2/3] perf: Refactor get_perf_callchain
Posted by Andrii Nakryiko 1 week ago
On Fri, Jan 30, 2026 at 3:31 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Wed, Jan 28, 2026 at 11:12:09AM -0800, Andrii Nakryiko wrote:
> > On Wed, Jan 28, 2026 at 1:10 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > >
> > > On Mon, Jan 26, 2026 at 03:43:30PM +0800, Tao Chen wrote:
> > > > From BPF stack map, we want to ensure that the callchain buffer
> > > > will not be overwritten by other preemptive tasks and we also aim
> > > > to reduce the preempt disable interval, Based on the suggestions from Peter
> > > > and Andrrii, export new API __get_perf_callchain and the usage scenarios
> > > > are as follows from BPF side:
> > > >
> > > > preempt_disable()
> > > > entry = get_callchain_entry()
> > > > preempt_enable()
> > > > __get_perf_callchain(entry)
> > > > put_callchain_entry(entry)
> > >
> > > That makes no sense, this means any other task on that CPU is getting
> > > screwed over.
> >
> > Yes, unfortunately, but unless we dynamically allocate new entry each
> > time and/or keep per-current entry cached there isn't much choice we
> > have here, no?
> >
> > Maybe that's what we have to do, honestly, because
> > get_perf_callchain() usage we have right now from sleepable BPF isn't
> > great no matter with or without changes in this patch set.
>
> All of the perf stuff is based on the fact that if we do it from IRQ/NMI
> context, we can certainly do it with preemption disabled.
>
> Bending the interface this way is just horrible.

Sure, but I'm just trying to help mitigate the issue at hand (way too
long preemption disabled region). I agree that we should do something
better in terms of perf_callchain_entry retrieval and reuse, but maybe
one thing at a time?

>
> > > Why are you worried about the preempt_disable() here? If this were an
> > > interrupt context we'd still do that unwind -- but then with IRQs
> > > disabled.
> >
> > Because __bpf_get_stack() from kernel/bpf/stackmap.c can be called
> > from sleepable/faultable context and also we can do a rather expensive
> > build ID resolution (either in sleepable or not, which only changes if
> > build ID parsing logic waits for file backed pages to be paged in or
> > not).
>
> <rant>
> So stack_map_get_build_id_offset() is a piece of crap -- and I've always
> said it was. And I hate that any of that ever got merged -- its the
> pinnacle of bad engineering and simply refusing to do the right thing in
> the interest of hack now, fix never :/
> </rant>

<hug><nod></hug>

>
> Anyway, as you well know, we now *do* have lockless vma lookups and
> should be able to do this buildid thing much saner.

Yes, probably, and I am aware that the mmap_lock use inside
stack_map_get_build_id_offset() is problematic; we'll need to fix that
as well. One step at a time.

>
> Also, there appears to be no buildid caching what so ever, surely that
> would help some.

Jiri Olsa proposed caching build IDs per file or per inode some time
back, but there was vehement opposition to it. And a locked, globally
resizable hash that might need to be used from NMI sounds horrible,
tbh. So we have what we have today.

>
> (and I'm not sure I've ever understood why the buildid crap needs to be
> in this path in any case)

Yeah, perhaps. I haven't dealt with stack_map_get_build_id_offset()
much, so I'll need to dig a bit deeper and analyze it. As I said, we
have an mmap_lock problem there, so we need to address that. I'll
think about if/how we can improve this.

Do I understand correctly that you'd rather just not touch all this
for now and we should just drop this patch set?
Re: [PATCH bpf-next v8 2/3] perf: Refactor get_perf_callchain
Posted by Peter Zijlstra 4 days, 18 hours ago
On Fri, Jan 30, 2026 at 12:04:45PM -0800, Andrii Nakryiko wrote:

> > Also, there appears to be no buildid caching what so ever, surely that
> > would help some.
> 
> Jiri Olsa proposed caching build id per file or per inode some time
> back, there was vehement opposition to it. And doing some locked
> global resizable hash that might need to be used from NMI sounds
> horrible, tbh. So we have what we have today.

See kernel/module/tree_lookup.c and include/linux/rbtree_latch.h :-)
Re: [PATCH bpf-next v8 2/3] perf: Refactor get_perf_callchain
Posted by Andrii Nakryiko 3 days, 14 hours ago
On Mon, Feb 2, 2026 at 11:59 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Fri, Jan 30, 2026 at 12:04:45PM -0800, Andrii Nakryiko wrote:
>
> > > Also, there appears to be no buildid caching what so ever, surely that
> > > would help some.
> >
> > Jiri Olsa proposed caching build id per file or per inode some time
> > back, there was vehement opposition to it. And doing some locked
> > global resizable hash that might need to be used from NMI sounds
> > horrible, tbh. So we have what we have today.
>
> See kernel/module/tree_lookup.c and include/linux/rbtree_latch.h :-)

Ah, neat idea. Each node is really two parallel nodes, one for each of
the latched copies. TIL, good to know and thanks!
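
For reference, a bare-bones sketch of what an inode-keyed build-id
cache on top of rbtree_latch.h could look like (all names below are
hypothetical, not an actual proposal; writers serialize on a lock,
lookups are lockless and usable from NMI):

#include <linux/buildid.h>
#include <linux/rbtree_latch.h>
#include <linux/spinlock.h>

struct build_id_ent {
	struct latch_tree_node	lnode;
	unsigned long		ino;
	u32			id_sz;
	unsigned char		id[BUILD_ID_SIZE_MAX];
};

static unsigned long ent_ino(struct latch_tree_node *n)
{
	return container_of(n, struct build_id_ent, lnode)->ino;
}

static bool ent_less(struct latch_tree_node *a, struct latch_tree_node *b)
{
	return ent_ino(a) < ent_ino(b);
}

static int ent_comp(void *key, struct latch_tree_node *n)
{
	unsigned long ino = (unsigned long)key;

	return ino < ent_ino(n) ? -1 : ino > ent_ino(n) ? 1 : 0;
}

static const struct latch_tree_ops build_id_tree_ops = {
	.less	= ent_less,
	.comp	= ent_comp,
};

static struct latch_tree_root build_id_tree;
static DEFINE_SPINLOCK(build_id_tree_lock);	/* writers only */

/* lockless lookup, usable from NMI */
static struct build_id_ent *build_id_cache_find(unsigned long ino)
{
	struct latch_tree_node *n;

	n = latch_tree_find((void *)ino, &build_id_tree, &build_id_tree_ops);
	return n ? container_of(n, struct build_id_ent, lnode) : NULL;
}

static void build_id_cache_add(struct build_id_ent *ent)
{
	spin_lock(&build_id_tree_lock);
	latch_tree_insert(&ent->lnode, &build_id_tree, &build_id_tree_ops);
	spin_unlock(&build_id_tree_lock);
}

/* removal would pair latch_tree_erase() with an RCU grace period before
 * freeing the entry, as kernel/module/tree_lookup.c does */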
Re: [PATCH bpf-next v8 2/3] perf: Refactor get_perf_callchain
Posted by Tao Chen 1 week, 2 days ago
On 2026/1/28 17:10, Peter Zijlstra wrote:
> On Mon, Jan 26, 2026 at 03:43:30PM +0800, Tao Chen wrote:
>>  From BPF stack map, we want to ensure that the callchain buffer
>> will not be overwritten by other preemptive tasks and we also aim
>> to reduce the preempt disable interval, Based on the suggestions from Peter
>> and Andrrii, export new API __get_perf_callchain and the usage scenarios
>> are as follows from BPF side:
>>
>> preempt_disable()
>> entry = get_callchain_entry()
>> preempt_enable()
>> __get_perf_callchain(entry)
>> put_callchain_entry(entry)
> 
> That makes no sense, this means any other task on that CPU is getting
> screwed over.
> 
> Why are you worried about the preempt_disable() here? If this were an
> interrupt context we'd still do that unwind -- but then with IRQs
> disabled.

Hi Peter,

Right now, obtaining stack information in BPF involves two steps:
1. get the callchain
2. store the callchain in a BPF map or copy it to a buffer

There is no preempt disable in BPF today. While obtaining the stack
information of task A, task A may be preempted by task B, which runs
the same logic and acquires its own stack information. When execution
resumes in task A, the callchain buffer now holds the stack of task B,
because each context (task, softirq, irq, NMI) has only one callchain
entry.

       taskA                             taskB
1.callchain(A) = get_perf_callchain
		<-- preempted by B   callchain(B) = get_perf_callchain
2.stack_map(callchain(B))

So we want to ensure that while task A is using the entry, the
preempting task B cannot use it. The approach is to defer
put_callchain_entry() until the stack has been captured and saved in
the stack map.

       taskA                             taskB
1.callchain(A) = __get_perf_callchain
		<-- preempted by B   callchain(B) = __get_perf_callchain
2.stack_map(callchain(A))
3.put_callchain_entry()

Task B cannot get the callchain entry because task A holds it.

The preempt_disable() around get_callchain_entry() was suggested by
Yonghong in v4:
https://lore.kernel.org/bpf/c352f357-1417-47b5-9d8c-28d99f20f5a6@linux.dev/

Please correct me if I'm mistaken. Thanks.

-- 
Best Regards
Tao Chen
Re: [PATCH bpf-next v8 2/3] perf: Refactor get_perf_callchain
Posted by Andrii Nakryiko 1 week, 3 days ago
On Sun, Jan 25, 2026 at 11:45 PM Tao Chen <chen.dylane@linux.dev> wrote:
>
> From BPF stack map, we want to ensure that the callchain buffer
> will not be overwritten by other preemptive tasks and we also aim
> to reduce the preempt disable interval, Based on the suggestions from Peter
> and Andrrii, export new API __get_perf_callchain and the usage scenarios
> are as follows from BPF side:
>
> preempt_disable()
> entry = get_callchain_entry()
> preempt_enable()
> __get_perf_callchain(entry)
> put_callchain_entry(entry)
>
> Suggested-by: Andrii Nakryiko <andrii@kernel.org>
> Signed-off-by: Tao Chen <chen.dylane@linux.dev>
> ---
>  include/linux/perf_event.h |  5 +++++
>  kernel/events/callchain.c  | 34 ++++++++++++++++++++++------------
>  2 files changed, 27 insertions(+), 12 deletions(-)

[...]

> +struct perf_callchain_entry *
> +get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
> +                  u32 max_stack, bool crosstask, bool add_mark, u64 defer_cookie)
> +{
> +       struct perf_callchain_entry *entry;
> +       int ret;
> +
> +       entry = get_callchain_entry();
> +       if (!entry)
> +               return NULL;
>
> -exit_put:
> +       ret = __get_perf_callchain(entry, regs, kernel, user, max_stack, crosstask, add_mark,
> +                                  defer_cookie);
>         put_callchain_entry(entry);
> +       if (ret)
> +               entry = NULL;
>

Purely stylistic nit, so this can be ignored if you disagree, but I
find code that modifies a variable right before returning it slightly
less preferable than the more explicit:


if (__get_perf_callchain(...)) {
    put_callchain_entry(entry);
    return NULL;
}

return entry;

>         return entry;
>  }

> --
> 2.48.1
>
Re: [PATCH bpf-next v8 2/3] perf: Refactor get_perf_callchain
Posted by Tao Chen 1 week, 3 days ago
On 2026/1/28 05:07, Andrii Nakryiko wrote:
> On Sun, Jan 25, 2026 at 11:45 PM Tao Chen <chen.dylane@linux.dev> wrote:
>>
>>  From BPF stack map, we want to ensure that the callchain buffer
>> will not be overwritten by other preemptive tasks and we also aim
>> to reduce the preempt disable interval, Based on the suggestions from Peter
>> and Andrrii, export new API __get_perf_callchain and the usage scenarios
>> are as follows from BPF side:
>>
>> preempt_disable()
>> entry = get_callchain_entry()
>> preempt_enable()
>> __get_perf_callchain(entry)
>> put_callchain_entry(entry)
>>
>> Suggested-by: Andrii Nakryiko <andrii@kernel.org>
>> Signed-off-by: Tao Chen <chen.dylane@linux.dev>
>> ---
>>   include/linux/perf_event.h |  5 +++++
>>   kernel/events/callchain.c  | 34 ++++++++++++++++++++++------------
>>   2 files changed, 27 insertions(+), 12 deletions(-)
> 
> [...]
> 
>> +struct perf_callchain_entry *
>> +get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
>> +                  u32 max_stack, bool crosstask, bool add_mark, u64 defer_cookie)
>> +{
>> +       struct perf_callchain_entry *entry;
>> +       int ret;
>> +
>> +       entry = get_callchain_entry();
>> +       if (!entry)
>> +               return NULL;
>>
>> -exit_put:
>> +       ret = __get_perf_callchain(entry, regs, kernel, user, max_stack, crosstask, add_mark,
>> +                                  defer_cookie);
>>          put_callchain_entry(entry);
>> +       if (ret)
>> +               entry = NULL;
>>
> 
> purely stylistical nit, so this can be ignored if you disagree, but I
> find code that modifies some variable before returning it slightly
> less preferable to more explicit:
> 
> 
> if (__get_perf_callchain(...)) {
>      put_callchain_entry(entry);
>      return NULL;
> }
> 
> return entry;
> 
>>          return entry;
>>   }
> 

Agree, will change it in v9, thanks.

>> --
>> 2.48.1
>>


-- 
Best Regards
Tao Chen