eBPF programs can be run 20,000,000+ times per second on busy servers.
Whenever /proc/sys/kernel/bpf_stats_enabled is turned off,
hundreds of call sites are patched from text_poke_bp_batch()
and we see a critical loss of performance due to false sharing
on bp_desc.refs lasting up to three seconds.
51.30% server_bin [kernel.kallsyms] [k] poke_int3_handler
|
|--46.45%--poke_int3_handler
| exc_int3
| asm_exc_int3
| |
| |--24.26%--cls_bpf_classify
| | tcf_classify
| | __dev_queue_xmit
| | ip6_finish_output2
| | ip6_output
| | ip6_xmit
| | inet6_csk_xmit
| | __tcp_transmit_skb
| | |
| | |--9.00%--tcp_v6_do_rcv
| | | tcp_v6_rcv
| | | ip6_protocol_deliver_rcu
| | | ip6_rcv_finish
| | | ipv6_rcv
| | | __netif_receive_skb
| | | process_backlog
| | | __napi_poll
| | | net_rx_action
| | | __softirqentry_text_start
| | | asm_call_sysvec_on_stack
| | | do_softirq_own_stack
Fix this by replacing bp_desc.refs with a per-cpu bp_refs.
Before the patch, on a host with 240 cpus (480 threads):
echo 0 >/proc/sys/kernel/bpf_stats_enabled
text_poke_bp_batch(nr_entries=2)
text_poke_bp_batch+1
text_poke_finish+27
arch_jump_label_transform_apply+22
jump_label_update+98
__static_key_slow_dec_cpuslocked+64
static_key_slow_dec+31
bpf_stats_handler+236
proc_sys_call_handler+396
vfs_write+761
ksys_write+102
do_syscall_64+107
entry_SYSCALL_64_after_hwframe+103
Took 324 usec
text_poke_bp_batch(nr_entries=164)
text_poke_bp_batch+1
text_poke_finish+27
arch_jump_label_transform_apply+22
jump_label_update+98
__static_key_slow_dec_cpuslocked+64
static_key_slow_dec+31
bpf_stats_handler+236
proc_sys_call_handler+396
vfs_write+761
ksys_write+102
do_syscall_64+107
entry_SYSCALL_64_after_hwframe+103
Took 2655300 usec
After this patch:
echo 0 >/proc/sys/kernel/bpf_stats_enabled
text_poke_bp_batch(nr_entries=2)
text_poke_bp_batch+1
text_poke_finish+27
arch_jump_label_transform_apply+22
jump_label_update+98
__static_key_slow_dec_cpuslocked+64
static_key_slow_dec+31
bpf_stats_handler+236
proc_sys_call_handler+396
vfs_write+761
ksys_write+102
do_syscall_64+107
entry_SYSCALL_64_after_hwframe+103
Took 519 usec
text_poke_bp_batch(nr_entries=164)
text_poke_bp_batch+1
text_poke_finish+27
arch_jump_label_transform_apply+22
jump_label_update+98
__static_key_slow_dec_cpuslocked+64
static_key_slow_dec+31
bpf_stats_handler+236
proc_sys_call_handler+396
vfs_write+761
ksys_write+102
do_syscall_64+107
entry_SYSCALL_64_after_hwframe+103
Took 702 usec
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
arch/x86/kernel/alternative.c | 30 ++++++++++++++++++------------
1 file changed, 18 insertions(+), 12 deletions(-)
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index c71b575bf229..d7afbf822c45 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -2137,28 +2137,29 @@ struct text_poke_loc {
struct bp_patching_desc {
struct text_poke_loc *vec;
int nr_entries;
- atomic_t refs;
};
+static DEFINE_PER_CPU(atomic_t, bp_refs);
+
static struct bp_patching_desc bp_desc;
static __always_inline
struct bp_patching_desc *try_get_desc(void)
{
- struct bp_patching_desc *desc = &bp_desc;
+ atomic_t *refs = this_cpu_ptr(&bp_refs);
- if (!raw_atomic_inc_not_zero(&desc->refs))
+ if (!raw_atomic_inc_not_zero(refs))
return NULL;
- return desc;
+ return &bp_desc;
}
static __always_inline void put_desc(void)
{
- struct bp_patching_desc *desc = &bp_desc;
+ atomic_t *refs = this_cpu_ptr(&bp_refs);
smp_mb__before_atomic();
- raw_atomic_dec(&desc->refs);
+ raw_atomic_dec(refs);
}
static __always_inline void *text_poke_addr(struct text_poke_loc *tp)
@@ -2191,9 +2192,9 @@ noinstr int poke_int3_handler(struct pt_regs *regs)
* Having observed our INT3 instruction, we now must observe
* bp_desc with non-zero refcount:
*
- * bp_desc.refs = 1 INT3
- * WMB RMB
- * write INT3 if (bp_desc.refs != 0)
+ * bp_refs = 1 INT3
+ * WMB RMB
+ * write INT3 if (bp_refs != 0)
*/
smp_rmb();
@@ -2299,7 +2300,8 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
* Corresponds to the implicit memory barrier in try_get_desc() to
* ensure reading a non-zero refcount provides up to date bp_desc data.
*/
- atomic_set_release(&bp_desc.refs, 1);
+ for_each_possible_cpu(i)
+ atomic_set_release(per_cpu_ptr(&bp_refs, i), 1);
/*
* Function tracing can enable thousands of places that need to be
@@ -2413,8 +2415,12 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
/*
* Remove and wait for refs to be zero.
*/
- if (!atomic_dec_and_test(&bp_desc.refs))
- atomic_cond_read_acquire(&bp_desc.refs, !VAL);
+ for_each_possible_cpu(i) {
+ atomic_t *refs = per_cpu_ptr(&bp_refs, i);
+
+ if (!atomic_dec_and_test(refs))
+ atomic_cond_read_acquire(refs, !VAL);
+ }
}
static void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
--
2.49.0.395.g12beb8f557-goog
* Eric Dumazet <edumazet@google.com> wrote:
> eBPF programs can be run 20,000,000+ times per second on busy servers.
>
> Whenever /proc/sys/kernel/bpf_stats_enabled is turned off,
> hundreds of calls sites are patched from text_poke_bp_batch()
> and we see a critical loss of performance due to false sharing
> on bp_desc.refs lasting up to three seconds.
> @@ -2413,8 +2415,12 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
> /*
> * Remove and wait for refs to be zero.
> */
> - if (!atomic_dec_and_test(&bp_desc.refs))
> - atomic_cond_read_acquire(&bp_desc.refs, !VAL);
> + for_each_possible_cpu(i) {
> + atomic_t *refs = per_cpu_ptr(&bp_refs, i);
> +
> + if (!atomic_dec_and_test(refs))
> + atomic_cond_read_acquire(refs, !VAL);
> + }
So your patch changes text_poke_bp_batch() to busy-spin-wait for
bp_refs to go to zero on all 480 CPUs.
Your measurement is using /proc/sys/kernel/bpf_stats_enabled on a
single CPU, right?
What's the adversarial workload here? Spamming bpf_stats_enabled on all
CPUs in parallel? Or mixing it with some other text_poke_bp_batch()
user if bpf_stats_enabled serializes access?
Does anything undesirable happen in that case?
Thanks,
Ingo
On Sun, Mar 23, 2025 at 10:38 PM Ingo Molnar <mingo@kernel.org> wrote:
>
>
> * Eric Dumazet <edumazet@google.com> wrote:
>
> > eBPF programs can be run 20,000,000+ times per second on busy servers.
> >
> > Whenever /proc/sys/kernel/bpf_stats_enabled is turned off,
> > hundreds of calls sites are patched from text_poke_bp_batch()
> > and we see a critical loss of performance due to false sharing
> > on bp_desc.refs lasting up to three seconds.
>
> > @@ -2413,8 +2415,12 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
> > /*
> > * Remove and wait for refs to be zero.
> > */
> > - if (!atomic_dec_and_test(&bp_desc.refs))
> > - atomic_cond_read_acquire(&bp_desc.refs, !VAL);
> > + for_each_possible_cpu(i) {
> > + atomic_t *refs = per_cpu_ptr(&bp_refs, i);
> > +
> > + if (!atomic_dec_and_test(refs))
> > + atomic_cond_read_acquire(refs, !VAL);
> > + }
>
> So your patch changes text_poke_bp_batch() to busy-spin-wait for
> bp_refs to go to zero on all 480 CPUs.
>
> Your measurement is using /proc/sys/kernel/bpf_stats_enabled on a
> single CPU, right?
Yes, some daemon enables bpf_stats for a small amount of time (one
second) to gather stats on eBPF run time costs (bpftool prog | grep run_time).
One eBPF selftest can also do this:
tools/testing/selftests/bpf/prog_tests/enable_stats.c
>
> What's the adversarial workload here? Spamming bpf_stats_enabled on all
> CPUs in parallel? Or mixing it with some other text_poke_bp_batch()
> user if bpf_stats_enabled serializes access?
The workload is having ~480 cpus running various eBPF programs all
over the place.
In the perf output I added to the changelog, we see an eBPF program
hooked at the xmit of each packet.
But the fd = bpf_enable_stats(BPF_STATS_RUN_TIME) / .... / close(fd)
sequence only happens from time to time, because of the presumed extra
cost of fetching two extra timestamps. (See the sketch below.)
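For reference, a minimal sketch of what such a sampling daemon does,
using libbpf's bpf_enable_stats(). Error handling and the actual stats
collection via bpftool are omitted; this is an illustration, not our
production code, and the file name below is just an example.

/* Illustrative sketch of the stats-sampling daemon pattern;
 * not the actual daemon. Build: gcc -O2 stats_sample.c -lbpf */
#include <bpf/bpf.h>		/* bpf_enable_stats() */
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* Holding the returned fd keeps run-time accounting enabled
	 * (the static key behind bpf_stats_enabled stays on). */
	int fd = bpf_enable_stats(BPF_STATS_RUN_TIME);

	if (fd < 0) {
		perror("bpf_enable_stats");
		return 1;
	}

	sleep(1);	/* sample window: run_time_ns/run_cnt accumulate */

	/* Closing the fd disables stats again (assuming no other
	 * holder), which is what triggers the text_poke_bp_batch()
	 * patching discussed here. */
	close(fd);
	return 0;
}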
BTW, before the patch, stats on my test host look like:
105: sched_cls name hn_egress tag 699fc5eea64144e3 gpl run_time_ns 3009063719 run_cnt 82757845
-> average cost is 36 nsec per call
And after the patch:
105: sched_cls name hn_egress tag 699fc5eea64144e3 gpl run_time_ns 1928223019 run_cnt 67682728
-> average cost is 28 nsec per call
>
> Does anything undesirable happen in that case?
The case of multiple threads trying to flip bpf_stats_enabled is
handled by bpf_stats_enabled_mutex.
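For completeness, the serialization pattern looks roughly like this
(a simplified sketch, not the exact kernel/sysctl.c source; the handler
signature has changed a bit across kernel versions):

/* Simplified sketch of the bpf_stats_enabled sysctl handler;
 * not verbatim kernel source. */
static DEFINE_MUTEX(bpf_stats_enabled_mutex);

static int bpf_stats_handler(struct ctl_table *table, int write,
			     void *buffer, size_t *lenp, loff_t *ppos)
{
	struct static_key *key = (struct static_key *)table->data;
	static int saved_val;
	int val, ret;
	struct ctl_table tmp = {
		.data	= &val,
		.maxlen	= sizeof(val),
		.mode	= table->mode,
	};

	/* All writers funnel through one mutex, so at most one
	 * static-key flip / text_poke_bp_batch() runs at a time. */
	mutex_lock(&bpf_stats_enabled_mutex);
	val = saved_val;
	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
	if (write && !ret && val != saved_val) {
		if (val)
			static_key_slow_inc(key);	/* patch call sites in */
		else
			static_key_slow_dec(key);	/* patch call sites out */
		saved_val = val;
	}
	mutex_unlock(&bpf_stats_enabled_mutex);
	return ret;
}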
Thanks !
>
> Thanks,
>
> Ingo
* Eric Dumazet <edumazet@google.com> wrote:
> > What's the adversarial workload here? Spamming bpf_stats_enabled on all
> > CPUs in parallel? Or mixing it with some other text_poke_bp_batch()
> > user if bpf_stats_enabled serializes access?
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> > Does anything undesirable happen in that case?
>
> The case of multiple threads trying to flip bpf_stats_enabled is
> handled by bpf_stats_enabled_mutex.
So my suggested workload wasn't adversarial enough due to
bpf_stats_enabled_mutex: how about some other workload that doesn't
serialize access to text_poke_bp_batch()?
Thanks,
Ingo
On Mon, Mar 24, 2025 at 8:16 AM Ingo Molnar <mingo@kernel.org> wrote:
>
>
> * Eric Dumazet <edumazet@google.com> wrote:
>
> > > What's the adversarial workload here? Spamming bpf_stats_enabled on all
> > > CPUs in parallel? Or mixing it with some other text_poke_bp_batch()
> > > user if bpf_stats_enabled serializes access?
> ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
>
> > > Does anything undesirable happen in that case?
> >
> > The case of multiple threads trying to flip bpf_stats_enabled is
> > handled by bpf_stats_enabled_mutex.
>
> So my suggested workload wasn't adversarial enough due to
> bpf_stats_enabled_mutex: how about some other workload that doesn't
> serialize access to text_poke_bp_batch()?
Do you have a specific case in mind that I can test on these big platforms ?
text_poke_bp_batch() calls themselves are serialized by text_mutex, it
is not clear what you are looking for.
Thanks.
* Eric Dumazet <edumazet@google.com> wrote:
> Do you have a specific case in mind that I can test on these big
> platforms ?
No. I was thinking of large-scale kprobes or ftrace patching - but you
are right that the text_mutex should naturally serialize all the
write-side code here.
Mind adding your second round of test results to the changelog as well,
which improved per call overhead from 36 to 28 nsecs?
Thanks,
	Ingo
On Mon, Mar 24, 2025 at 9:02 AM Ingo Molnar <mingo@kernel.org> wrote:
>
>
> * Eric Dumazet <edumazet@google.com> wrote:
>
> > Do you have a specific case in mind that I can test on these big
> > platforms ?
>
> No. I was thinking of large-scale kprobes or ftrace patching - but you
> are right that the text_mutex should naturally serialize all the
> write-side code here.
>
> Mind adding your second round of test results to the changelog as well,
> which improved per call overhead from 36 to 28 nsecs?
Sure thing, thanks !
Note the 36 to 28 nsec was on a test host, not really under production stress.
As all of our production runs with the old code, I cannot really tell
what the effective change will be once new kernels are rolled out.
When updating a single, shared atomic_t from 480 cpus (worst-case
scenario), we need more than 40000 cycles per operation.
perf stat atomic_bench -T480
The atomic counter is 21904528, total_cycles=2095231571464, 95652 avg cycles per update
[05]                                                      7866 in [32,64[ cycles (53 avg)
[06]                                                      2196 in [64,128[ cycles (81 avg)
[07]                                                      2942 in [128,256[ cycles (202 avg)
[08]                                                      1865 in [256,512[ cycles (383 avg)
[09]                                                      4251 in [512,1024[ cycles (780 avg)
[10]                                                     72248 in [1024,2048[ cycles (1722 avg)
[11] ***                                                438110 in [2048,4096[ cycles (3217 avg)
[12] ***********                                       1703927 in [4096,8192[ cycles (6199 avg)
[13] **************************                        3869889 in [8192,16384[ cycles (12320 avg)
[14] ***************************                       4040952 in [16384,32768[ cycles (25185 avg)
[15] **************************************************7261596 in [32768,65536[ cycles (46884 avg)
[16] ******************                                2688791 in [65536,131072[ cycles (83552 avg)
[17] *                                                  253104 in [131072,262144[ cycles (189642 avg)
[18] **                                                 326075 in [262144,524288[ cycles (349319 avg)
[19] ******                                             901293 in [524288,1048576[ cycles (890724 avg)
[20] **                                                 321711 in [1048576,2097152[ cycles (1205250 avg)
[21]                                                      6616 in [2097152,4194304[ cycles (2436096 avg)
Performance counter stats for './atomic_bench -T480':
        964,194.88 msec task-clock          #   467.120 CPUs utilized
            13,795      context-switches    #    14.307 M/sec
               480      cpu-migrations      #     0.498 M/sec
             1,605      page-faults         #     1.665 M/sec
 3,182,241,468,867      cycles              # 3300416.170 GHz
    11,077,646,267      instructions        #      0.00 insn per cycle
     1,711,894,269      branches            # 1775466.627 M/sec
         3,747,877      branch-misses       #      0.22% of all branches
2.064128692 seconds time elapsed
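atomic_bench is an internal micro-benchmark; a rough, hypothetical
equivalent in plain C11 + pthreads is sketched below. It only
illustrates the shared-counter contention being measured, not the exact
tool. Run it as ./atomic_bench_sketch 480 to mimic -T480.

/* Hypothetical stand-in for the atomic_bench tool mentioned above:
 * N threads hammer one shared atomic counter and measure, via rdtsc,
 * how expensive each contended update becomes.
 * Build: gcc -O2 -pthread atomic_bench_sketch.c */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>
#include <x86intrin.h>

#define UPDATES_PER_THREAD 100000

static atomic_long shared_counter;	/* the contended cache line */

struct result { unsigned long long cycles; };

static void *worker(void *arg)
{
	struct result *r = arg;

	for (int i = 0; i < UPDATES_PER_THREAD; i++) {
		unsigned long long t0 = __rdtsc();

		atomic_fetch_add(&shared_counter, 1);	/* contended RMW */
		r->cycles += __rdtsc() - t0;
	}
	return NULL;
}

int main(int argc, char **argv)
{
	int nthreads = argc > 1 ? atoi(argv[1]) : 480;
	unsigned long long total = 0;
	pthread_t *tid;
	struct result *res;

	if (nthreads <= 0)
		nthreads = 480;
	tid = calloc(nthreads, sizeof(*tid));
	res = calloc(nthreads, sizeof(*res));

	for (int i = 0; i < nthreads; i++)
		pthread_create(&tid[i], NULL, worker, &res[i]);
	for (int i = 0; i < nthreads; i++) {
		pthread_join(tid[i], NULL);
		total += res[i].cycles;
	}

	printf("counter=%ld, avg %llu cycles per update\n",
	       atomic_load(&shared_counter),
	       total / ((unsigned long long)nthreads * UPDATES_PER_THREAD));
	free(res);
	free(tid);
	return 0;
}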
I said the atomic_cond_read_acquire(refs, !VAL) was not called in my
tests, but it is a valid situation, so we should not add a WARN_ON_ONCE().
I will simply add the unlikely().
Thanks.
On Mon, Mar 24, 2025 at 8:47 AM Eric Dumazet <edumazet@google.com> wrote:
>
> On Mon, Mar 24, 2025 at 8:16 AM Ingo Molnar <mingo@kernel.org> wrote:
> >
> >
> > * Eric Dumazet <edumazet@google.com> wrote:
> >
> > > > What's the adversarial workload here? Spamming bpf_stats_enabled on all
> > > > CPUs in parallel? Or mixing it with some other text_poke_bp_batch()
> > > > user if bpf_stats_enabled serializes access?
> > ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> >
> > > > Does anything undesirable happen in that case?
> > >
> > > The case of multiple threads trying to flip bpf_stats_enabled is
> > > handled by bpf_stats_enabled_mutex.
> >
> > So my suggested workload wasn't adversarial enough due to
> > bpf_stats_enabled_mutex: how about some other workload that doesn't
> > serialize access to text_poke_bp_batch()?
>
> Do you have a specific case in mind that I can test on these big platforms ?
>
> text_poke_bp_batch() calls themselves are serialized by text_mutex, it
> is not clear what you are looking for.
BTW the atomic_cond_read_acquire() part is never called even during my
stress test.
We could add this eventually:
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index d7afbf822c45..5d364e990055 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -2418,7 +2418,7 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
for_each_possible_cpu(i) {
atomic_t *refs = per_cpu_ptr(&bp_refs, i);
- if (!atomic_dec_and_test(refs))
+ if (unlikely(!atomic_dec_and_test(refs)))
atomic_cond_read_acquire(refs, !VAL);
}
}
On Mon, Mar 24, 2025 at 08:53:31AM +0100, Eric Dumazet wrote:
> BTW the atomic_cond_read_acquire() part is never called even during my
> stress test.
Yes, IIRC this is due to text_poke_sync() serializing the state, as that
does a synchronous IPI broadcast, which by necessity requires all
previous INT3 handlers to complete.
You can only hit that case if the INT3 remains after step-3 (IOW you're
actively writing INT3 into the text). This is exceedingly rare.
* Peter Zijlstra <peterz@infradead.org> wrote:
> On Mon, Mar 24, 2025 at 08:53:31AM +0100, Eric Dumazet wrote:
>
> > BTW the atomic_cond_read_acquire() part is never called even during my
> > stress test.
>
> Yes, IIRC this is due to text_poke_sync() serializing the state, as that
> does a synchronous IPI broadcast, which by necessity requires all
> previous INT3 handlers to complete.
>
> You can only hit that case if the INT3 remains after step-3 (IOW you're
> actively writing INT3 into the text). This is exceedingly rare.
Might make sense to add a comment for that.
Also, any strong objections against doing this in the namespace:
  s/bp_/int3_
?
Half of the code already calls it a variant of 'int3', half of it 'bp',
which I had to think for a couple of seconds goes for breakpoint, not
base pointer ... ;-)
Might as well standardize on int3_ and call it a day?
Thanks,
	Ingo
On Tue, Mar 25, 2025 at 09:41:10AM +0100, Ingo Molnar wrote:
>
> * Peter Zijlstra <peterz@infradead.org> wrote:
>
> > On Mon, Mar 24, 2025 at 08:53:31AM +0100, Eric Dumazet wrote:
> >
> > > BTW the atomic_cond_read_acquire() part is never called even during my
> > > stress test.
> >
> > Yes, IIRC this is due to text_poke_sync() serializing the state, as that
> > does a synchronous IPI broadcast, which by necessity requires all
> > previous INT3 handlers to complete.
> >
> > You can only hit that case if the INT3 remains after step-3 (IOW you're
> > actively writing INT3 into the text). This is exceedingly rare.
>
> Might make sense to add a comment for that.
Sure, find below.
> Also, any strong objections against doing this in the namespace:
>
>   s/bp_/int3_
>
> ?
>
> Half of the code already calls it a variant of 'int3', half of it 'bp',
> which I had to think for a couple of seconds goes for breakpoint, not
> base pointer ... ;-)
It actually is breakpoint, as in INT3 raises #BP. For complete confusion
the things that are commonly known as debug breakpoints, those things in
DR7, they raise #DB or debug exceptions.
> Might as well standardize on int3_ and call it a day?
Yeah, perhaps. At some point you've got to know that INT3->#BP and
DR7->#DB and it all sorta makes sense, but *shrug* :-)
---
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index bf82c6f7d690..01e94603e767 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -2749,6 +2749,13 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
/*
* Remove and wait for refs to be zero.
+ *
+ * Notably, if after step-3 above the INT3 got removed, then the
+ * text_poke_sync() will have serialized against any running INT3
+ * handlers and the below spin-wait will not happen.
+ *
+ * IOW. unless the replacement instruction is INT3, this case goes
+ * unused.
*/
if (!atomic_dec_and_test(&bp_desc.refs))
atomic_cond_read_acquire(&bp_desc.refs, !VAL);
* Peter Zijlstra <peterz@infradead.org> wrote:
> On Tue, Mar 25, 2025 at 09:41:10AM +0100, Ingo Molnar wrote:
> >
> > * Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > > On Mon, Mar 24, 2025 at 08:53:31AM +0100, Eric Dumazet wrote:
> > >
> > > > BTW the atomic_cond_read_acquire() part is never called even during my
> > > > stress test.
> > >
> > > Yes, IIRC this is due to text_poke_sync() serializing the state, as that
> > > does a synchronous IPI broadcast, which by necessity requires all
> > > previous INT3 handlers to complete.
> > >
> > > You can only hit that case if the INT3 remains after step-3 (IOW you're
> > > actively writing INT3 into the text). This is exceedingly rare.
> >
> > Might make sense to add a comment for that.
>
> Sure, find below.
>
> > Also, any strong objections against doing this in the namespace:
> >
> >   s/bp_/int3_
> >
> > ?
> >
> > Half of the code already calls it a variant of 'int3', half of it 'bp',
> > which I had to think for a couple of seconds goes for breakpoint, not
> > base pointer ... ;-)
>
> It actually is breakpoint, as in INT3 raises #BP. For complete confusion
> the things that are commonly known as debug breakpoints, those things in
> DR7, they raise #DB or debug exceptions.
Yeah, it's a software breakpoint, swbp, that raises the #BP trap.
'bp' is confusingly aliased (in my brain at least) with 'base pointer'
register naming and assembler syntax: as in bp, ebp, rbp.
So I'd prefer if it was named consistently:
  text_poke_int3_batch()
  text_poke_int3_handler()
  ...
Not the current mishmash of:
  text_poke_bp_batch()
  poke_int3_handler()
  ...
Does this make more sense?
> > Might as well standardize on int3_ and call it a day?
>
> Yeah, perhaps. At some point you've got to know that INT3->#BP and
> DR7->#DB and it all sorta makes sense, but *shrug* :-)
Yeah, so I do know what #BP is, but what the heck disambiguates the two
meanings of _bp and why do we have the above jungle of an inconsistent
namespace? :-)
Picking _int3 would neatly solve all of that.
> diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
> index bf82c6f7d690..01e94603e767 100644
> --- a/arch/x86/kernel/alternative.c
> +++ b/arch/x86/kernel/alternative.c
> @@ -2749,6 +2749,13 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
>
> /*
> * Remove and wait for refs to be zero.
> + *
> + * Notably, if after step-3 above the INT3 got removed, then the
> + * text_poke_sync() will have serialized against any running INT3
> + * handlers and the below spin-wait will not happen.
> + *
> + * IOW. unless the replacement instruction is INT3, this case goes
> + * unused.
> */
> if (!atomic_dec_and_test(&bp_desc.refs))
> atomic_cond_read_acquire(&bp_desc.refs, !VAL);
Thanks! I stuck this into tip:x86/alternatives, with your SOB.
	Ingo
On Tue, Mar 25, 2025 at 12:26:36PM +0100, Ingo Molnar wrote:
> Yeah, so I do know what #BP is, but what the heck disambiguates the two
> meanings of _bp and why do we have the above jungle of an inconsistent
> namespace? :-)
>
> Picking _int3 would neatly solve all of that.
Sure; the most obvious case where BP would make sense, the trap entry
point, we already use int3 so yeah, make it int3 throughout.
* Peter Zijlstra <peterz@infradead.org> wrote:
> On Tue, Mar 25, 2025 at 12:26:36PM +0100, Ingo Molnar wrote:
>
> > Yeah, so I do know what #BP is, but what the heck disambiguates the two
> > meanings of _bp and why do we have the above jungle of an inconsistent
> > namespace? :-)
> >
> > Picking _int3 would neatly solve all of that.
>
> Sure; the most obvious case where BP would make sense, the trap entry
> point, we already use int3 so yeah, make it int3 throughout.
Okay - I just sent out a series to do this. Found a few other things to
fix/improve along the way:
  https://lore.kernel.org/r/20250327205355.378659-1-mingo@kernel.org
Thanks,
	Ingo
The following commit has been merged into the x86/alternatives branch of tip:
Commit-ID: 451283cd40bceccc02397d0e5b2b6eda6047b1ed
Gitweb: https://git.kernel.org/tip/451283cd40bceccc02397d0e5b2b6eda6047b1ed
Author: Peter Zijlstra <peterz@infradead.org>
AuthorDate: Tue, 25 Mar 2025 11:30:47 +01:00
Committer: Ingo Molnar <mingo@kernel.org>
CommitterDate: Tue, 25 Mar 2025 12:28:35 +01:00
x86/alternatives: Document the text_poke_bp_batch() synchronization rules a bit more
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Link: https://lore.kernel.org/r/20250325103047.GH36322@noisy.programming.kicks-ass.net
---
arch/x86/kernel/alternative.c | 7 +++++++
1 file changed, 7 insertions(+)
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 85089c7..5f44814 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -2751,6 +2751,13 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
/*
* Remove and wait for refs to be zero.
+ *
+ * Notably, if after step-3 above the INT3 got removed, then the
+ * text_poke_sync() will have serialized against any running INT3
+ * handlers and the below spin-wait will not happen.
+ *
+ * IOW. unless the replacement instruction is INT3, this case goes
+ * unused.
*/
for_each_possible_cpu(i) {
atomic_t *refs = per_cpu_ptr(&bp_refs, i);
* Eric Dumazet <edumazet@google.com> wrote:
> On Mon, Mar 24, 2025 at 8:47 AM Eric Dumazet <edumazet@google.com> wrote:
> >
> > On Mon, Mar 24, 2025 at 8:16 AM Ingo Molnar <mingo@kernel.org> wrote:
> > >
> > >
> > > * Eric Dumazet <edumazet@google.com> wrote:
> > >
> > > > > What's the adversarial workload here? Spamming bpf_stats_enabled on all
> > > > > CPUs in parallel? Or mixing it with some other text_poke_bp_batch()
> > > > > user if bpf_stats_enabled serializes access?
> > > ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> > >
> > > > > Does anything undesirable happen in that case?
> > > >
> > > > The case of multiple threads trying to flip bpf_stats_enabled is
> > > > handled by bpf_stats_enabled_mutex.
> > >
> > > So my suggested workload wasn't adversarial enough due to
> > > bpf_stats_enabled_mutex: how about some other workload that doesn't
> > > serialize access to text_poke_bp_batch()?
> >
> > Do you have a specific case in mind that I can test on these big platforms ?
> >
> > text_poke_bp_batch() calls themselves are serialized by text_mutex, it
> > is not clear what you are looking for.
>
>
> BTW the atomic_cond_read_acquire() part is never called even during my
> stress test.
Yeah, that code threw me off - can it really happen with text_mutex
serializing all of it?
> @@ -2418,7 +2418,7 @@ static void text_poke_bp_batch(struct
> text_poke_loc *tp, unsigned int nr_entries
> for_each_possible_cpu(i) {
> atomic_t *refs = per_cpu_ptr(&bp_refs, i);
>
> - if (!atomic_dec_and_test(refs))
> + if (unlikely(!atomic_dec_and_test(refs)))
> atomic_cond_read_acquire(refs, !VAL);
If it could never happen, then shouldn't that condition be a
WARN_ON_ONCE() perhaps?
Thanks,
Ingo