[PATCH] x86/alternatives: remove false sharing in poke_int3_handler()
Posted by Eric Dumazet 10 months, 3 weeks ago
eBPF programs can be run 20,000,000+ times per second on busy servers.

Whenever /proc/sys/kernel/bpf_stats_enabled is turned off,
hundreds of call sites are patched from text_poke_bp_batch()
and we see a critical loss of performance due to false sharing
on bp_desc.refs lasting up to three seconds.

   51.30%  server_bin       [kernel.kallsyms]           [k] poke_int3_handler
            |
            |--46.45%--poke_int3_handler
            |          exc_int3
            |          asm_exc_int3
            |          |
            |          |--24.26%--cls_bpf_classify
            |          |          tcf_classify
            |          |          __dev_queue_xmit
            |          |          ip6_finish_output2
            |          |          ip6_output
            |          |          ip6_xmit
            |          |          inet6_csk_xmit
            |          |          __tcp_transmit_skb
            |          |          |
            |          |          |--9.00%--tcp_v6_do_rcv
            |          |          |          tcp_v6_rcv
            |          |          |          ip6_protocol_deliver_rcu
            |          |          |          ip6_rcv_finish
            |          |          |          ipv6_rcv
            |          |          |          __netif_receive_skb
            |          |          |          process_backlog
            |          |          |          __napi_poll
            |          |          |          net_rx_action
            |          |          |          __softirqentry_text_start
            |          |          |          asm_call_sysvec_on_stack
            |          |          |          do_softirq_own_stack

Fix this by replacing bp_desc.refs with a per-cpu bp_refs.
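
The idea, sketched as a stand-alone userspace analog (illustrative only, the
actual kernel change is in the diff below): readers only ever touch their own
cache-line-aligned slot, so the hot INT3 path no longer bounces a shared line,
while the writer arms every slot before patching and drains them afterwards.

	#include <stdatomic.h>
	#include <stdbool.h>

	#define NCPU 480

	/* One counter per CPU, each forced onto its own cache line. */
	static struct { _Alignas(64) atomic_int refs; } refs_analog[NCPU];

	/* Reader (INT3 handler) side: only this CPU's slot is touched. */
	static bool try_get(int cpu)
	{
		atomic_int *refs = &refs_analog[cpu].refs;
		int old = atomic_load(refs);

		/* inc_not_zero(): take a ref only while patching is active */
		while (old && !atomic_compare_exchange_weak(refs, &old, old + 1))
			;
		return old != 0;
	}

	static void put(int cpu)
	{
		atomic_fetch_sub(&refs_analog[cpu].refs, 1);
	}

	/* Writer side: arm every slot before patching, drain them after. */
	static void arm_all(void)
	{
		for (int i = 0; i < NCPU; i++)
			atomic_store(&refs_analog[i].refs, 1);
	}

	static void drain_all(void)
	{
		for (int i = 0; i < NCPU; i++)
			if (atomic_fetch_sub(&refs_analog[i].refs, 1) != 1)
				while (atomic_load(&refs_analog[i].refs))
					; /* rare: readers still in flight */
	}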

Before the patch, on a host with 240 cpus (480 threads):

echo 0 >/proc/sys/kernel/bpf_stats_enabled
text_poke_bp_batch(nr_entries=2)
        text_poke_bp_batch+1
        text_poke_finish+27
        arch_jump_label_transform_apply+22
        jump_label_update+98
        __static_key_slow_dec_cpuslocked+64
        static_key_slow_dec+31
        bpf_stats_handler+236
        proc_sys_call_handler+396
        vfs_write+761
        ksys_write+102
        do_syscall_64+107
        entry_SYSCALL_64_after_hwframe+103
Took 324 usec

text_poke_bp_batch(nr_entries=164)
        text_poke_bp_batch+1
        text_poke_finish+27
        arch_jump_label_transform_apply+22
        jump_label_update+98
        __static_key_slow_dec_cpuslocked+64
        static_key_slow_dec+31
        bpf_stats_handler+236
        proc_sys_call_handler+396
        vfs_write+761
        ksys_write+102
        do_syscall_64+107
        entry_SYSCALL_64_after_hwframe+103
Took 2655300 usec

After this patch:

echo 0 >/proc/sys/kernel/bpf_stats_enabled
text_poke_bp_batch(nr_entries=2)
        text_poke_bp_batch+1
        text_poke_finish+27
        arch_jump_label_transform_apply+22
        jump_label_update+98
        __static_key_slow_dec_cpuslocked+64
        static_key_slow_dec+31
        bpf_stats_handler+236
        proc_sys_call_handler+396
        vfs_write+761
        ksys_write+102
        do_syscall_64+107
        entry_SYSCALL_64_after_hwframe+103
Took 519 usec

text_poke_bp_batch(nr_entries=164)
        text_poke_bp_batch+1
        text_poke_finish+27
        arch_jump_label_transform_apply+22
        jump_label_update+98
        __static_key_slow_dec_cpuslocked+64
        static_key_slow_dec+31
        bpf_stats_handler+236
        proc_sys_call_handler+396
        vfs_write+761
        ksys_write+102
        do_syscall_64+107
        entry_SYSCALL_64_after_hwframe+103
Took 702 usec

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 arch/x86/kernel/alternative.c | 30 ++++++++++++++++++------------
 1 file changed, 18 insertions(+), 12 deletions(-)

diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index c71b575bf229..d7afbf822c45 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -2137,28 +2137,29 @@ struct text_poke_loc {
 struct bp_patching_desc {
 	struct text_poke_loc *vec;
 	int nr_entries;
-	atomic_t refs;
 };
 
+static DEFINE_PER_CPU(atomic_t, bp_refs);
+
 static struct bp_patching_desc bp_desc;
 
 static __always_inline
 struct bp_patching_desc *try_get_desc(void)
 {
-	struct bp_patching_desc *desc = &bp_desc;
+	atomic_t *refs = this_cpu_ptr(&bp_refs);
 
-	if (!raw_atomic_inc_not_zero(&desc->refs))
+	if (!raw_atomic_inc_not_zero(refs))
 		return NULL;
 
-	return desc;
+	return &bp_desc;
 }
 
 static __always_inline void put_desc(void)
 {
-	struct bp_patching_desc *desc = &bp_desc;
+	atomic_t *refs = this_cpu_ptr(&bp_refs);
 
 	smp_mb__before_atomic();
-	raw_atomic_dec(&desc->refs);
+	raw_atomic_dec(refs);
 }
 
 static __always_inline void *text_poke_addr(struct text_poke_loc *tp)
@@ -2191,9 +2192,9 @@ noinstr int poke_int3_handler(struct pt_regs *regs)
 	 * Having observed our INT3 instruction, we now must observe
 	 * bp_desc with non-zero refcount:
 	 *
-	 *	bp_desc.refs = 1		INT3
-	 *	WMB				RMB
-	 *	write INT3			if (bp_desc.refs != 0)
+	 *	bp_refs = 1		INT3
+	 *	WMB			RMB
+	 *	write INT3		if (bp_refs != 0)
 	 */
 	smp_rmb();
 
@@ -2299,7 +2300,8 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
 	 * Corresponds to the implicit memory barrier in try_get_desc() to
 	 * ensure reading a non-zero refcount provides up to date bp_desc data.
 	 */
-	atomic_set_release(&bp_desc.refs, 1);
+	for_each_possible_cpu(i)
+		atomic_set_release(per_cpu_ptr(&bp_refs, i), 1);
 
 	/*
 	 * Function tracing can enable thousands of places that need to be
@@ -2413,8 +2415,12 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
 	/*
 	 * Remove and wait for refs to be zero.
 	 */
-	if (!atomic_dec_and_test(&bp_desc.refs))
-		atomic_cond_read_acquire(&bp_desc.refs, !VAL);
+	for_each_possible_cpu(i) {
+		atomic_t *refs = per_cpu_ptr(&bp_refs, i);
+
+		if (!atomic_dec_and_test(refs))
+			atomic_cond_read_acquire(refs, !VAL);
+	}
 }
 
 static void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
-- 
2.49.0.395.g12beb8f557-goog
Re: [PATCH] x86/alternatives: remove false sharing in poke_int3_handler()
Posted by Ingo Molnar 10 months, 3 weeks ago
* Eric Dumazet <edumazet@google.com> wrote:

> eBPF programs can be run 20,000,000+ times per second on busy servers.
> 
> Whenever /proc/sys/kernel/bpf_stats_enabled is turned off,
> hundreds of calls sites are patched from text_poke_bp_batch()
> and we see a critical loss of performance due to false sharing
> on bp_desc.refs lasting up to three seconds.

> @@ -2413,8 +2415,12 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
>  	/*
>  	 * Remove and wait for refs to be zero.
>  	 */
> -	if (!atomic_dec_and_test(&bp_desc.refs))
> -		atomic_cond_read_acquire(&bp_desc.refs, !VAL);
> +	for_each_possible_cpu(i) {
> +		atomic_t *refs = per_cpu_ptr(&bp_refs, i);
> +
> +		if (!atomic_dec_and_test(refs))
> +			atomic_cond_read_acquire(refs, !VAL);
> +	}

So your patch changes text_poke_bp_batch() to busy-spin-wait for 
bp_refs to go to zero on all 480 CPUs.

Your measurement is using /proc/sys/kernel/bpf_stats_enabled on a 
single CPU, right?

What's the adversarial workload here? Spamming bpf_stats_enabled on all 
CPUs in parallel? Or mixing it with some other text_poke_bp_batch() 
user if bpf_stats_enabled serializes access?

Does anything undesirable happen in that case?

Thanks,

	Ingo
Re: [PATCH] x86/alternatives: remove false sharing in poke_int3_handler()
Posted by Eric Dumazet 10 months, 3 weeks ago
On Sun, Mar 23, 2025 at 10:38 PM Ingo Molnar <mingo@kernel.org> wrote:
>
>
> * Eric Dumazet <edumazet@google.com> wrote:
>
> > eBPF programs can be run 20,000,000+ times per second on busy servers.
> >
> > Whenever /proc/sys/kernel/bpf_stats_enabled is turned off,
> > hundreds of calls sites are patched from text_poke_bp_batch()
> > and we see a critical loss of performance due to false sharing
> > on bp_desc.refs lasting up to three seconds.
>
> > @@ -2413,8 +2415,12 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
> >       /*
> >        * Remove and wait for refs to be zero.
> >        */
> > -     if (!atomic_dec_and_test(&bp_desc.refs))
> > -             atomic_cond_read_acquire(&bp_desc.refs, !VAL);
> > +     for_each_possible_cpu(i) {
> > +             atomic_t *refs = per_cpu_ptr(&bp_refs, i);
> > +
> > +             if (!atomic_dec_and_test(refs))
> > +                     atomic_cond_read_acquire(refs, !VAL);
> > +     }
>
> So your patch changes text_poke_bp_batch() to busy-spin-wait for
> bp_refs to go to zero on all 480 CPUs.
>
> Your measurement is using /proc/sys/kernel/bpf_stats_enabled on a
> single CPU, right?

Yes, some daemon enables bpf_stats for a small amount of time (one second)
to gather stats on eBPF run time costs.  (bpftool prog | grep run_time)

One eBPF selftest can also do this.

tools/testing/selftests/bpf/prog_tests/enable_stats.c


>
> What's the adversarial workload here? Spamming bpf_stats_enabled on all
> CPUs in parallel? Or mixing it with some other text_poke_bp_batch()
> user if bpf_stats_enabled serializes access?

The workload has ~480 cpus running various eBPF programs all over the place.

In the perf profile I added to the changelog, we see an eBPF program
hooked at the xmit of each packet.

But the fd = bpf_enable_stats(BPF_STATS_RUN_TIME) / .... / close(fd) sequence
only happens from time to time, because of the supposed extra cost of
fetching two extra timestamps.
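
For context, the toggle looks roughly like this from userspace, assuming
libbpf's bpf_enable_stats() helper (which wraps the BPF_ENABLE_STATS syscall
command); this is a minimal sketch, the real daemon logic is omitted:

	/* Enable BPF run-time stats for one second, then drop them.
	 * bpf_enable_stats() returns an fd; stats stay enabled until the
	 * last such user goes away, which is when the static key gets
	 * patched back off (same mechanism the enable_stats.c selftest
	 * exercises).
	 */
	#include <bpf/bpf.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = bpf_enable_stats(BPF_STATS_RUN_TIME);

		if (fd < 0) {
			perror("bpf_enable_stats");
			return 1;
		}
		sleep(1);	/* read run_time_ns / run_cnt via bpftool here */
		close(fd);	/* releasing the last user disables stats */
		return 0;
	}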

BTW, before the patch, stats on my test host look like:

105: sched_cls  name hn_egress  tag 699fc5eea64144e3  gpl run_time_ns 3009063719 run_cnt 82757845
-> average cost is 36 nsec per call

And after the patch:

105: sched_cls  name hn_egress  tag 699fc5eea64144e3  gpl run_time_ns 1928223019 run_cnt 67682728
-> average cost is 28 nsec per call

>
> Does anything undesirable happen in that case?

The case of multiple threads trying to flip bpf_stats_enabled is
handled by bpf_stats_enabled_mutex.

Thanks !

>
> Thanks,
>
>         Ingo
Re: [PATCH] x86/alternatives: remove false sharing in poke_int3_handler()
Posted by Ingo Molnar 10 months, 3 weeks ago
* Eric Dumazet <edumazet@google.com> wrote:

> > What's the adversarial workload here? Spamming bpf_stats_enabled on all
> > CPUs in parallel? Or mixing it with some other text_poke_bp_batch()
> > user if bpf_stats_enabled serializes access?
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

> > Does anything undesirable happen in that case?
> 
> The case of multiple threads trying to flip bpf_stats_enabled is
> handled by bpf_stats_enabled_mutex.

So my suggested workload wasn't adversarial enough due to 
bpf_stats_enabled_mutex: how about some other workload that doesn't 
serialize access to text_poke_bp_batch()?

Thanks,

	Ingo
Re: [PATCH] x86/alternatives: remove false sharing in poke_int3_handler()
Posted by Eric Dumazet 10 months, 3 weeks ago
On Mon, Mar 24, 2025 at 8:16 AM Ingo Molnar <mingo@kernel.org> wrote:
>
>
> * Eric Dumazet <edumazet@google.com> wrote:
>
> > > What's the adversarial workload here? Spamming bpf_stats_enabled on all
> > > CPUs in parallel? Or mixing it with some other text_poke_bp_batch()
> > > user if bpf_stats_enabled serializes access?
>             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
>
> > > Does anything undesirable happen in that case?
> >
> > The case of multiple threads trying to flip bpf_stats_enabled is
> > handled by bpf_stats_enabled_mutex.
>
> So my suggested workload wasn't adversarial enough due to
> bpf_stats_enabled_mutex: how about some other workload that doesn't
> serialize access to text_poke_bp_batch()?

Do you have a specific case in mind that I can test on these big platforms ?

text_poke_bp_batch() calls themselves are serialized by text_mutex, so it
is not clear what you are looking for.

Thanks.
Re: [PATCH] x86/alternatives: remove false sharing in poke_int3_handler()
Posted by Ingo Molnar 10 months, 3 weeks ago
* Eric Dumazet <edumazet@google.com> wrote:

> Do you have a specific case in mind that I can test on these big 
> platforms ?

No. I was thinking of large-scale kprobes or ftrace patching - but you 
are right that the text_mutex should naturally serialize all the 
write-side code here.

Mind adding your second round of test results to the changelog as well, 
which improved per call overhead from 36 to 28 nsecs?

Thanks,

	Ingo
Re: [PATCH] x86/alternatives: remove false sharing in poke_int3_handler()
Posted by Eric Dumazet 10 months, 3 weeks ago
On Mon, Mar 24, 2025 at 9:02 AM Ingo Molnar <mingo@kernel.org> wrote:
>
>
> * Eric Dumazet <edumazet@google.com> wrote:
>
> > Do you have a specific case in mind that I can test on these big
> > platforms ?
>
> No. I was thinking of large-scale kprobes or ftrace patching - but you
> are right that the text_mutex should naturally serialize all the
> write-side code here.
>
> Mind adding your second round of test results to the changelog as well,
> which improved per call overhead from 36 to 28 nsecs?

Sure thing, thanks !

Note the 36 to 28 nsec improvement was on a test host, not really under
production stress.

As all of our production runs the old code, I cannot really tell what the
effective change will be once new kernels are rolled out.

When updating a single, shared atomic_t from 480 cpus (worst-case scenario),
we need more than 40000 cycles per operation.

perf stat atomic_bench -T480

The atomic counter is 21904528, total_cycles=2095231571464, 95652 avg cycles per update
[05]                                                    7866 in [32,64[ cycles (53 avg)
[06]                                                    2196 in [64,128[ cycles (81 avg)
[07]                                                    2942 in [128,256[ cycles (202 avg)
[08]                                                    1865 in [256,512[ cycles (383 avg)
[09]                                                    4251 in [512,1024[ cycles (780 avg)
[10]                                                    72248 in [1024,2048[ cycles (1722 avg)
[11] ***                                                438110 in [2048,4096[ cycles (3217 avg)
[12] ***********                                        1703927 in [4096,8192[ cycles (6199 avg)
[13] **************************                         3869889 in [8192,16384[ cycles (12320 avg)
[14] ***************************                        4040952 in [16384,32768[ cycles (25185 avg)
[15] ************************************************** 7261596 in [32768,65536[ cycles (46884 avg)
[16] ******************                                 2688791 in [65536,131072[ cycles (83552 avg)
[17] *                                                  253104 in [131072,262144[ cycles (189642 avg)
[18] **                                                 326075 in [262144,524288[ cycles (349319 avg)
[19] ******                                             901293 in [524288,1048576[ cycles (890724 avg)
[20] **                                                 321711 in [1048576,2097152[ cycles (1205250 avg)
[21]                                                    6616 in [2097152,4194304[ cycles (2436096 avg)

 Performance counter stats for './atomic_bench -T480':

        964,194.88 msec task-clock                #  467.120 CPUs utilized
            13,795      context-switches          #   14.307 M/sec
               480      cpu-migrations            #    0.498 M/sec
             1,605      page-faults               #    1.665 M/sec
 3,182,241,468,867      cycles                    # 3300416.170 GHz
    11,077,646,267      instructions              #    0.00  insn per cycle
     1,711,894,269      branches                  # 1775466.627 M/sec
         3,747,877      branch-misses             #    0.22% of all branches

       2.064128692 seconds time elapsed
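
atomic_bench is an internal tool; a rough stand-in for what it measures
(N threads hammering one shared atomic and reporting TSC cycles per update)
could look like the sketch below. The numbers and histogram above come from
the real tool, not from this sketch:

	#include <pthread.h>
	#include <stdatomic.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <x86intrin.h>

	#define OPS_PER_THREAD 100000

	static atomic_long shared_counter;	/* the contended cache line */

	static void *worker(void *arg)
	{
		unsigned long long *cycles = arg;
		unsigned long long start = __rdtsc();

		for (int i = 0; i < OPS_PER_THREAD; i++)
			atomic_fetch_add(&shared_counter, 1);

		*cycles = __rdtsc() - start;
		return NULL;
	}

	int main(int argc, char **argv)
	{
		int nthreads = argc > 1 ? atoi(argv[1]) : 480;
		pthread_t *tid = calloc(nthreads, sizeof(*tid));
		unsigned long long *cycles = calloc(nthreads, sizeof(*cycles));
		unsigned long long total = 0;

		for (int i = 0; i < nthreads; i++)
			pthread_create(&tid[i], NULL, worker, &cycles[i]);
		for (int i = 0; i < nthreads; i++) {
			pthread_join(tid[i], NULL);
			total += cycles[i];
		}

		printf("avg cycles per update: %llu\n",
		       total / ((unsigned long long)nthreads * OPS_PER_THREAD));
		return 0;
	}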


I said the atomic_cond_read_acquire(refs, !VAL) path was not taken in my tests,
but it is a valid situation, so we should not add a WARN_ON_ONCE().

I will simply add the unlikely().

Thanks.
Re: [PATCH] x86/alternatives: remove false sharing in poke_int3_handler()
Posted by Eric Dumazet 10 months, 3 weeks ago
On Mon, Mar 24, 2025 at 8:47 AM Eric Dumazet <edumazet@google.com> wrote:
>
> On Mon, Mar 24, 2025 at 8:16 AM Ingo Molnar <mingo@kernel.org> wrote:
> >
> >
> > * Eric Dumazet <edumazet@google.com> wrote:
> >
> > > > What's the adversarial workload here? Spamming bpf_stats_enabled on all
> > > > CPUs in parallel? Or mixing it with some other text_poke_bp_batch()
> > > > user if bpf_stats_enabled serializes access?
> >             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> >
> > > > Does anything undesirable happen in that case?
> > >
> > > The case of multiple threads trying to flip bpf_stats_enabled is
> > > handled by bpf_stats_enabled_mutex.
> >
> > So my suggested workload wasn't adversarial enough due to
> > bpf_stats_enabled_mutex: how about some other workload that doesn't
> > serialize access to text_poke_bp_batch()?
>
> Do you have a specific case in mind that I can test on these big platforms ?
>
> text_poke_bp_batch() calls themselves are serialized by text_mutex, it
> is not clear what you are looking for.


BTW the atomic_cond_read_acquire() part is never called even during my
stress test.

We could add this eventually:

diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index d7afbf822c45..5d364e990055 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -2418,7 +2418,7 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
        for_each_possible_cpu(i) {
                atomic_t *refs = per_cpu_ptr(&bp_refs, i);

-               if (!atomic_dec_and_test(refs))
+               if (unlikely(!atomic_dec_and_test(refs)))
                        atomic_cond_read_acquire(refs, !VAL);
        }
 }
Re: [PATCH] x86/alternatives: remove false sharing in poke_int3_handler()
Posted by Peter Zijlstra 10 months, 3 weeks ago
On Mon, Mar 24, 2025 at 08:53:31AM +0100, Eric Dumazet wrote:

> BTW the atomic_cond_read_acquire() part is never called even during my
> stress test.

Yes, IIRC this is due to text_poke_sync() serializing the state, as that
does a synchronous IPI broadcast, which by necessity requires all
previous INT3 handlers to complete.

You can only hit that case if the INT3 remains after step-3 (IOW you're
actively writing INT3 into the text). This is exceedingly rare.
Re: [PATCH] x86/alternatives: remove false sharing in poke_int3_handler()
Posted by Ingo Molnar 10 months, 2 weeks ago
* Peter Zijlstra <peterz@infradead.org> wrote:

> On Mon, Mar 24, 2025 at 08:53:31AM +0100, Eric Dumazet wrote:
> 
> > BTW the atomic_cond_read_acquire() part is never called even during my
> > stress test.
> 
> Yes, IIRC this is due to text_poke_sync() serializing the state, as that
> does a synchronous IPI broadcast, which by necessity requires all
> previous INT3 handlers to complete.
> 
> You can only hit that case if the INT3 remains after step-3 (IOW you're
> actively writing INT3 into the text). This is exceedingly rare.

Might make sense to add a comment for that.

Also, any strong objections against doing this in the namespace:

  s/bp_/int3_

?

Half of the code already calls it a variant of 'int3', half of it 'bp', 
which I had to think for a couple of seconds goes for breakpoint, not 
base pointer ... ;-)

Might as well standardize on int3_ and call it a day?

Thanks,

	Ingo
Re: [PATCH] x86/alternatives: remove false sharing in poke_int3_handler()
Posted by Peter Zijlstra 10 months, 2 weeks ago
On Tue, Mar 25, 2025 at 09:41:10AM +0100, Ingo Molnar wrote:
> 
> * Peter Zijlstra <peterz@infradead.org> wrote:
> 
> > On Mon, Mar 24, 2025 at 08:53:31AM +0100, Eric Dumazet wrote:
> > 
> > > BTW the atomic_cond_read_acquire() part is never called even during my
> > > stress test.
> > 
> > Yes, IIRC this is due to text_poke_sync() serializing the state, as that
> > does a synchronous IPI broadcast, which by necessity requires all
> > previous INT3 handlers to complete.
> > 
> > You can only hit that case if the INT3 remains after step-3 (IOW you're
> > actively writing INT3 into the text). This is exceedingly rare.
> 
> Might make sense to add a comment for that.

Sure, find below.

> Also, any strong objections against doing this in the namespace:
> 
>   s/bp_/int3_
> 
> ?
> 
> Half of the code already calls it a variant of 'int3', half of it 'bp', 
> which I had to think for a couple of seconds goes for breakpoint, not 
> base pointer ... ;-)

It actually is breakpoint, as in INT3 raises #BP. For complete confusion
the things that are commonly known as debug breakpoints, those things in
DR7, they raise #DB or debug exceptions.

> Might as well standardize on int3_ and call it a day?

Yeah, perhaps. At some point you've got to know that INT3->#BP and
DR7->#DB and it all sorta makes sense, but *shrug* :-)


---
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index bf82c6f7d690..01e94603e767 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -2749,6 +2749,13 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
 
 	/*
 	 * Remove and wait for refs to be zero.
+	 *
+	 * Notably, if after step-3 above the INT3 got removed, then the
+	 * text_poke_sync() will have serialized against any running INT3
+	 * handlers and the below spin-wait will not happen.
+	 *
+	 * IOW. unless the replacement instruction is INT3, this case goes
+	 * unused.
 	 */
 	if (!atomic_dec_and_test(&bp_desc.refs))
 		atomic_cond_read_acquire(&bp_desc.refs, !VAL);
Re: [PATCH] x86/alternatives: remove false sharing in poke_int3_handler()
Posted by Ingo Molnar 10 months, 2 weeks ago
* Peter Zijlstra <peterz@infradead.org> wrote:

> On Tue, Mar 25, 2025 at 09:41:10AM +0100, Ingo Molnar wrote:
> > 
> > * Peter Zijlstra <peterz@infradead.org> wrote:
> > 
> > > On Mon, Mar 24, 2025 at 08:53:31AM +0100, Eric Dumazet wrote:
> > > 
> > > > BTW the atomic_cond_read_acquire() part is never called even during my
> > > > stress test.
> > > 
> > > Yes, IIRC this is due to text_poke_sync() serializing the state, as that
> > > does a synchronous IPI broadcast, which by necessity requires all
> > > previous INT3 handlers to complete.
> > > 
> > > You can only hit that case if the INT3 remains after step-3 (IOW you're
> > > actively writing INT3 into the text). This is exceedingly rare.
> > 
> > Might make sense to add a comment for that.
> 
> Sure, find below.
> 
> > Also, any strong objections against doing this in the namespace:
> > 
> >   s/bp_/int3_
> > 
> > ?
> > 
> > Half of the code already calls it a variant of 'int3', half of it 'bp', 
> > which I had to think for a couple of seconds goes for breakpoint, not 
> > base pointer ... ;-)
> 
> It actually is breakpoint, as in INT3 raises #BP. For complete confusion
> the things that are commonly known as debug breakpoints, those things in
> DR7, they raise #DB or debug exceptions.

Yeah, it's a software breakpoint, swbp, that raises the #BP trap.

'bp' is confusingly aliased (in my brain at least) with 'base pointer' 
register naming and assembler syntax: as in bp, ebp, rbp.

So I'd prefer if it was named consistently:

  text_poke_int3_batch()
  text_poke_int3_handler()
  ...

Not the current mishmash of:

  text_poke_bp_batch()
  poke_int3_handler()
  ...

Does this make more sense?

> > Might as well standardize on int3_ and call it a day?
> 
> Yeah, perhaps. At some point you've got to know that INT3->#BP and
> DR7->#DB and it all sorta makes sense, but *shrug* :-)

Yeah, so I do know what #BP is, but what the heck disambiguates the two 
meanings of _bp and why do we have the above jungle of an inconsistent 
namespace? :-)

Picking _int3 would neatly solve all of that.

> diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
> index bf82c6f7d690..01e94603e767 100644
> --- a/arch/x86/kernel/alternative.c
> +++ b/arch/x86/kernel/alternative.c
> @@ -2749,6 +2749,13 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
>  
>  	/*
>  	 * Remove and wait for refs to be zero.
> +	 *
> +	 * Notably, if after step-3 above the INT3 got removed, then the
> +	 * text_poke_sync() will have serialized against any running INT3
> +	 * handlers and the below spin-wait will not happen.
> +	 *
> +	 * IOW. unless the replacement instruction is INT3, this case goes
> +	 * unused.
>  	 */
>  	if (!atomic_dec_and_test(&bp_desc.refs))
>  		atomic_cond_read_acquire(&bp_desc.refs, !VAL);

Thanks! I stuck this into tip:x86/alternatives, with your SOB.

	Ingo
Re: [PATCH] x86/alternatives: remove false sharing in poke_int3_handler()
Posted by Peter Zijlstra 10 months, 2 weeks ago
On Tue, Mar 25, 2025 at 12:26:36PM +0100, Ingo Molnar wrote:

> Yeah, so I do know what #BP is, but what the heck disambiguates the two 
> meanings of _bp and why do we have the above jungle of an inconsistent 
> namespace? :-)
> 
> Picking _int3 would neatly solve all of that.

Sure; the most obvious case where BP would make sense, the trap entry
point, we already use int3 so yeah, make it int3 throughout.
Re: [PATCH] x86/alternatives: remove false sharing in poke_int3_handler()
Posted by Ingo Molnar 10 months, 2 weeks ago
* Peter Zijlstra <peterz@infradead.org> wrote:

> On Tue, Mar 25, 2025 at 12:26:36PM +0100, Ingo Molnar wrote:
> 
> > Yeah, so I do know what #BP is, but what the heck disambiguates the two 
> > meanings of _bp and why do we have the above jungle of an inconsistent 
> > namespace? :-)
> > 
> > Picking _int3 would neatly solve all of that.
> 
> Sure; the most obvious case where BP would make sense, the trap entry
> point, we already use int3 so yeah, make it int3 throughout.

Okay - I just sent out a series to do this. Found a few other things to 
fix/improve along the way:

	https://lore.kernel.org/r/20250327205355.378659-1-mingo@kernel.org

Thanks,

	Ingo
[tip: x86/alternatives] x86/alternatives: Document the text_poke_bp_batch() synchronization rules a bit more
Posted by tip-bot2 for Peter Zijlstra 10 months, 2 weeks ago
The following commit has been merged into the x86/alternatives branch of tip:

Commit-ID:     451283cd40bceccc02397d0e5b2b6eda6047b1ed
Gitweb:        https://git.kernel.org/tip/451283cd40bceccc02397d0e5b2b6eda6047b1ed
Author:        Peter Zijlstra <peterz@infradead.org>
AuthorDate:    Tue, 25 Mar 2025 11:30:47 +01:00
Committer:     Ingo Molnar <mingo@kernel.org>
CommitterDate: Tue, 25 Mar 2025 12:28:35 +01:00

x86/alternatives: Document the text_poke_bp_batch() synchronization rules a bit more

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Link: https://lore.kernel.org/r/20250325103047.GH36322@noisy.programming.kicks-ass.net
---
 arch/x86/kernel/alternative.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 85089c7..5f44814 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -2751,6 +2751,13 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
 
 	/*
 	 * Remove and wait for refs to be zero.
+	 *
+	 * Notably, if after step-3 above the INT3 got removed, then the
+	 * text_poke_sync() will have serialized against any running INT3
+	 * handlers and the below spin-wait will not happen.
+	 *
+	 * IOW. unless the replacement instruction is INT3, this case goes
+	 * unused.
 	 */
 	for_each_possible_cpu(i) {
 		atomic_t *refs = per_cpu_ptr(&bp_refs, i);
Re: [PATCH] x86/alternatives: remove false sharing in poke_int3_handler()
Posted by Ingo Molnar 10 months, 3 weeks ago
* Eric Dumazet <edumazet@google.com> wrote:

> On Mon, Mar 24, 2025 at 8:47 AM Eric Dumazet <edumazet@google.com> wrote:
> >
> > On Mon, Mar 24, 2025 at 8:16 AM Ingo Molnar <mingo@kernel.org> wrote:
> > >
> > >
> > > * Eric Dumazet <edumazet@google.com> wrote:
> > >
> > > > > What's the adversarial workload here? Spamming bpf_stats_enabled on all
> > > > > CPUs in parallel? Or mixing it with some other text_poke_bp_batch()
> > > > > user if bpf_stats_enabled serializes access?
> > >             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> > >
> > > > > Does anything undesirable happen in that case?
> > > >
> > > > The case of multiple threads trying to flip bpf_stats_enabled is
> > > > handled by bpf_stats_enabled_mutex.
> > >
> > > So my suggested workload wasn't adversarial enough due to
> > > bpf_stats_enabled_mutex: how about some other workload that doesn't
> > > serialize access to text_poke_bp_batch()?
> >
> > Do you have a specific case in mind that I can test on these big platforms ?
> >
> > text_poke_bp_batch() calls themselves are serialized by text_mutex, it
> > is not clear what you are looking for.
> 
> 
> BTW the atomic_cond_read_acquire() part is never called even during my
> stress test.

Yeah, that code threw me off - can it really happen with text_mutex 
serializing all of it?

> @@ -2418,7 +2418,7 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
>         for_each_possible_cpu(i) {
>                 atomic_t *refs = per_cpu_ptr(&bp_refs, i);
> 
> -               if (!atomic_dec_and_test(refs))
> +               if (unlikely(!atomic_dec_and_test(refs)))
>                         atomic_cond_read_acquire(refs, !VAL);

If it could never happen, then should that condition perhaps be a
WARN_ON_ONCE()?

Thanks,

	Ingo