[PATCH] x86/irqflags: Force a register output in native_save_fl()

Eric Dumazet posted 1 patch 1 month, 3 weeks ago
arch/x86/include/asm/irqflags.h | 7 +------
1 file changed, 1 insertion(+), 6 deletions(-)
[PATCH] x86/irqflags: Force a register output in native_save_fl()
Posted by Eric Dumazet 1 month, 3 weeks ago
clang is generating very inefficient code for native_save_fl() which
is used for local_irq_save() in few critical spots.

Allowing the "pop %0" to use memory :

1) forces the compiler to add annoying stack canaries when
   CONFIG_STACKPROTECTOR_STRONG=y in many places.

2) Almost always is followed by an immediate "move memory,register"

One good example is _raw_spin_lock_irqsave, with 8 extra instructions

ffffffff82067a30 <_raw_spin_lock_irqsave>:
ffffffff82067a30:       ...
ffffffff82067a39:       53                      push   %rbx

// Three instructions to ajust the stack, read the per-cpu canary
// and copy it to 8(%rsp)
ffffffff82067a3a:       48 83 ec 10             sub    $0x10,%rsp
ffffffff82067a3e:       65 48 8b 05 da 15 45 02 mov    %gs:0x24515da(%rip),%rax        # <__stack_chk_guard>
ffffffff82067a46:       48 89 44 24 08          mov    %rax,0x8(%rsp)

ffffffff82067a4b:       9c                      pushf

// instead of pop %rbx, compiler uses 2 instructions.
ffffffff82067a4c:       8f 04 24                pop    (%rsp)
ffffffff82067a4f:       48 8b 1c 24             mov    (%rsp),%rbx

ffffffff82067a53:       fa                      cli
ffffffff82067a54:       b9 01 00 00 00          mov    $0x1,%ecx
ffffffff82067a59:       31 c0                   xor    %eax,%eax
ffffffff82067a5b:       f0 0f b1 0f             lock cmpxchg %ecx,(%rdi)
ffffffff82067a5f:       75 1d                   jne    ffffffff82067a7e <_raw_spin_lock_irqsave+0x4e>

// three instructions to check the stack canary
ffffffff82067a61:       65 48 8b 05 b7 15 45 02 mov    %gs:0x24515b7(%rip),%rax        # <__stack_chk_guard>
ffffffff82067a69:       48 3b 44 24 08          cmp    0x8(%rsp),%rax
ffffffff82067a6e:       75 17                   jne    ffffffff82067a87

...

// One extra instruction to adjust the stack.
ffffffff82067a73:       48 83 c4 10             add    $0x10,%rsp
...

// One more instruction in case the stack was mangled.
ffffffff82067a87:       e8 a4 35 ff ff          call   ffffffff8205b030 <__stack_chk_fail>

This patch changes almost nothing for gcc, but saves 23153 bytes
of text with clang, while allowing more functions to be inlined.

$ size vmlinux.gcc.before vmlinux.gcc.after vmlinux.clang.before vmlinux.clang.after
    text     data     bss      dec     hex	filename
45564911 25005415 4708896 75279221 47cab75	vmlinux.gcc.before
45564901 25005414 4708896 75279211 47cab6b	vmlinux.gcc.after
45115647 24638569 5537072 75291288 47cda98	vmlinux.clang.before
45092494 24638585 5545000 75276079 47c9f2f	vmlinux.clang.after

$ scripts/bloat-o-meter -t vmlinux.clang.before vmlinux.clang.after
add/remove: 0/3 grow/shrink: 22/533 up/down: 5162/-25088 (-19926)
Function                                     old     new   delta
__noinstr_text_start                         640    3584   +2944
wakeup_cpu_via_vmgexit                      1002    1447    +445
rcu_tasks_trace_pregp_step                  1052    1454    +402
snp_kexec_finish                            1290    1527    +237
check_all_holdout_tasks_trace                909    1106    +197
x2apic_send_IPI_mask_allbutself               38     198    +160
hpet_set_rtc_irq_bit                         118     265    +147
x2apic_send_IPI_mask                          38     184    +146
ring_buffer_poll_wait                        261     405    +144
rb_watermark_hit                             253     386    +133
...
tcp_wfree                                    402     332     -70
stacktrace_trigger                           133      62     -71
w1_touch_bit                                 418     343     -75
w1_triplet                                   446     370     -76
link_create                                  980     902     -78
drain_dead_softirq_workfn                    425     347     -78
kcryptd_queue_crypt                          253     174     -79
perf_event_aux_pause                         448     368     -80
idle_worker_timeout                          320     240     -80
srcu_funnel_exp_start                        418     333     -85
call_rcu                                     751     666     -85
enable_IR_x2apic                             279     191     -88
bpf_link_free                                432     342     -90
synchronize_rcu                              497     403     -94
identify_cpu                                2665    2569     -96
ftrace_modify_all_code                       355     258     -97
load_gs_index                                212     104    -108
verity_end_io                                369     257    -112
bpf_prog_detach                              672     555    -117
__x2apic_send_IPI_mask                       552     275    -277
snp_cleanup_vmsa                             284       -    -284
__end_rodata                              606208  602112   -4096
Total: Before=28577927, After=28558001, chg -0.07%

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 arch/x86/include/asm/irqflags.h | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index b30e5474c18e1be63b7c69354c26ae6a6cb02731..a06bdc51b6e3d04c06352c28389b10ec66551a0e 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -18,14 +18,9 @@ extern __always_inline unsigned long native_save_fl(void)
 {
 	unsigned long flags;
 
-	/*
-	 * "=rm" is safe here, because "pop" adjusts the stack before
-	 * it evaluates its effective address -- this is part of the
-	 * documented behavior of the "pop" instruction.
-	 */
 	asm volatile("# __raw_save_flags\n\t"
 		     "pushf ; pop %0"
-		     : "=rm" (flags)
+		     : "=r" (flags)
 		     : /* no input */
 		     : "memory");
 
-- 
2.52.0.313.g674ac2bdf7-goog
Re: [PATCH] x86/irqflags: Force a register output in native_save_fl()
Posted by Uros Bizjak 1 month, 2 weeks ago

On 12/18/25 10:54, Eric Dumazet wrote:
> clang is generating very inefficient code for native_save_fl() which
> is used for local_irq_save() in few critical spots.
> 
> Allowing the "pop %0" to use memory :
> 
> 1) forces the compiler to add annoying stack canaries when
>     CONFIG_STACKPROTECTOR_STRONG=y in many places.
> 
> 2) Almost always is followed by an immediate "move memory,register"
> 
> One good example is _raw_spin_lock_irqsave, with 8 extra instructions
> 
> ffffffff82067a30 <_raw_spin_lock_irqsave>:
> ffffffff82067a30:       ...
> ffffffff82067a39:       53                      push   %rbx
> 
> // Three instructions to ajust the stack, read the per-cpu canary
> // and copy it to 8(%rsp)
> ffffffff82067a3a:       48 83 ec 10             sub    $0x10,%rsp
> ffffffff82067a3e:       65 48 8b 05 da 15 45 02 mov    %gs:0x24515da(%rip),%rax        # <__stack_chk_guard>
> ffffffff82067a46:       48 89 44 24 08          mov    %rax,0x8(%rsp)
> 
> ffffffff82067a4b:       9c                      pushf
> 
> // instead of pop %rbx, compiler uses 2 instructions.
> ffffffff82067a4c:       8f 04 24                pop    (%rsp)
> ffffffff82067a4f:       48 8b 1c 24             mov    (%rsp),%rbx
> 
> ffffffff82067a53:       fa                      cli
> ffffffff82067a54:       b9 01 00 00 00          mov    $0x1,%ecx
> ffffffff82067a59:       31 c0                   xor    %eax,%eax
> ffffffff82067a5b:       f0 0f b1 0f             lock cmpxchg %ecx,(%rdi)
> ffffffff82067a5f:       75 1d                   jne    ffffffff82067a7e <_raw_spin_lock_irqsave+0x4e>
> 
> // three instructions to check the stack canary
> ffffffff82067a61:       65 48 8b 05 b7 15 45 02 mov    %gs:0x24515b7(%rip),%rax        # <__stack_chk_guard>
> ffffffff82067a69:       48 3b 44 24 08          cmp    0x8(%rsp),%rax
> ffffffff82067a6e:       75 17                   jne    ffffffff82067a87
> 
> ...
> 
> // One extra instruction to adjust the stack.
> ffffffff82067a73:       48 83 c4 10             add    $0x10,%rsp
> ...
> 
> // One more instruction in case the stack was mangled.
> ffffffff82067a87:       e8 a4 35 ff ff          call   ffffffff8205b030 <__stack_chk_fail>
> 
> This patch changes almost nothing for gcc, but saves 23153 bytes
> of text with clang, while allowing more functions to be inlined.
> 
> $ size vmlinux.gcc.before vmlinux.gcc.after vmlinux.clang.before vmlinux.clang.after
>      text     data     bss      dec     hex	filename
> 45564911 25005415 4708896 75279221 47cab75	vmlinux.gcc.before
> 45564901 25005414 4708896 75279211 47cab6b	vmlinux.gcc.after
> 45115647 24638569 5537072 75291288 47cda98	vmlinux.clang.before
> 45092494 24638585 5545000 75276079 47c9f2f	vmlinux.clang.after
> 
> $ scripts/bloat-o-meter -t vmlinux.clang.before vmlinux.clang.after
> add/remove: 0/3 grow/shrink: 22/533 up/down: 5162/-25088 (-19926)
> Function                                     old     new   delta
> __noinstr_text_start                         640    3584   +2944
> wakeup_cpu_via_vmgexit                      1002    1447    +445
> rcu_tasks_trace_pregp_step                  1052    1454    +402
> snp_kexec_finish                            1290    1527    +237
> check_all_holdout_tasks_trace                909    1106    +197
> x2apic_send_IPI_mask_allbutself               38     198    +160
> hpet_set_rtc_irq_bit                         118     265    +147
> x2apic_send_IPI_mask                          38     184    +146
> ring_buffer_poll_wait                        261     405    +144
> rb_watermark_hit                             253     386    +133
> ...
> tcp_wfree                                    402     332     -70
> stacktrace_trigger                           133      62     -71
> w1_touch_bit                                 418     343     -75
> w1_triplet                                   446     370     -76
> link_create                                  980     902     -78
> drain_dead_softirq_workfn                    425     347     -78
> kcryptd_queue_crypt                          253     174     -79
> perf_event_aux_pause                         448     368     -80
> idle_worker_timeout                          320     240     -80
> srcu_funnel_exp_start                        418     333     -85
> call_rcu                                     751     666     -85
> enable_IR_x2apic                             279     191     -88
> bpf_link_free                                432     342     -90
> synchronize_rcu                              497     403     -94
> identify_cpu                                2665    2569     -96
> ftrace_modify_all_code                       355     258     -97
> load_gs_index                                212     104    -108
> verity_end_io                                369     257    -112
> bpf_prog_detach                              672     555    -117
> __x2apic_send_IPI_mask                       552     275    -277
> snp_cleanup_vmsa                             284       -    -284
> __end_rodata                              606208  602112   -4096
> Total: Before=28577927, After=28558001, chg -0.07%
> 
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> ---
>   arch/x86/include/asm/irqflags.h | 7 +------
>   1 file changed, 1 insertion(+), 6 deletions(-)
> 
> diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
> index b30e5474c18e1be63b7c69354c26ae6a6cb02731..a06bdc51b6e3d04c06352c28389b10ec66551a0e 100644
> --- a/arch/x86/include/asm/irqflags.h
> +++ b/arch/x86/include/asm/irqflags.h
> @@ -18,14 +18,9 @@ extern __always_inline unsigned long native_save_fl(void)
>   {
>   	unsigned long flags;
>   
> -	/*
> -	 * "=rm" is safe here, because "pop" adjusts the stack before
> -	 * it evaluates its effective address -- this is part of the
> -	 * documented behavior of the "pop" instruction.
> -	 */
>   	asm volatile("# __raw_save_flags\n\t"
>   		     "pushf ; pop %0"
> -		     : "=rm" (flags)
> +		     : "=r" (flags)

You should define and use something like ASM_INPUT_RM macro 
(ASM_OUTPUT_RM ?) instead to avoid penalizing GCC. Please see 
include/linux/compiler-clang.h.

Uros.
Re: [PATCH] x86/irqflags: Force a register output in native_save_fl()
Posted by Eric Dumazet 1 month, 2 weeks ago
On Thu, Dec 18, 2025 at 6:41 PM Uros Bizjak <ubizjak@gmail.com> wrote:
>
>

>
> You should define and use something like ASM_INPUT_RM macro
> (ASM_OUTPUT_RM ?) instead to avoid penalizing GCC. Please see
> include/linux/compiler-clang.h.

This is a possibility indeed, thanks.
Re: [PATCH] x86/irqflags: Force a register output in native_save_fl()
Posted by H. Peter Anvin 1 month, 3 weeks ago
On December 18, 2025 1:54:33 AM PST, Eric Dumazet <edumazet@google.com> wrote:
>clang is generating very inefficient code for native_save_fl() which
>is used for local_irq_save() in few critical spots.
>
>Allowing the "pop %0" to use memory :
>
>1) forces the compiler to add annoying stack canaries when
>   CONFIG_STACKPROTECTOR_STRONG=y in many places.
>
>2) Almost always is followed by an immediate "move memory,register"
>
>One good example is _raw_spin_lock_irqsave, with 8 extra instructions
>
>ffffffff82067a30 <_raw_spin_lock_irqsave>:
>ffffffff82067a30:       ...
>ffffffff82067a39:       53                      push   %rbx
>
>// Three instructions to ajust the stack, read the per-cpu canary
>// and copy it to 8(%rsp)
>ffffffff82067a3a:       48 83 ec 10             sub    $0x10,%rsp
>ffffffff82067a3e:       65 48 8b 05 da 15 45 02 mov    %gs:0x24515da(%rip),%rax        # <__stack_chk_guard>
>ffffffff82067a46:       48 89 44 24 08          mov    %rax,0x8(%rsp)
>
>ffffffff82067a4b:       9c                      pushf
>
>// instead of pop %rbx, compiler uses 2 instructions.
>ffffffff82067a4c:       8f 04 24                pop    (%rsp)
>ffffffff82067a4f:       48 8b 1c 24             mov    (%rsp),%rbx
>
>ffffffff82067a53:       fa                      cli
>ffffffff82067a54:       b9 01 00 00 00          mov    $0x1,%ecx
>ffffffff82067a59:       31 c0                   xor    %eax,%eax
>ffffffff82067a5b:       f0 0f b1 0f             lock cmpxchg %ecx,(%rdi)
>ffffffff82067a5f:       75 1d                   jne    ffffffff82067a7e <_raw_spin_lock_irqsave+0x4e>
>
>// three instructions to check the stack canary
>ffffffff82067a61:       65 48 8b 05 b7 15 45 02 mov    %gs:0x24515b7(%rip),%rax        # <__stack_chk_guard>
>ffffffff82067a69:       48 3b 44 24 08          cmp    0x8(%rsp),%rax
>ffffffff82067a6e:       75 17                   jne    ffffffff82067a87
>
>...
>
>// One extra instruction to adjust the stack.
>ffffffff82067a73:       48 83 c4 10             add    $0x10,%rsp
>...
>
>// One more instruction in case the stack was mangled.
>ffffffff82067a87:       e8 a4 35 ff ff          call   ffffffff8205b030 <__stack_chk_fail>
>
>This patch changes almost nothing for gcc, but saves 23153 bytes
>of text with clang, while allowing more functions to be inlined.
>
>$ size vmlinux.gcc.before vmlinux.gcc.after vmlinux.clang.before vmlinux.clang.after
>    text     data     bss      dec     hex	filename
>45564911 25005415 4708896 75279221 47cab75	vmlinux.gcc.before
>45564901 25005414 4708896 75279211 47cab6b	vmlinux.gcc.after
>45115647 24638569 5537072 75291288 47cda98	vmlinux.clang.before
>45092494 24638585 5545000 75276079 47c9f2f	vmlinux.clang.after
>
>$ scripts/bloat-o-meter -t vmlinux.clang.before vmlinux.clang.after
>add/remove: 0/3 grow/shrink: 22/533 up/down: 5162/-25088 (-19926)
>Function                                     old     new   delta
>__noinstr_text_start                         640    3584   +2944
>wakeup_cpu_via_vmgexit                      1002    1447    +445
>rcu_tasks_trace_pregp_step                  1052    1454    +402
>snp_kexec_finish                            1290    1527    +237
>check_all_holdout_tasks_trace                909    1106    +197
>x2apic_send_IPI_mask_allbutself               38     198    +160
>hpet_set_rtc_irq_bit                         118     265    +147
>x2apic_send_IPI_mask                          38     184    +146
>ring_buffer_poll_wait                        261     405    +144
>rb_watermark_hit                             253     386    +133
>...
>tcp_wfree                                    402     332     -70
>stacktrace_trigger                           133      62     -71
>w1_touch_bit                                 418     343     -75
>w1_triplet                                   446     370     -76
>link_create                                  980     902     -78
>drain_dead_softirq_workfn                    425     347     -78
>kcryptd_queue_crypt                          253     174     -79
>perf_event_aux_pause                         448     368     -80
>idle_worker_timeout                          320     240     -80
>srcu_funnel_exp_start                        418     333     -85
>call_rcu                                     751     666     -85
>enable_IR_x2apic                             279     191     -88
>bpf_link_free                                432     342     -90
>synchronize_rcu                              497     403     -94
>identify_cpu                                2665    2569     -96
>ftrace_modify_all_code                       355     258     -97
>load_gs_index                                212     104    -108
>verity_end_io                                369     257    -112
>bpf_prog_detach                              672     555    -117
>__x2apic_send_IPI_mask                       552     275    -277
>snp_cleanup_vmsa                             284       -    -284
>__end_rodata                              606208  602112   -4096
>Total: Before=28577927, After=28558001, chg -0.07%
>
>Signed-off-by: Eric Dumazet <edumazet@google.com>
>---
> arch/x86/include/asm/irqflags.h | 7 +------
> 1 file changed, 1 insertion(+), 6 deletions(-)
>
>diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
>index b30e5474c18e1be63b7c69354c26ae6a6cb02731..a06bdc51b6e3d04c06352c28389b10ec66551a0e 100644
>--- a/arch/x86/include/asm/irqflags.h
>+++ b/arch/x86/include/asm/irqflags.h
>@@ -18,14 +18,9 @@ extern __always_inline unsigned long native_save_fl(void)
> {
> 	unsigned long flags;
> 
>-	/*
>-	 * "=rm" is safe here, because "pop" adjusts the stack before
>-	 * it evaluates its effective address -- this is part of the
>-	 * documented behavior of the "pop" instruction.
>-	 */
> 	asm volatile("# __raw_save_flags\n\t"
> 		     "pushf ; pop %0"
>-		     : "=rm" (flags)
>+		     : "=r" (flags)
> 		     : /* no input */
> 		     : "memory");
> 

Maybe report a bug to the clang team?
Re: [PATCH] x86/irqflags: Force a register output in native_save_fl()
Posted by Eric Dumazet 1 month, 3 weeks ago
On Thu, Dec 18, 2025 at 11:13 AM H. Peter Anvin <hpa@zytor.com> wrote:
>
> On December 18, 2025 1:54:33 AM PST, Eric Dumazet <edumazet@google.com> wrote:
> >clang is generating very inefficient code for native_save_fl() which
> >is used for local_irq_save() in few critical spots.
> >
> >Allowing the "pop %0" to use memory :
> >
> >1) forces the compiler to add annoying stack canaries when
> >   CONFIG_STACKPROTECTOR_STRONG=y in many places.
> >
> >2) Almost always is followed by an immediate "move memory,register"
> >
> >One good example is _raw_spin_lock_irqsave, with 8 extra instructions
> >
> >ffffffff82067a30 <_raw_spin_lock_irqsave>:
> >ffffffff82067a30:       ...
> >ffffffff82067a39:       53                      push   %rbx
> >
> >// Three instructions to ajust the stack, read the per-cpu canary
> >// and copy it to 8(%rsp)
> >ffffffff82067a3a:       48 83 ec 10             sub    $0x10,%rsp
> >ffffffff82067a3e:       65 48 8b 05 da 15 45 02 mov    %gs:0x24515da(%rip),%rax        # <__stack_chk_guard>
> >ffffffff82067a46:       48 89 44 24 08          mov    %rax,0x8(%rsp)
> >
> >ffffffff82067a4b:       9c                      pushf
> >
> >// instead of pop %rbx, compiler uses 2 instructions.
> >ffffffff82067a4c:       8f 04 24                pop    (%rsp)
> >ffffffff82067a4f:       48 8b 1c 24             mov    (%rsp),%rbx
> >
> >ffffffff82067a53:       fa                      cli
> >ffffffff82067a54:       b9 01 00 00 00          mov    $0x1,%ecx
> >ffffffff82067a59:       31 c0                   xor    %eax,%eax
> >ffffffff82067a5b:       f0 0f b1 0f             lock cmpxchg %ecx,(%rdi)
> >ffffffff82067a5f:       75 1d                   jne    ffffffff82067a7e <_raw_spin_lock_irqsave+0x4e>
> >
> >// three instructions to check the stack canary
> >ffffffff82067a61:       65 48 8b 05 b7 15 45 02 mov    %gs:0x24515b7(%rip),%rax        # <__stack_chk_guard>
> >ffffffff82067a69:       48 3b 44 24 08          cmp    0x8(%rsp),%rax
> >ffffffff82067a6e:       75 17                   jne    ffffffff82067a87
> >
> >...
> >
> >// One extra instruction to adjust the stack.
> >ffffffff82067a73:       48 83 c4 10             add    $0x10,%rsp
> >...
> >
> >// One more instruction in case the stack was mangled.
> >ffffffff82067a87:       e8 a4 35 ff ff          call   ffffffff8205b030 <__stack_chk_fail>
> >
> >This patch changes almost nothing for gcc, but saves 23153 bytes
> >of text with clang, while allowing more functions to be inlined.
> >
> >$ size vmlinux.gcc.before vmlinux.gcc.after vmlinux.clang.before vmlinux.clang.after
> >    text     data     bss      dec     hex     filename
> >45564911 25005415 4708896 75279221 47cab75     vmlinux.gcc.before
> >45564901 25005414 4708896 75279211 47cab6b     vmlinux.gcc.after
> >45115647 24638569 5537072 75291288 47cda98     vmlinux.clang.before
> >45092494 24638585 5545000 75276079 47c9f2f     vmlinux.clang.after
> >
> >$ scripts/bloat-o-meter -t vmlinux.clang.before vmlinux.clang.after
> >add/remove: 0/3 grow/shrink: 22/533 up/down: 5162/-25088 (-19926)
> >Function                                     old     new   delta
> >__noinstr_text_start                         640    3584   +2944
> >wakeup_cpu_via_vmgexit                      1002    1447    +445
> >rcu_tasks_trace_pregp_step                  1052    1454    +402
> >snp_kexec_finish                            1290    1527    +237
> >check_all_holdout_tasks_trace                909    1106    +197
> >x2apic_send_IPI_mask_allbutself               38     198    +160
> >hpet_set_rtc_irq_bit                         118     265    +147
> >x2apic_send_IPI_mask                          38     184    +146
> >ring_buffer_poll_wait                        261     405    +144
> >rb_watermark_hit                             253     386    +133
> >...
> >tcp_wfree                                    402     332     -70
> >stacktrace_trigger                           133      62     -71
> >w1_touch_bit                                 418     343     -75
> >w1_triplet                                   446     370     -76
> >link_create                                  980     902     -78
> >drain_dead_softirq_workfn                    425     347     -78
> >kcryptd_queue_crypt                          253     174     -79
> >perf_event_aux_pause                         448     368     -80
> >idle_worker_timeout                          320     240     -80
> >srcu_funnel_exp_start                        418     333     -85
> >call_rcu                                     751     666     -85
> >enable_IR_x2apic                             279     191     -88
> >bpf_link_free                                432     342     -90
> >synchronize_rcu                              497     403     -94
> >identify_cpu                                2665    2569     -96
> >ftrace_modify_all_code                       355     258     -97
> >load_gs_index                                212     104    -108
> >verity_end_io                                369     257    -112
> >bpf_prog_detach                              672     555    -117
> >__x2apic_send_IPI_mask                       552     275    -277
> >snp_cleanup_vmsa                             284       -    -284
> >__end_rodata                              606208  602112   -4096
> >Total: Before=28577927, After=28558001, chg -0.07%
> >
> >Signed-off-by: Eric Dumazet <edumazet@google.com>
> >---
> > arch/x86/include/asm/irqflags.h | 7 +------
> > 1 file changed, 1 insertion(+), 6 deletions(-)
> >
> >diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
> >index b30e5474c18e1be63b7c69354c26ae6a6cb02731..a06bdc51b6e3d04c06352c28389b10ec66551a0e 100644
> >--- a/arch/x86/include/asm/irqflags.h
> >+++ b/arch/x86/include/asm/irqflags.h
> >@@ -18,14 +18,9 @@ extern __always_inline unsigned long native_save_fl(void)
> > {
> >       unsigned long flags;
> >
> >-      /*
> >-       * "=rm" is safe here, because "pop" adjusts the stack before
> >-       * it evaluates its effective address -- this is part of the
> >-       * documented behavior of the "pop" instruction.
> >-       */
> >       asm volatile("# __raw_save_flags\n\t"
> >                    "pushf ; pop %0"
> >-                   : "=rm" (flags)
> >+                   : "=r" (flags)
> >                    : /* no input */
> >                    : "memory");
> >
>
> Maybe report a bug to the clang team?

This has been done 8 years ago. No progress so far...

They claim we should use __builtin_ia32_readeflags_u64 /
__builtin_ia32_writeeflags_u64, which seems unlikely to please x86
maintainers ?
Re: [PATCH] x86/irqflags: Force a register output in native_save_fl()
Posted by Eric Dumazet 1 month, 3 weeks ago
On Thu, Dec 18, 2025 at 11:15 AM Eric Dumazet <edumazet@google.com> wrote:
>
> On Thu, Dec 18, 2025 at 11:13 AM H. Peter Anvin <hpa@zytor.com> wrote:
> >

> >
> > Maybe report a bug to the clang team?
>
> This has been done 8 years ago. No progress so far...
>
> They claim we should use __builtin_ia32_readeflags_u64 /
> __builtin_ia32_writeeflags_u64, which seems unlikely to please x86
> maintainers ?

Although it seems we already use them in tools/testing/selftests/x86/helpers.h
Re: [PATCH] x86/irqflags: Force a register output in native_save_fl()
Posted by Peter Zijlstra 1 month, 3 weeks ago
On Thu, Dec 18, 2025 at 11:15:37AM +0100, Eric Dumazet wrote:

> > >--- a/arch/x86/include/asm/irqflags.h
> > >+++ b/arch/x86/include/asm/irqflags.h
> > >@@ -18,14 +18,9 @@ extern __always_inline unsigned long native_save_fl(void)
> > > {
> > >       unsigned long flags;
> > >
> > >-      /*
> > >-       * "=rm" is safe here, because "pop" adjusts the stack before
> > >-       * it evaluates its effective address -- this is part of the
> > >-       * documented behavior of the "pop" instruction.
> > >-       */
> > >       asm volatile("# __raw_save_flags\n\t"
> > >                    "pushf ; pop %0"
> > >-                   : "=rm" (flags)
> > >+                   : "=r" (flags)
> > >                    : /* no input */
> > >                    : "memory");
> > >
> >
> > Maybe report a bug to the clang team?
> 
> This has been done 8 years ago. No progress so far...
> 
> They claim we should use __builtin_ia32_readeflags_u64 /
> __builtin_ia32_writeeflags_u64, which seems unlikely to please x86
> maintainers ?

Yeah, because the "rm" thing not being supported properly goes way
further than just this one case. And it really is abysmal that clang
hasn't managed to implement it in all that time :/
Re: [PATCH] x86/irqflags: Force a register output in native_save_fl()
Posted by Eric Dumazet 1 month, 3 weeks ago
On Thu, Dec 18, 2025 at 11:22 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Thu, Dec 18, 2025 at 11:15:37AM +0100, Eric Dumazet wrote:
>
> > > >--- a/arch/x86/include/asm/irqflags.h
> > > >+++ b/arch/x86/include/asm/irqflags.h
> > > >@@ -18,14 +18,9 @@ extern __always_inline unsigned long native_save_fl(void)
> > > > {
> > > >       unsigned long flags;
> > > >
> > > >-      /*
> > > >-       * "=rm" is safe here, because "pop" adjusts the stack before
> > > >-       * it evaluates its effective address -- this is part of the
> > > >-       * documented behavior of the "pop" instruction.
> > > >-       */
> > > >       asm volatile("# __raw_save_flags\n\t"
> > > >                    "pushf ; pop %0"
> > > >-                   : "=rm" (flags)
> > > >+                   : "=r" (flags)
> > > >                    : /* no input */
> > > >                    : "memory");
> > > >
> > >
> > > Maybe report a bug to the clang team?
> >
> > This has been done 8 years ago. No progress so far...
> >
> > They claim we should use __builtin_ia32_readeflags_u64 /
> > __builtin_ia32_writeeflags_u64, which seems unlikely to please x86
> > maintainers ?
>
> Yeah, because the "rm" thing not being supported properly goes way
> further than just this one case. And it really is abysmal that clang
> hasn't managed to implement it in all that time :/

Let me know if you prefer a V2 with this instead. It saves ~190 bytes
more with clang ;)

Thanks !

diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index b30e5474c18e1be63b7c69354c26ae6a6cb02731..0209ae149f2b5f9ddacc6c8066a6d3191696ec42
100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -16,20 +16,11 @@
 extern inline unsigned long native_save_fl(void);
 extern __always_inline unsigned long native_save_fl(void)
 {
-       unsigned long flags;
-
-       /*
-        * "=rm" is safe here, because "pop" adjusts the stack before
-        * it evaluates its effective address -- this is part of the
-        * documented behavior of the "pop" instruction.
-        */
-       asm volatile("# __raw_save_flags\n\t"
-                    "pushf ; pop %0"
-                    : "=rm" (flags)
-                    : /* no input */
-                    : "memory");
-
-       return flags;
+#ifdef CONFIG_X86_64
+       return __builtin_ia32_readeflags_u64();
+#else
+       return __builtin_ia32_readeflags_u32();
+#endif
 }

 static __always_inline void native_irq_disable(void)