[PATCH v2] x86/asm: Use asm_inline() instead of asm() in __untagged_addr()
Posted by Uros Bizjak 9 months ago
Use asm_inline() to instruct the compiler that the size of the asm()
statement is the minimum size of one instruction, regardless of how
many instructions the compiler estimates it to be. The ALTERNATIVE
macro expands to several pseudo directives, which causes the
instruction length estimate to count more than 20 instructions.
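
For reference, asm_inline() is defined in include/linux/compiler_types.h
roughly as follows (simplified here; the exact definition depends on
compiler support for the "asm inline" extension):

	/* With compiler support, the asm body is treated as having
	 * the minimum possible size for inlining decisions. */
	#ifdef CONFIG_CC_HAS_ASM_INLINE
	#define asm_inline asm __inline
	#else
	#define asm_inline asm
	#endif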

bloat-o-meter reports a minimal code size increase
(x86_64 defconfig with CONFIG_ADDRESS_MASKING, gcc-14.2.1):

  add/remove: 2/2 grow/shrink: 5/1 up/down: 2365/-1995 (370)

	Function                          old     new   delta
	-----------------------------------------------------
	do_get_mempolicy                    -    1449   +1449
	copy_nodes_to_user                  -     226    +226
	__x64_sys_get_mempolicy            35     213    +178
	syscall_user_dispatch_set_config  157     332    +175
	__ia32_sys_get_mempolicy           31     206    +175
	set_syscall_user_dispatch          29     181    +152
	__do_sys_mremap                  2073    2083     +10
	sp_insert                         133     117     -16
	task_set_syscall_user_dispatch    172       -    -172
	kernel_get_mempolicy             1807       -   -1807

  Total: Before=21423151, After=21423521, chg +0.00%

The code size increase is due to the compiler inlining more
functions that themselves inline untagged_addr(), e.g.:

task_set_syscall_user_dispatch() is now fully inlined in
set_syscall_user_dispatch():

	000000000010b7e0 <set_syscall_user_dispatch>:
	  10b7e0:	f3 0f 1e fa          	endbr64
	  10b7e4:	49 89 c8             	mov    %rcx,%r8
	  10b7e7:	48 89 d1             	mov    %rdx,%rcx
	  10b7ea:	48 89 f2             	mov    %rsi,%rdx
	  10b7ed:	48 89 fe             	mov    %rdi,%rsi
	  10b7f0:	65 48 8b 3d 00 00 00 	mov    %gs:0x0(%rip),%rdi
	  10b7f7:	00
	  10b7f8:	e9 03 fe ff ff       	jmp    10b600 <task_set_syscall_user_dispatch>

which after inlining becomes:

	000000000010b730 <set_syscall_user_dispatch>:
	  10b730:	f3 0f 1e fa          	endbr64
	  10b734:	65 48 8b 05 00 00 00 	mov    %gs:0x0(%rip),%rax
	  10b73b:	00
	  10b73c:	48 85 ff             	test   %rdi,%rdi
	  10b73f:	74 54                	je     10b795 <set_syscall_user_dispatch+0x65>
	  10b741:	48 83 ff 01          	cmp    $0x1,%rdi
	  10b745:	74 06                	je     10b74d <set_syscall_user_dispatch+0x1d>
	  10b747:	b8 ea ff ff ff       	mov    $0xffffffea,%eax
	  10b74c:	c3                   	ret
	  10b74d:	48 85 f6             	test   %rsi,%rsi
	  10b750:	75 7b                	jne    10b7cd <set_syscall_user_dispatch+0x9d>
	  10b752:	48 85 c9             	test   %rcx,%rcx
	  10b755:	74 1a                	je     10b771 <set_syscall_user_dispatch+0x41>
	  10b757:	48 89 cf             	mov    %rcx,%rdi
	  10b75a:	49 b8 ef cd ab 89 67 	movabs $0x123456789abcdef,%r8
	  10b761:	45 23 01
	  10b764:	90                   	nop
	  10b765:	90                   	nop
	  10b766:	90                   	nop
	  10b767:	90                   	nop
	  10b768:	90                   	nop
	  10b769:	90                   	nop
	  10b76a:	90                   	nop
	  10b76b:	90                   	nop
	  10b76c:	49 39 f8             	cmp    %rdi,%r8
	  10b76f:	72 6e                	jb     10b7df <set_syscall_user_dispatch+0xaf>
	  10b771:	48 89 88 48 08 00 00 	mov    %rcx,0x848(%rax)
	  10b778:	48 89 b0 50 08 00 00 	mov    %rsi,0x850(%rax)
	  10b77f:	48 89 90 58 08 00 00 	mov    %rdx,0x858(%rax)
	  10b786:	c6 80 60 08 00 00 00 	movb   $0x0,0x860(%rax)
	  10b78d:	f0 80 48 08 20       	lock orb $0x20,0x8(%rax)
	  10b792:	31 c0                	xor    %eax,%eax
	  10b794:	c3                   	ret
	  10b795:	48 09 d1             	or     %rdx,%rcx
	  10b798:	48 09 f1             	or     %rsi,%rcx
	  10b79b:	75 aa                	jne    10b747 <set_syscall_user_dispatch+0x17>
	  10b79d:	48 c7 80 48 08 00 00 	movq   $0x0,0x848(%rax)
	  10b7a4:	00 00 00 00
	  10b7a8:	48 c7 80 50 08 00 00 	movq   $0x0,0x850(%rax)
	  10b7af:	00 00 00 00
	  10b7b3:	48 c7 80 58 08 00 00 	movq   $0x0,0x858(%rax)
	  10b7ba:	00 00 00 00
	  10b7be:	c6 80 60 08 00 00 00 	movb   $0x0,0x860(%rax)
	  10b7c5:	f0 80 60 08 df       	lock andb $0xdf,0x8(%rax)
	  10b7ca:	31 c0                	xor    %eax,%eax
	  10b7cc:	c3                   	ret
	  10b7cd:	48 8d 3c 16          	lea    (%rsi,%rdx,1),%rdi
	  10b7d1:	48 39 fe             	cmp    %rdi,%rsi
	  10b7d4:	0f 82 78 ff ff ff    	jb     10b752 <set_syscall_user_dispatch+0x22>
	  10b7da:	e9 68 ff ff ff       	jmp    10b747 <set_syscall_user_dispatch+0x17>
	  10b7df:	b8 f2 ff ff ff       	mov    $0xfffffff2,%eax
	  10b7e4:	c3                   	ret

Please note the series of NOPs that gets replaced by the alternative instruction:

	    11f0:	65 48 23 05 00 00 00 	and    %gs:0x0(%rip),%rax
	    11f7:	00
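
In C terms, the patched-in AND is roughly equivalent to the following
sketch (illustrative only; untag_sketch() is a made-up name):

	/*
	 * Illustrative sketch only (untag_sketch() is not a kernel
	 * function): with X86_FEATURE_LAM patched in, the alternative
	 * ANDs the address with the per-CPU tlbstate_untag_mask,
	 * clearing the LAM tag bits.
	 */
	static inline unsigned long untag_sketch(unsigned long addr,
						 unsigned long untag_mask)
	{
		return addr & untag_mask;
	}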

Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
---
v2: Include asm dumps of inlining in the commit message.
---
 arch/x86/include/asm/uaccess_64.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index c52f0133425b..3c1bec3a0405 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -26,8 +26,8 @@ extern unsigned long USER_PTR_MAX;
  */
 static inline unsigned long __untagged_addr(unsigned long addr)
 {
-	asm (ALTERNATIVE("",
-			 "and " __percpu_arg([mask]) ", %[addr]", X86_FEATURE_LAM)
+	asm_inline (ALTERNATIVE("", "and " __percpu_arg([mask]) ", %[addr]",
+				X86_FEATURE_LAM)
 	     : [addr] "+r" (addr)
 	     : [mask] "m" (__my_cpu_var(tlbstate_untag_mask)));
 
-- 
2.48.1
Re: [PATCH v2] x86/asm: Use asm_inline() instead of asm() in __untagged_addr()
Posted by Ingo Molnar 9 months ago
* Uros Bizjak <ubizjak@gmail.com> wrote:

> Use asm_inline() to instruct the compiler that the size of the asm()
> statement is the minimum size of one instruction, regardless of how
> many instructions the compiler estimates it to be. The ALTERNATIVE
> macro expands to several pseudo directives, which causes the
> instruction length estimate to count more than 20 instructions.
> 
> bloat-o-meter reports a minimal code size increase
> (x86_64 defconfig with CONFIG_ADDRESS_MASKING, gcc-14.2.1):
> 
>   add/remove: 2/2 grow/shrink: 5/1 up/down: 2365/-1995 (370)
> 
> 	Function                          old     new   delta
> 	-----------------------------------------------------
> 	do_get_mempolicy                    -    1449   +1449
> 	copy_nodes_to_user                  -     226    +226
> 	__x64_sys_get_mempolicy            35     213    +178
> 	syscall_user_dispatch_set_config  157     332    +175
> 	__ia32_sys_get_mempolicy           31     206    +175
> 	set_syscall_user_dispatch          29     181    +152
> 	__do_sys_mremap                  2073    2083     +10
> 	sp_insert                         133     117     -16
> 	task_set_syscall_user_dispatch    172       -    -172
> 	kernel_get_mempolicy             1807       -   -1807
> 
>   Total: Before=21423151, After=21423521, chg +0.00%
> 
> The code size increase is due to the compiler inlining more
> functions that themselves inline untagged_addr(), e.g.:
> 
> task_set_syscall_user_dispatch() is now fully inlined in
> set_syscall_user_dispatch():
> 
> 	000000000010b7e0 <set_syscall_user_dispatch>:
> 	  10b7e0:	f3 0f 1e fa          	endbr64
> 	  10b7e4:	49 89 c8             	mov    %rcx,%r8
> 	  10b7e7:	48 89 d1             	mov    %rdx,%rcx
> 	  10b7ea:	48 89 f2             	mov    %rsi,%rdx
> 	  10b7ed:	48 89 fe             	mov    %rdi,%rsi
> 	  10b7f0:	65 48 8b 3d 00 00 00 	mov    %gs:0x0(%rip),%rdi
> 	  10b7f7:	00
> 	  10b7f8:	e9 03 fe ff ff       	jmp    10b600 <task_set_syscall_user_dispatch>

So this was a tail-call optimization that jumped to a standalone 
<task_set_syscall_user_dispatch>, right? So now we'll avoid the 
tail-jump and maybe a bit of the register parameter shuffling? Which 
would explain the bloat-o-meter delta of -172 for 
task_set_syscall_user_dispatch?

Could you also cite the first relevant bits of <task_set_syscall_user_dispatch>?

I don't seem to be able to reproduce this inlining decision, my version 
of GCC is:

  gcc version 14.2.0 (Ubuntu 14.2.0-4ubuntu2) 

which is one patch version older than your 14.2.1.

I tried GCC 11, 12, 13 as well, but none did this tail-call 
optimization; all appear to inline <task_set_syscall_user_dispatch> 
into <set_syscall_user_dispatch>. What am I missing?

Another question, where do the size increases in these functions come 
from:

>       __x64_sys_get_mempolicy            35     213    +178
>       syscall_user_dispatch_set_config  157     332    +175
>       __ia32_sys_get_mempolicy           31     206    +175
>       set_syscall_user_dispatch          29     181    +152

(I have to ask, because I have trouble reproducing with my toolchain so 
I cannot look at this myself.)

Thanks,

	Ingo
Re: [PATCH v2] x86/asm: Use asm_inline() instead of asm() in __untagged_addr()
Posted by Uros Bizjak 9 months ago
On Tue, Mar 18, 2025 at 12:21 AM Ingo Molnar <mingo@kernel.org> wrote:
>
>
> * Uros Bizjak <ubizjak@gmail.com> wrote:
>
> > Use asm_inline() to instruct the compiler that the size of the asm()
> > statement is the minimum size of one instruction, regardless of how
> > many instructions the compiler estimates it to be. The ALTERNATIVE
> > macro expands to several pseudo directives, which causes the
> > instruction length estimate to count more than 20 instructions.
> >
> > bloat-o-meter reports a minimal code size increase
> > (x86_64 defconfig with CONFIG_ADDRESS_MASKING, gcc-14.2.1):
> >
> >   add/remove: 2/2 grow/shrink: 5/1 up/down: 2365/-1995 (370)
> >
> >       Function                          old     new   delta
> >       -----------------------------------------------------
> >       do_get_mempolicy                    -    1449   +1449
> >       copy_nodes_to_user                  -     226    +226
> >       __x64_sys_get_mempolicy            35     213    +178
> >       syscall_user_dispatch_set_config  157     332    +175
> >       __ia32_sys_get_mempolicy           31     206    +175
> >       set_syscall_user_dispatch          29     181    +152
> >       __do_sys_mremap                  2073    2083     +10
> >       sp_insert                         133     117     -16
> >       task_set_syscall_user_dispatch    172       -    -172
> >       kernel_get_mempolicy             1807       -   -1807
> >
> >   Total: Before=21423151, After=21423521, chg +0.00%
> >
> > The code size increase is due to the compiler inlining more
> > functions that themselves inline untagged_addr(), e.g.:
> >
> > task_set_syscall_user_dispatch() is now fully inlined in
> > set_syscall_user_dispatch():
> >
> >       000000000010b7e0 <set_syscall_user_dispatch>:
> >         10b7e0:       f3 0f 1e fa             endbr64
> >         10b7e4:       49 89 c8                mov    %rcx,%r8
> >         10b7e7:       48 89 d1                mov    %rdx,%rcx
> >         10b7ea:       48 89 f2                mov    %rsi,%rdx
> >         10b7ed:       48 89 fe                mov    %rdi,%rsi
> >         10b7f0:       65 48 8b 3d 00 00 00    mov    %gs:0x0(%rip),%rdi
> >         10b7f7:       00
> >         10b7f8:       e9 03 fe ff ff          jmp    10b600 <task_set_syscall_user_dispatch>
>
> So this was a tail-call optimization that jumped to a standalone
> <task_set_syscall_user_dispatch>, right? So now we'll avoid the
> tail-jump and maybe a bit of the register parameter shuffling? Which
> would explain the bloat-o-meter delta of -172 for
> task_set_syscall_user_dispatch?

Yes, this is correct. Register shuffling is part of the call setup
(because parameters have to be passed to the called function in
specific registers, as dictated by the psABI). Inlining avoids this
because parameters can then be freely allocated to any register of
the required class.
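
As a toy example (not kernel code), compiling something like the
following with gcc -O2 shows the effect: the tail call forces z, x and
y into %rdi, %rsi and %rdx before the jmp, whereas with foo() inlined
the values can stay wherever the register allocator put them:

/* Toy example only: bar() tail-calls foo(), so its arguments must be
   rotated into the psABI argument registers before the jmp. */
extern long foo(long a, long b, long c);

long bar(long x, long y, long z)
{
	return foo(z, x, y);
}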

> Could you also cite the first relevant bits of <task_set_syscall_user_dispatch>?

000000000010b600 <task_set_syscall_user_dispatch>:
  10b600:    48 89 f8                 mov    %rdi,%rax
  10b603:    48 85 f6                 test   %rsi,%rsi
  10b606:    74 54                    je     10b65c <task_set_syscall_user_dispatch+0x5c>
  10b608:    48 83 fe 01              cmp    $0x1,%rsi
  10b60c:    74 06                    je     10b614 <task_set_syscall_user_dispatch+0x14>
  10b60e:    b8 ea ff ff ff           mov    $0xffffffea,%eax
  10b613:    c3                       ret
  10b614:    48 85 d2                 test   %rdx,%rdx
  10b617:    75 7b                    jne    10b694 <task_set_syscall_user_dispatch+0x94>
  10b619:    4d 85 c0                 test   %r8,%r8
  10b61c:    74 1a                    je     10b638 <task_set_syscall_user_dispatch+0x38>
  10b61e:    4c 89 c6                 mov    %r8,%rsi
  10b621:    48 bf ef cd ab 89 67     movabs $0x123456789abcdef,%rdi
  10b628:    45 23 01
  10b62b:    90                       nop
  10b62c:    90                       nop
  10b62d:    90                       nop
  10b62e:    90                       nop
  10b62f:    90                       nop
  10b630:    90                       nop
  10b631:    90                       nop
  10b632:    90                       nop
  10b633:    48 39 f7                 cmp    %rsi,%rdi
  10b636:    72 6e                    jb     10b6a6 <task_set_syscall_user_dispatch+0xa6>
  10b638:    4c 89 80 48 08 00 00     mov    %r8,0x848(%rax)
  10b63f:    48 89 90 50 08 00 00     mov    %rdx,0x850(%rax)
  10b646:    48 89 88 58 08 00 00     mov    %rcx,0x858(%rax)
  10b64d:    c6 80 60 08 00 00 00     movb   $0x0,0x860(%rax)
  10b654:    f0 80 48 08 20           lock orb $0x20,0x8(%rax)
  10b659:    31 c0                    xor    %eax,%eax
  10b65b:    c3                       ret
  10b65c:    49 09 c8                 or     %rcx,%r8
  10b65f:    49 09 d0                 or     %rdx,%r8
  10b662:    75 aa                    jne    10b60e <task_set_syscall_user_dispatch+0xe>
  10b664:    48 c7 87 48 08 00 00     movq   $0x0,0x848(%rdi)
  10b66b:    00 00 00 00
  10b66f:    48 c7 87 50 08 00 00     movq   $0x0,0x850(%rdi)
  10b676:    00 00 00 00
  10b67a:    48 c7 87 58 08 00 00     movq   $0x0,0x858(%rdi)
  10b681:    00 00 00 00
  10b685:    c6 87 60 08 00 00 00     movb   $0x0,0x860(%rdi)
  10b68c:    f0 80 67 08 df           lock andb $0xdf,0x8(%rdi)
  10b691:    31 c0                    xor    %eax,%eax
  10b693:    c3                       ret
  10b694:    48 8d 34 0a              lea    (%rdx,%rcx,1),%rsi
  10b698:    48 39 f2                 cmp    %rsi,%rdx
  10b69b:    0f 82 78 ff ff ff        jb     10b619 <task_set_syscall_user_dispatch+0x19>
  10b6a1:    e9 68 ff ff ff           jmp    10b60e <task_set_syscall_user_dispatch+0xe>
  10b6a6:    b8 f2 ff ff ff           mov    $0xfffffff2,%eax
  10b6ab:    c3                       ret

> I don't seem to be able to reproduce this inlining decision, my version
> of GCC is:
>
>   gcc version 14.2.0 (Ubuntu 14.2.0-4ubuntu2)
>
> which is one patch version older than your 14.2.1.
>
> I tried GCC 11, 12, 13 as well, but none did this tail-call
> optimization; all appear to inline <task_set_syscall_user_dispatch>
> into <set_syscall_user_dispatch>. What am I missing?

CONFIG_ADDRESS_MASKING in current -tip depends on:

  Depends on: X86_64 [=y] && (COMPILE_TEST [=n] || !CPU_MITIGATIONS [=y])

and I chose to disable CPU_MITIGATIONS (this is why no __pfx
functions are reported and the compiler is able to perform the
sibling-call optimization).

BTW: I find the cited functions a nice illustration of how the
compiler inlines bigger functions into smaller ones (one would expect
the opposite), while still respecting the code size growth criteria
for the combined function.

> Another question, where do the size increases in these functions come
> from:
>
> >       __x64_sys_get_mempolicy            35     213    +178

This one now inlines kernel_get_mempolicy().

> >       syscall_user_dispatch_set_config  157     332    +175

This one now inlines task_set_syscall_user_dispatch().

> >       __ia32_sys_get_mempolicy           31     206    +175

This one now inlines kernel_get_mempolicy().

> >       set_syscall_user_dispatch          29     181    +152

This one now inlines task_set_syscall_user_dispatch().

> (I have to ask, because I have trouble reproducing with my toolchain so
> I cannot look at this myself.)

BTW: a good indication of inlining changes w.r.t. __untagged_addr() is
the number of references to the tlbstate_untag_mask variable in the
.altinstr_replacement section; if it goes up, more inlining has
happened. Also, bloat-o-meter indicates which functions changed, and
looking for a string of 8 NOPs in a function shows where the AND
instruction is to be patched in.

Uros.