[PATCH RESEND -tip v2] x86/asm: Use asm_inline() instead of asm() in __untagged_addr()

Uros Bizjak posted 1 patch 8 months, 1 week ago
arch/x86/include/asm/uaccess_64.h | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
[PATCH RESEND -tip v2] x86/asm: Use asm_inline() instead of asm() in __untagged_addr()
Posted by Uros Bizjak 8 months, 1 week ago
Use asm_inline() to instruct the compiler that the size of asm()
is the minimum size of one instruction, ignoring how many instructions
the compiler thinks it is. ALTERNATIVE macro that expands to several
pseudo directives causes instruction length estimate to count
more than 20 instructions.

bloat-o-meter reports minimal code size increase
(x86_64 defconfig with CONFIG_ADDRESS_MASKING, gcc-14.2.1):

  add/remove: 2/2 grow/shrink: 5/1 up/down: 2365/-1995 (370)

	Function                          old     new   delta
	-----------------------------------------------------
	do_get_mempolicy                    -    1449   +1449
	copy_nodes_to_user                  -     226    +226
	__x64_sys_get_mempolicy            35     213    +178
	syscall_user_dispatch_set_config  157     332    +175
	__ia32_sys_get_mempolicy           31     206    +175
	set_syscall_user_dispatch          29     181    +152
	__do_sys_mremap                  2073    2083     +10
	sp_insert                         133     117     -16
	task_set_syscall_user_dispatch    172       -    -172
	kernel_get_mempolicy             1807       -   -1807

  Total: Before=21423151, After=21423521, chg +0.00%

The code size increase is due to the compiler inlining
more functions that inline untagged_addr(), e.g:

task_set_syscall_user_dispatch() is now fully inlined in
set_syscall_user_dispatch():

	000000000010b7e0 <set_syscall_user_dispatch>:
	  10b7e0:	f3 0f 1e fa          	endbr64
	  10b7e4:	49 89 c8             	mov    %rcx,%r8
	  10b7e7:	48 89 d1             	mov    %rdx,%rcx
	  10b7ea:	48 89 f2             	mov    %rsi,%rdx
	  10b7ed:	48 89 fe             	mov    %rdi,%rsi
	  10b7f0:	65 48 8b 3d 00 00 00 	mov    %gs:0x0(%rip),%rdi
	  10b7f7:	00
	  10b7f8:	e9 03 fe ff ff       	jmp    10b600 <task_set_syscall_user_dispatch>

that after inlining becomes:

	000000000010b730 <set_syscall_user_dispatch>:
	  10b730:	f3 0f 1e fa          	endbr64
	  10b734:	65 48 8b 05 00 00 00 	mov    %gs:0x0(%rip),%rax
	  10b73b:	00
	  10b73c:	48 85 ff             	test   %rdi,%rdi
	  10b73f:	74 54                	je     10b795 <set_syscall_user_dispatch+0x65>
	  10b741:	48 83 ff 01          	cmp    $0x1,%rdi
	  10b745:	74 06                	je     10b74d <set_syscall_user_dispatch+0x1d>
	  10b747:	b8 ea ff ff ff       	mov    $0xffffffea,%eax
	  10b74c:	c3                   	ret
	  10b74d:	48 85 f6             	test   %rsi,%rsi
	  10b750:	75 7b                	jne    10b7cd <set_syscall_user_dispatch+0x9d>
	  10b752:	48 85 c9             	test   %rcx,%rcx
	  10b755:	74 1a                	je     10b771 <set_syscall_user_dispatch+0x41>
	  10b757:	48 89 cf             	mov    %rcx,%rdi
	  10b75a:	49 b8 ef cd ab 89 67 	movabs $0x123456789abcdef,%r8
	  10b761:	45 23 01
	  10b764:	90                   	nop
	  10b765:	90                   	nop
	  10b766:	90                   	nop
	  10b767:	90                   	nop
	  10b768:	90                   	nop
	  10b769:	90                   	nop
	  10b76a:	90                   	nop
	  10b76b:	90                   	nop
	  10b76c:	49 39 f8             	cmp    %rdi,%r8
	  10b76f:	72 6e                	jb     10b7df <set_syscall_user_dispatch+0xaf>
	  10b771:	48 89 88 48 08 00 00 	mov    %rcx,0x848(%rax)
	  10b778:	48 89 b0 50 08 00 00 	mov    %rsi,0x850(%rax)
	  10b77f:	48 89 90 58 08 00 00 	mov    %rdx,0x858(%rax)
	  10b786:	c6 80 60 08 00 00 00 	movb   $0x0,0x860(%rax)
	  10b78d:	f0 80 48 08 20       	lock orb $0x20,0x8(%rax)
	  10b792:	31 c0                	xor    %eax,%eax
	  10b794:	c3                   	ret
	  10b795:	48 09 d1             	or     %rdx,%rcx
	  10b798:	48 09 f1             	or     %rsi,%rcx
	  10b79b:	75 aa                	jne    10b747 <set_syscall_user_dispatch+0x17>
	  10b79d:	48 c7 80 48 08 00 00 	movq   $0x0,0x848(%rax)
	  10b7a4:	00 00 00 00
	  10b7a8:	48 c7 80 50 08 00 00 	movq   $0x0,0x850(%rax)
	  10b7af:	00 00 00 00
	  10b7b3:	48 c7 80 58 08 00 00 	movq   $0x0,0x858(%rax)
	  10b7ba:	00 00 00 00
	  10b7be:	c6 80 60 08 00 00 00 	movb   $0x0,0x860(%rax)
	  10b7c5:	f0 80 60 08 df       	lock andb $0xdf,0x8(%rax)
	  10b7ca:	31 c0                	xor    %eax,%eax
	  10b7cc:	c3                   	ret
	  10b7cd:	48 8d 3c 16          	lea    (%rsi,%rdx,1),%rdi
	  10b7d1:	48 39 fe             	cmp    %rdi,%rsi
	  10b7d4:	0f 82 78 ff ff ff    	jb     10b752 <set_syscall_user_dispatch+0x22>
	  10b7da:	e9 68 ff ff ff       	jmp    10b747 <set_syscall_user_dispatch+0x17>
	  10b7df:	b8 f2 ff ff ff       	mov    $0xfffffff2,%eax
	  10b7e4:	c3                   	ret

Please note a series of NOPs that get replaced with an alternative:

	    11f0:	65 48 23 05 00 00 00 	and    %gs:0x0(%rip),%rax
	    11f7:	00

Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
---
v2: Include asm dumps of inlining in the commit message.
---
 arch/x86/include/asm/uaccess_64.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index c52f0133425b..3c1bec3a0405 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -26,8 +26,8 @@ extern unsigned long USER_PTR_MAX;
  */
 static inline unsigned long __untagged_addr(unsigned long addr)
 {
-	asm (ALTERNATIVE("",
-			 "and " __percpu_arg([mask]) ", %[addr]", X86_FEATURE_LAM)
+	asm_inline (ALTERNATIVE("", "and " __percpu_arg([mask]) ", %[addr]",
+				X86_FEATURE_LAM)
 	     : [addr] "+r" (addr)
 	     : [mask] "m" (__my_cpu_var(tlbstate_untag_mask)));
 
-- 
2.49.0
[tip: x86/asm] x86/uaccess: Use asm_inline() instead of asm() in __untagged_addr()
Posted by tip-bot2 for Uros Bizjak 8 months ago
The following commit has been merged into the x86/asm branch of tip:

Commit-ID:     4850074ff06fce894a9946c5d3ab8ea13fa33e43
Gitweb:        https://git.kernel.org/tip/4850074ff06fce894a9946c5d3ab8ea13fa33e43
Author:        Uros Bizjak <ubizjak@gmail.com>
AuthorDate:    Mon, 07 Apr 2025 09:21:04 +02:00
Committer:     Ingo Molnar <mingo@kernel.org>
CommitterDate: Sun, 13 Apr 2025 21:12:04 +02:00

x86/uaccess: Use asm_inline() instead of asm() in __untagged_addr()

Use asm_inline() to instruct the compiler that the size of asm()
is the minimum size of one instruction, ignoring how many instructions
the compiler thinks it is. ALTERNATIVE macro that expands to several
pseudo directives causes instruction length estimate to count
more than 20 instructions.

bloat-o-meter reports minimal code size increase
(x86_64 defconfig with CONFIG_ADDRESS_MASKING, gcc-14.2.1):

  add/remove: 2/2 grow/shrink: 5/1 up/down: 2365/-1995 (370)

	Function                          old     new   delta
	-----------------------------------------------------
	do_get_mempolicy                    -    1449   +1449
	copy_nodes_to_user                  -     226    +226
	__x64_sys_get_mempolicy            35     213    +178
	syscall_user_dispatch_set_config  157     332    +175
	__ia32_sys_get_mempolicy           31     206    +175
	set_syscall_user_dispatch          29     181    +152
	__do_sys_mremap                  2073    2083     +10
	sp_insert                         133     117     -16
	task_set_syscall_user_dispatch    172       -    -172
	kernel_get_mempolicy             1807       -   -1807

  Total: Before=21423151, After=21423521, chg +0.00%

The code size increase is due to the compiler inlining
more functions that inline untagged_addr(), e.g:

task_set_syscall_user_dispatch() is now fully inlined in
set_syscall_user_dispatch():

	000000000010b7e0 <set_syscall_user_dispatch>:
	  10b7e0:	f3 0f 1e fa          	endbr64
	  10b7e4:	49 89 c8             	mov    %rcx,%r8
	  10b7e7:	48 89 d1             	mov    %rdx,%rcx
	  10b7ea:	48 89 f2             	mov    %rsi,%rdx
	  10b7ed:	48 89 fe             	mov    %rdi,%rsi
	  10b7f0:	65 48 8b 3d 00 00 00 	mov    %gs:0x0(%rip),%rdi
	  10b7f7:	00
	  10b7f8:	e9 03 fe ff ff       	jmp    10b600 <task_set_syscall_user_dispatch>

that after inlining becomes:

	000000000010b730 <set_syscall_user_dispatch>:
	  10b730:	f3 0f 1e fa          	endbr64
	  10b734:	65 48 8b 05 00 00 00 	mov    %gs:0x0(%rip),%rax
	  10b73b:	00
	  10b73c:	48 85 ff             	test   %rdi,%rdi
	  10b73f:	74 54                	je     10b795 <set_syscall_user_dispatch+0x65>
	  10b741:	48 83 ff 01          	cmp    $0x1,%rdi
	  10b745:	74 06                	je     10b74d <set_syscall_user_dispatch+0x1d>
	  10b747:	b8 ea ff ff ff       	mov    $0xffffffea,%eax
	  10b74c:	c3                   	ret
	  10b74d:	48 85 f6             	test   %rsi,%rsi
	  10b750:	75 7b                	jne    10b7cd <set_syscall_user_dispatch+0x9d>
	  10b752:	48 85 c9             	test   %rcx,%rcx
	  10b755:	74 1a                	je     10b771 <set_syscall_user_dispatch+0x41>
	  10b757:	48 89 cf             	mov    %rcx,%rdi
	  10b75a:	49 b8 ef cd ab 89 67 	movabs $0x123456789abcdef,%r8
	  10b761:	45 23 01
	  10b764:	90                   	nop
	  10b765:	90                   	nop
	  10b766:	90                   	nop
	  10b767:	90                   	nop
	  10b768:	90                   	nop
	  10b769:	90                   	nop
	  10b76a:	90                   	nop
	  10b76b:	90                   	nop
	  10b76c:	49 39 f8             	cmp    %rdi,%r8
	  10b76f:	72 6e                	jb     10b7df <set_syscall_user_dispatch+0xaf>
	  10b771:	48 89 88 48 08 00 00 	mov    %rcx,0x848(%rax)
	  10b778:	48 89 b0 50 08 00 00 	mov    %rsi,0x850(%rax)
	  10b77f:	48 89 90 58 08 00 00 	mov    %rdx,0x858(%rax)
	  10b786:	c6 80 60 08 00 00 00 	movb   $0x0,0x860(%rax)
	  10b78d:	f0 80 48 08 20       	lock orb $0x20,0x8(%rax)
	  10b792:	31 c0                	xor    %eax,%eax
	  10b794:	c3                   	ret
	  10b795:	48 09 d1             	or     %rdx,%rcx
	  10b798:	48 09 f1             	or     %rsi,%rcx
	  10b79b:	75 aa                	jne    10b747 <set_syscall_user_dispatch+0x17>
	  10b79d:	48 c7 80 48 08 00 00 	movq   $0x0,0x848(%rax)
	  10b7a4:	00 00 00 00
	  10b7a8:	48 c7 80 50 08 00 00 	movq   $0x0,0x850(%rax)
	  10b7af:	00 00 00 00
	  10b7b3:	48 c7 80 58 08 00 00 	movq   $0x0,0x858(%rax)
	  10b7ba:	00 00 00 00
	  10b7be:	c6 80 60 08 00 00 00 	movb   $0x0,0x860(%rax)
	  10b7c5:	f0 80 60 08 df       	lock andb $0xdf,0x8(%rax)
	  10b7ca:	31 c0                	xor    %eax,%eax
	  10b7cc:	c3                   	ret
	  10b7cd:	48 8d 3c 16          	lea    (%rsi,%rdx,1),%rdi
	  10b7d1:	48 39 fe             	cmp    %rdi,%rsi
	  10b7d4:	0f 82 78 ff ff ff    	jb     10b752 <set_syscall_user_dispatch+0x22>
	  10b7da:	e9 68 ff ff ff       	jmp    10b747 <set_syscall_user_dispatch+0x17>
	  10b7df:	b8 f2 ff ff ff       	mov    $0xfffffff2,%eax
	  10b7e4:	c3                   	ret

Please note a series of NOPs that get replaced with an alternative:

	    11f0:	65 48 23 05 00 00 00 	and    %gs:0x0(%rip),%rax
	    11f7:	00

Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: https://lore.kernel.org/r/20250407072129.33440-1-ubizjak@gmail.com
---
 arch/x86/include/asm/uaccess_64.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index 4c13883..c8a5ae3 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -26,8 +26,8 @@ extern unsigned long USER_PTR_MAX;
  */
 static inline unsigned long __untagged_addr(unsigned long addr)
 {
-	asm (ALTERNATIVE("",
-			 "and " __percpu_arg([mask]) ", %[addr]", X86_FEATURE_LAM)
+	asm_inline (ALTERNATIVE("", "and " __percpu_arg([mask]) ", %[addr]",
+				X86_FEATURE_LAM)
 	     : [addr] "+r" (addr)
 	     : [mask] "m" (__my_cpu_var(tlbstate_untag_mask)));