[PATCH] x86: Optimize __get_user_asm() assembly
Posted by Uros Bizjak 5 days, 8 hours ago
On x86, byte and word moves (MOVB, MOVW) write only to the low
portion of a 32-bit register, leaving the upper bits unchanged.
Modern compilers therefore prefer using MOVZBL and MOVZWL to load
8-bit and 16-bit values with zero-extension to the full register
width.
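
For example, a plain C byte load already compiles to a zero-extending
move (illustrative only, gcc -O2 on x86-64):

    unsigned int load_u8 (const unsigned char *p)
    {
        return *p;	/* emits e.g.: movzbl (%rdi),%eax */
    }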

Update __get_user_asm() to follow this convention by explicitly
zero-extending 8-bit and 16-bit loads from memory.

An additional benefit of this change is that it enables the full
integer register set to be used for 8-bit loads. Also, it
eliminates the need for manual zero-extension of 8-bit values.

There is only a minimal increase in code size:

      text     data     bss      dec     hex filename
  28258526  4810554  740932 33810012  03e65c vmlinux-old.o
  28258550  4810554  740932 33810036  03e674 vmlinux-new.o

Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
-
---
 arch/x86/include/asm/uaccess.h | 40 +++++++++++++++-------------------
 1 file changed, 17 insertions(+), 23 deletions(-)

diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 367297b188c3..cb463c1301f6 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -260,30 +260,27 @@ do {									\
 	unsigned int __gu_low, __gu_high;				\
 	const unsigned int __user *__gu_ptr;				\
 	__gu_ptr = (const void __user *)(ptr);				\
-	__get_user_asm(__gu_low, __gu_ptr, "l", "=r", label);		\
-	__get_user_asm(__gu_high, __gu_ptr+1, "l", "=r", label);	\
+	__get_user_asm(__gu_low, __gu_ptr, "movl", "", label);		\
+	__get_user_asm(__gu_high, __gu_ptr+1, "movl", "", label);	\
 	(x) = ((unsigned long long)__gu_high << 32) | __gu_low;		\
 } while (0)
 #else
 #define __get_user_asm_u64(x, ptr, label)				\
-	__get_user_asm(x, ptr, "q", "=r", label)
+	__get_user_asm(x, ptr, "movq", "", label)
 #endif
 
 #define __get_user_size(x, ptr, size, label)				\
 do {									\
 	__chk_user_ptr(ptr);						\
 	switch (size) {							\
-	case 1:	{							\
-		unsigned char x_u8__;					\
-		__get_user_asm(x_u8__, ptr, "b", "=q", label);		\
-		(x) = x_u8__;						\
+	case 1:								\
+		__get_user_asm(x, ptr, "movzbl", "k", label);		\
 		break;							\
-	}								\
 	case 2:								\
-		__get_user_asm(x, ptr, "w", "=r", label);		\
+		__get_user_asm(x, ptr, "movzwl", "k", label);		\
 		break;							\
 	case 4:								\
-		__get_user_asm(x, ptr, "l", "=r", label);		\
+		__get_user_asm(x, ptr, "movl", "", label);		\
 		break;							\
 	case 8:								\
 		__get_user_asm_u64(x, ptr, label);			\
@@ -294,11 +291,11 @@ do {									\
 	instrument_get_user(x);						\
 } while (0)
 
-#define __get_user_asm(x, addr, itype, ltype, label)			\
+#define __get_user_asm(x, addr, insn, opmod, label)			\
 	asm_goto_output("\n"						\
-		     "1:	mov"itype" %[umem],%[output]\n"		\
+		     "1:	" insn " %[umem],%" opmod "[output]\n"	\
 		     _ASM_EXTABLE_UA(1b, %l2)				\
-		     : [output] ltype(x)				\
+		     : [output] "=r" (x)				\
 		     : [umem] "m" (__m(addr))				\
 		     : : label)
 
@@ -326,26 +323,23 @@ do {									\
 })
 
 #else
-#define __get_user_asm_u64(x, ptr, retval) \
-	 __get_user_asm(x, ptr, retval, "q")
+#define __get_user_asm_u64(x, ptr, retval)				\
+	__get_user_asm(x, ptr, "movq", "", retval)
 #endif
 
 #define __get_user_size(x, ptr, size, retval)				\
 do {									\
-	unsigned char x_u8__;						\
-									\
 	retval = 0;							\
 	__chk_user_ptr(ptr);						\
 	switch (size) {							\
 	case 1:								\
-		__get_user_asm(x_u8__, ptr, retval, "b");		\
-		(x) = x_u8__;						\
+		 __get_user_asm(x, ptr, "movzbl", "k", retval);		\
 		break;							\
 	case 2:								\
-		__get_user_asm(x, ptr, retval, "w");			\
+		__get_user_asm(x, ptr, "movzwl", "k", retval);		\
 		break;							\
 	case 4:								\
-		__get_user_asm(x, ptr, retval, "l");			\
+		__get_user_asm(x, ptr, "movl", "", retval);		\
 		break;							\
 	case 8:								\
 		__get_user_asm_u64(x, ptr, retval);			\
@@ -355,9 +349,9 @@ do {									\
 	}								\
 } while (0)
 
-#define __get_user_asm(x, addr, err, itype)				\
+#define __get_user_asm(x, addr, insn, opmod, err)			\
 	asm volatile("\n"						\
-		     "1:	mov"itype" %[umem],%[output]\n"		\
+		     "1:	" insn " %[umem],%" opmod "[output]\n"	\
 		     "2:\n"						\
 		     _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG | \
 					   EX_FLAG_CLEAR_AX,		\
-- 
2.52.0
Re: [PATCH] x86: Optimize __get_user_asm() assembly
Posted by david laight 5 days, 6 hours ago
On Wed, 26 Nov 2025 14:23:32 +0100
Uros Bizjak <ubizjak@gmail.com> wrote:

> On x86, byte and word moves (MOVB, MOVW) write only to the low
> portion of a 32-bit register, leaving the upper bits unchanged.
> Modern compilers therefore prefer using MOVZBL and MOVZWL to load
> 8-bit and 16-bit values with zero-extension to the full register
> width.
> 
> Update __get_user_asm() to follow this convention by explicitly
> zero-extending 8-bit and 16-bit loads from memory.
> 
> An additional benefit of this change is that it enables the full
> integer register set to be used for 8-bit loads. Also, it
> eliminates the need for manual zero-extension of 8-bit values.
> 
> There is only a minimal increase in code size:

Interesting, where does that come from.
I'd have thought it should shrink it.
The mov[sz]x is one byte longer.
Perhaps a lot of 8-bit values are never zero-extended?

I'm trying to remember what the magic 'k' is for.
Does gas treat %krxx as %dxx ?
Which would matter if 'x' is 64bit when using "=r" for 32bit reads.
But, in that case, I think you need it on the 'movl' as well.

There is the slight problem (which affects for of the asm) is that
gcc (I think) can't assume that the high 32bits of a register result
from an inline asm function are zero.
But if you return 'u64' then everything at the call site has to
be extended as well.
I wonder (not looked at the generated code yet) whether casting
the u64 result from the function to u32 has the effect of avoiding
the zero extend if the value is needed as a 64bit one.
This may (or may not) affect get_user() but will affect the 'bit scan'
functions.

This means you might get better code if 'x' is always u64 (with gas
generating instructions that write to the low 32 bits - zero the high ones)
but with a cast to u32 before any value can be used (esp when inlined).
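
Something like this (a rough sketch, not tested; 'x' and 'ptr' as in
the existing macros):

	u64 __gu_val;
	asm ("movzbl %[umem],%k[output]"
	     : [output] "=r" (__gu_val) : [umem] "m" (*ptr));
	(x) = (u32)__gu_val;	/* cast back to u32 before any use */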

Of course, I might be smoking substances again...

	David

> 
>       text     data     bss      dec     hex filename
>   28258526  4810554  740932 33810012  03e65c vmlinux-old.o
>   28258550  4810554  740932 33810036  03e674 vmlinux-new.o
> 
> Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Cc: Ingo Molnar <mingo@kernel.org>
> Cc: Borislav Petkov <bp@alien8.de>
> Cc: Dave Hansen <dave.hansen@linux.intel.com>
> Cc: "H. Peter Anvin" <hpa@zytor.com>
> -
> ---
>  arch/x86/include/asm/uaccess.h | 40 +++++++++++++++-------------------
>  1 file changed, 17 insertions(+), 23 deletions(-)
> 
> diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
> index 367297b188c3..cb463c1301f6 100644
> --- a/arch/x86/include/asm/uaccess.h
> +++ b/arch/x86/include/asm/uaccess.h
> @@ -260,30 +260,27 @@ do {									\
>  	unsigned int __gu_low, __gu_high;				\
>  	const unsigned int __user *__gu_ptr;				\
>  	__gu_ptr = (const void __user *)(ptr);				\
> -	__get_user_asm(__gu_low, __gu_ptr, "l", "=r", label);		\
> -	__get_user_asm(__gu_high, __gu_ptr+1, "l", "=r", label);	\
> +	__get_user_asm(__gu_low, __gu_ptr, "movl", "", label);		\
> +	__get_user_asm(__gu_high, __gu_ptr+1, "movl", "", label);	\
>  	(x) = ((unsigned long long)__gu_high << 32) | __gu_low;		\
>  } while (0)
>  #else
>  #define __get_user_asm_u64(x, ptr, label)				\
> -	__get_user_asm(x, ptr, "q", "=r", label)
> +	__get_user_asm(x, ptr, "movq", "", label)
>  #endif
>  
>  #define __get_user_size(x, ptr, size, label)				\
>  do {									\
>  	__chk_user_ptr(ptr);						\
>  	switch (size) {							\
> -	case 1:	{							\
> -		unsigned char x_u8__;					\
> -		__get_user_asm(x_u8__, ptr, "b", "=q", label);		\
> -		(x) = x_u8__;						\
> +	case 1:								\
> +		__get_user_asm(x, ptr, "movzbl", "k", label);		\
>  		break;							\
> -	}								\
>  	case 2:								\
> -		__get_user_asm(x, ptr, "w", "=r", label);		\
> +		__get_user_asm(x, ptr, "movzwl", "k", label);		\
>  		break;							\
>  	case 4:								\
> -		__get_user_asm(x, ptr, "l", "=r", label);		\
> +		__get_user_asm(x, ptr, "movl", "", label);		\
>  		break;							\
>  	case 8:								\
>  		__get_user_asm_u64(x, ptr, label);			\
> @@ -294,11 +291,11 @@ do {									\
>  	instrument_get_user(x);						\
>  } while (0)
>  
> -#define __get_user_asm(x, addr, itype, ltype, label)			\
> +#define __get_user_asm(x, addr, insn, opmod, label)			\
>  	asm_goto_output("\n"						\
> -		     "1:	mov"itype" %[umem],%[output]\n"		\
> +		     "1:	" insn " %[umem],%" opmod "[output]\n"	\
>  		     _ASM_EXTABLE_UA(1b, %l2)				\
> -		     : [output] ltype(x)				\
> +		     : [output] "=r" (x)				\
>  		     : [umem] "m" (__m(addr))				\
>  		     : : label)
>  
> @@ -326,26 +323,23 @@ do {									\
>  })
>  
>  #else
> -#define __get_user_asm_u64(x, ptr, retval) \
> -	 __get_user_asm(x, ptr, retval, "q")
> +#define __get_user_asm_u64(x, ptr, retval)				\
> +	__get_user_asm(x, ptr, "movq", "", retval)
>  #endif
>  
>  #define __get_user_size(x, ptr, size, retval)				\
>  do {									\
> -	unsigned char x_u8__;						\
> -									\
>  	retval = 0;							\
>  	__chk_user_ptr(ptr);						\
>  	switch (size) {							\
>  	case 1:								\
> -		__get_user_asm(x_u8__, ptr, retval, "b");		\
> -		(x) = x_u8__;						\
> +		 __get_user_asm(x, ptr, "movzbl", "k", retval);		\
>  		break;							\
>  	case 2:								\
> -		__get_user_asm(x, ptr, retval, "w");			\
> +		__get_user_asm(x, ptr, "movzwl", "k", retval);		\
>  		break;							\
>  	case 4:								\
> -		__get_user_asm(x, ptr, retval, "l");			\
> +		__get_user_asm(x, ptr, "movl", "", retval);		\
>  		break;							\
>  	case 8:								\
>  		__get_user_asm_u64(x, ptr, retval);			\
> @@ -355,9 +349,9 @@ do {									\
>  	}								\
>  } while (0)
>  
> -#define __get_user_asm(x, addr, err, itype)				\
> +#define __get_user_asm(x, addr, insn, opmod, err)			\
>  	asm volatile("\n"						\
> -		     "1:	mov"itype" %[umem],%[output]\n"		\
> +		     "1:	" insn " %[umem],%" opmod "[output]\n"	\
>  		     "2:\n"						\
>  		     _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG | \
>  					   EX_FLAG_CLEAR_AX,		\
Re: [PATCH] x86: Optimize __get_user_asm() assembly
Posted by Uros Bizjak 5 days, 5 hours ago
On Wed, Nov 26, 2025 at 4:18 PM david laight <david.laight@runbox.com> wrote:
>
> On Wed, 26 Nov 2025 14:23:32 +0100
> Uros Bizjak <ubizjak@gmail.com> wrote:
>
> > On x86, byte and word moves (MOVB, MOVW) write only to the low
> > portion of a 32-bit register, leaving the upper bits unchanged.
> > Modern compilers therefore prefer using MOVZBL and MOVZWL to load
> > 8-bit and 16-bit values with zero-extension to the full register
> > width.
> >
> > Update __get_user_asm() to follow this convention by explicitly
> > zero-extending 8-bit and 16-bit loads from memory.
> >
> > An additional benefit of this change is that it enables the full
> > integer register set to be used for 8-bit loads. Also, it
> > eliminates the need for manual zero-extension of 8-bit values.
> >
> > There is only a minimal increase in code size:
>
> Interesting, where does that come from.
> I'd have thought it should shrink it.
> The mov[sz]x is one byte longer.
> Perhaps a lot of 8-bit values are never zero-extended?

bloat-o-meter says:

add/remove: 0/0 grow/shrink: 4/0 up/down: 16/0 (16)
Function                                     old     new   delta
strncpy_from_kernel_nofault                  151     158      +7
copy_from_kernel_nofault                     207     214      +7
strncpy_from_user                            218     219      +1
fault_in_readable                            150     151      +1
Total: Before=23919018, After=23919034, chg +0.00%

where the difference in copy_from_kernel_nofault() is expected:

-       66 8b 45 00             mov    0x0(%rbp),%ax
+       0f b7 45 00             movzwl 0x0(%rbp),%eax
...
-       8a 45 00                mov    0x0(%rbp),%al
+       0f b6 45 00             movzbl 0x0(%rbp),%eax

and then some NOPs at the end due to function entry alignment.

> I'm trying to remember what the magic 'k' is for.
> Does gas treat %krxx as %dxx ?
> Which would matter if 'x' is 64bit when using "=r" for 32bit reads.
> But, in that case, I think you need it on the 'movl' as well.

%k is a gcc operand modifier:

‘k’        Print the SImode name of the register.

that overrides operand width and prints a 32-bit register name.
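
For example (names invented):

    unsigned short w, mem;
    asm ("movzwl %[umem],%k[output]" : [output] "=r" (w) : [umem] "m" (mem));

gcc then prints e.g. %eax instead of %ax for the 16-bit output operand.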

> There is a slight problem (which affects a lot of the asm):
> gcc (I think) can't assume that the high 32bits of a register result
> from an inline asm function are zero.

No, the compiler can't assume that. It doesn't know that highpart is
zero and ...

> But if you return 'u64' then everything at the call site has to
> be extended as well.

... has to zero-extend from u32 to u64 even if we know highpart is zero.
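
E.g. (illustrative):

u64 bar (u64 (*f) (void))
{
    return (u32)f ();
}

still emits a "movl %eax,%eax" after the call to clear the highpart,
even when the highpart is already known to be zero.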

> I wonder (not looked at the generated code yet) whether casting
> the u64 result from the function to u32 has the effect of avoiding
> the zero extend if the value is needed as a 64bit one.

No, it still has to follow C promotion rules, so e.g.:

unsigned int foo (unsigned int a)
{
    return (unsigned char)a;
}

always extends function arguments:

foo:
       movzbl  %dil, %eax
       ret

Uros.
Re: [PATCH] x86: Optimize __get_user_asm() assembly
Posted by david laight 5 days, 2 hours ago
On Wed, 26 Nov 2025 17:22:36 +0100
Uros Bizjak <ubizjak@gmail.com> wrote:

> On Wed, Nov 26, 2025 at 4:18 PM david laight <david.laight@runbox.com> wrote:
> >
> > On Wed, 26 Nov 2025 14:23:32 +0100
> > Uros Bizjak <ubizjak@gmail.com> wrote:
> >  
> > > On x86, byte and word moves (MOVB, MOVW) write only to the low
> > > portion of a 32-bit register, leaving the upper bits unchanged.
> > > Modern compilers therefore prefer using MOVZBL and MOVZWL to load
> > > 8-bit and 16-bit values with zero-extension to the full register
> > > width.
> > >
> > > Update __get_user_asm() to follow this convention by explicitly
> > > zero-extending 8-bit and 16-bit loads from memory.
> > >
> > > An additional benefit of this change is that it enables the full
> > > integer register set to be used for 8-bit loads. Also, it
> > > eliminates the need for manual zero-extension of 8-bit values.
> > >
> > > There is only a minimal increase in code size:  
> >
> > Interesting, where does that come from.
> > I'd have thought it should shrink it.
> > The mov[sz]x is one byte longer.
> > Perhaps a lot of 8-bit values are never zero-extended?  
> 
> bloat-o-meter says:
> 
> add/remove: 0/0 grow/shrink: 4/0 up/down: 16/0 (16)
> Function                                     old     new   delta
> strncpy_from_kernel_nofault                  151     158      +7
> copy_from_kernel_nofault                     207     214      +7
> strncpy_from_user                            218     219      +1
> fault_in_readable                            150     151      +1
> Total: Before=23919018, After=23919034, chg +0.00%
> 
> where the difference in copy_from_kernel_nofault() is expected:
> 
> -       66 8b 45 00             mov    0x0(%rbp),%ax
> +       0f b7 45 00             movzwl 0x0(%rbp),%eax
> ...
> -       8a 45 00                mov    0x0(%rbp),%al
> +       0f b6 45 00             movzbl 0x0(%rbp),%eax
> 
> and then some NOPs at the end due to function entry alignment.
> 
> > I'm trying to remember what the magic 'k' is for.
> > Does gas treat %krxx as %dxx ?
> > Which would matter if 'x' is 64bit when using "=r" for 32bit reads.
> > But, in that case, I think you need it on the 'movl' as well.  
> 
> %k is a gcc operand modifier:
> 
> ‘k’        Print the SImode name of the register.
> 
> that overrides operand width and prints a 32-bit register name.

I can never find anything in that bit of the manual.
It would be easier if there were a separate section for each architecture.

So it does just change %rxx to %exx.

But the u32 code is much the same - so I doubt you need it at all.

Looks at the full file....
The 'x' parameter is __gu_val from __inttype(*(ptr)) __gu_val;
__inttype() is a 'horrid mess' that generates char/short/long/long long.

(So the "k"s are actually changing the 16bit %bx to the 32bit %ebx (etc).)

But that isn't what you need for the 'movzbl' (etc).
At most you need to pick between 'long' and 'long long'.
Put the "k" in and you can use 'long long' all the time.

But you lose at the final step:
	(x) = (__force __typeof__(*(ptr)))__gu_val;
all your hard work goes away.

If the usage was:
	val = unsafe_get_user(ptr, err_label);
you could return the 32bit register containing the byte value.
If the calling code assigned it to an int, then you'd save the zero extend.

This means the entire patch is pointless - even if you change the type
of 'x' to be 32bit for the size 8/16 cases.

It also explains why there are no ripple through changes.

> 
> > There is a slight problem (which affects a lot of the asm):
> > gcc (I think) can't assume that the high 32bits of a register result
> > from an inline asm function are zero.  
> 
> No, the compiler can't assume that. It doesn't know that highpart is
> zero and ...
> 
> > But if you return 'u64' then everything at the call site has to
> > be extended as well.  
> 
> ... has to zero-extend from u32 to u64 even if we know highpart is zero.

I realised that later - so you can't win :-(

	David
Re: [PATCH] x86: Optimize __get_user_asm() assembly
Posted by Uros Bizjak 5 days, 1 hour ago
On Wed, Nov 26, 2025 at 8:23 PM david laight <david.laight@runbox.com> wrote:
>
> On Wed, 26 Nov 2025 17:22:36 +0100
> Uros Bizjak <ubizjak@gmail.com> wrote:
>
> > On Wed, Nov 26, 2025 at 4:18 PM david laight <david.laight@runbox.com> wrote:
> > >
> > > On Wed, 26 Nov 2025 14:23:32 +0100
> > > Uros Bizjak <ubizjak@gmail.com> wrote:
> > >
> > > > On x86, byte and word moves (MOVB, MOVW) write only to the low
> > > > portion of a 32-bit register, leaving the upper bits unchanged.
> > > > Modern compilers therefore prefer using MOVZBL and MOVZWL to load
> > > > 8-bit and 16-bit values with zero-extension to the full register
> > > > width.
> > > >
> > > > Update __get_user_asm() to follow this convention by explicitly
> > > > zero-extending 8-bit and 16-bit loads from memory.
> > > >
> > > > An additional benefit of this change is that it enables the full
> > > > integer register set to be used for 8-bit loads. Also, it
> > > > eliminates the need for manual zero-extension of 8-bit values.
> > > >
> > > > There is only a minimal increase in code size:
> > >
> > > Interesting, where does that come from.
> > > I'd have thought it should shrink it.
> > > The mov[sz]x is one byte longer.
> > > Perhaps a lot of 8-bit values are never zero-extended?
> >
> > bloat-o-meter says:
> >
> > add/remove: 0/0 grow/shrink: 4/0 up/down: 16/0 (16)
> > Function                                     old     new   delta
> > strncpy_from_kernel_nofault                  151     158      +7
> > copy_from_kernel_nofault                     207     214      +7
> > strncpy_from_user                            218     219      +1
> > fault_in_readable                            150     151      +1
> > Total: Before=23919018, After=23919034, chg +0.00%
> >
> > where the difference in copy_from_kernel_nofault() is expected:
> >
> > -       66 8b 45 00             mov    0x0(%rbp),%ax
> > +       0f b7 45 00             movzwl 0x0(%rbp),%eax
> > ...
> > -       8a 45 00                mov    0x0(%rbp),%al
> > +       0f b6 45 00             movzbl 0x0(%rbp),%eax
> >
> > and then some NOPs at the end due to function entry alignment.
> >
> > > I'm trying to remember what the magic 'k' is for.
> > > Does gas treat %krxx as %dxx ?
> > > Which would matter if 'x' is 64bit when using "=r" for 32bit reads.
> > > But, in that case, I think you need it on the 'movl' as well.
> >
> > %k is a gcc operand modifier:
> >
> > ‘k’        Print the SImode name of the register.
> >
> > that overrides operand width and prints a 32-bit register name.
>
> I can never find anything in that bit of the manual.
> It would be easier if there were a separate section for each architecture.

You are looking for [1].

[1] https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html#x86-Operand-Modifiers

> So it does just change %rxx to %exx.

Actually it changes e.g. ax to eax, bl to ebx (and rcx to ecx, but we
don't use this).

> But the u32 code is much the same - so I doubt you need it at all.

Yes, the patch specifically changes 8-bit and 16-bit loads.

> Looks at the full file....
> The 'x' parameter is __gu_val from __inttype(*(ptr)) __gu_val;
> __inttype() is a 'horrid mess' that generates char/short/long/long long.
>
> (So the "k"s are actually changing the 16bit %bx to the 32bit %ebx (etc).)

Yes.

> But that isn't what you need for the 'movzbl' (etc).
> At most you need to pick between 'long' and 'long long'.
> Put the "k" in and you can use 'long long' all the time.

No, putting "k" on the output register of a long long move would
*narrow* the read to 32 bits.

> But you lose at the final step:
>         (x) = (__force __typeof__(*(ptr)))__gu_val;
> all your hard work goes away.

Not really. By using the MOVZBL instruction *instead of* MOVB, we can
use all integer registers for its output. Also, we don't need to
zero-extend through the x_u8__ temporary, because we *know* that the
32-bit (and 64-bit on x86_64) aliases of the original 8-bit register
now hold the zero-extended 8-bit value. (Side note: MOVW also performs
an insert, and the zero-extend trick is currently missing for this case.)
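
Schematically, on a 32-bit target (names invented, 'x' full width):

	/* old: byte-sized output, "=q" restricts it to %al/%bl/%cl/%dl */
	unsigned char tmp;
	asm ("movb %[umem],%[output]" : [output] "=q" (tmp) : [umem] "m" (mem));
	x = tmp;

	/* new: full-width output, any integer register will do */
	asm ("movzbl %[umem],%k[output]" : [output] "=r" (x) : [umem] "m" (mem));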

> If the usage was:
>         val = unsafe_get_user(ptr, err_label);
> you could return the 32bit register containing the byte value.
> If the calling code assigned it to an int, then you'd save the zero extend.
> This means the entire patch is pointless - even if you change the type
> of 'x' to be 32bit for the size 8/16 cases.

Please see the above explanation.

> It also explains why there are no ripple through changes.

There are actually not many reads of 8/16-bit values, but the
changes are evident where these are used.

BR,
Uros.
Re: [PATCH] x86: Optimize __get_user_asm() assembly
Posted by david laight 3 days, 11 hours ago
On Wed, 26 Nov 2025 21:29:22 +0100
Uros Bizjak <ubizjak@gmail.com> wrote:

> On Wed, Nov 26, 2025 at 8:23 PM david laight <david.laight@runbox.com> wrote:
> >
> > On Wed, 26 Nov 2025 17:22:36 +0100
> > Uros Bizjak <ubizjak@gmail.com> wrote:
> >  
> > > On Wed, Nov 26, 2025 at 4:18 PM david laight <david.laight@runbox.com> wrote:  
> > > >
> > > > On Wed, 26 Nov 2025 14:23:32 +0100
> > > > Uros Bizjak <ubizjak@gmail.com> wrote:
> > > >  
> > > > > On x86, byte and word moves (MOVB, MOVW) write only to the low
> > > > > portion of a 32-bit register, leaving the upper bits unchanged.
> > > > > Modern compilers therefore prefer using MOVZBL and MOVZWL to load
> > > > > 8-bit and 16-bit values with zero-extension to the full register
> > > > > width.
> > > > >
> > > > > Update __get_user_asm() to follow this convention by explicitly
> > > > > zero-extending 8-bit and 16-bit loads from memory.
> > > > >
> > > > > An additional benefit of this change is that it enables the full
> > > > > integer register set to be used for 8-bit loads. Also, it
> > > > > eliminates the need for manual zero-extension of 8-bit values.
> > > > >
> > > > > There is only a minimal increase in code size:  
> > > >
> > > > Interesting, where does that come from.
> > > > I'd have thought it should shrink it.
> > > > The mov[sz]x is one byte longer.
> > > > Perhaps a lot of 8-bit values are never zero-extended?  
> > >
> > > bloat-o-meter says:
> > >
> > > add/remove: 0/0 grow/shrink: 4/0 up/down: 16/0 (16)
> > > Function                                     old     new   delta
> > > strncpy_from_kernel_nofault                  151     158      +7
> > > copy_from_kernel_nofault                     207     214      +7
> > > strncpy_from_user                            218     219      +1
> > > fault_in_readable                            150     151      +1
> > > Total: Before=23919018, After=23919034, chg +0.00%
> > >
> > > where the difference in copy_from_kernel_nofault() is expected:
> > >
> > > -       66 8b 45 00             mov    0x0(%rbp),%ax
> > > +       0f b7 45 00             movzwl 0x0(%rbp),%eax
> > > ...
> > > -       8a 45 00                mov    0x0(%rbp),%al
> > > +       0f b6 45 00             movzbl 0x0(%rbp),%eax
> > >
> > > and then some NOPs at the end due to function entry alignment.
> > >  
> > > > I'm trying to remember what the magic 'k' is for.
> > > > Does gas treat %krxx as %dxx ?
> > > > Which would matter if 'x' is 64bit when using "=r" for 32bit reads.
> > > > But, in that case, I think you need it on the 'movl' as well.  
> > >
> > > %k is a gcc operand modifier:
> > >
> > > ‘k’        Print the SImode name of the register.
> > >
> > > that overrides operand width and prints a 32-bit register name.  
...
> > So it does just change %rxx to %exx.  
> 
> Actually it changes e.g. ax to eax, bl to ebx (and rcx to ecx, but we
> don't use this).
> 
> > But the u32 code is much the same - so I doubt you need it at all.  
> 
> Yes, the patch specifically changes 8-bit and 16-bit loads.
> 
> > Looks at the full file....
> > The 'x' parameter is __gu_val from __inttype(*(ptr)) __gu_val;
> > __inttype() is a 'horrid mess' that generates char/short/long/long long.
> >
> > (So the "k"s are actually changing the 16bit %bx to the 32bit %ebx (etc).)
> 
> Yes.
> 
> > But that isn't what you need for the 'movzbl' (etc).
> > At most you need to pick between 'long' and 'long long'.
> > Put the "k" in and you can use 'long long' all the time.  
> 
> No, putting "k" on the output register of a long long move would
> *narrow* the read to 32 bits.

I wasn't suggesting that.
I was suggesting that 'x' could always be u64 provided you put a "k"
in the 32bit load.

> > But you lose at the final step:
> >         (x) = (__force __typeof__(*(ptr)))__gu_val;
> > all your hard work goes away.  
> 
> Not really. By using the MOVZBL instruction *instead of* MOVB, we can
> use all integer registers for its output.

That is only actually relevant in 32bit mode.
64bit mode can access the low 8 bits of 'si' quite happily.

It also makes no difference in 32bit mode.
The register selected depends on the "=r" and the type of the variable
(unsigned char) - so can only be one of %al, %bl, %cl or %dl.
The 'k' modifier makes the asm output contain %eax, %ebx, %ecx or %edx.

> Also, we don't need to
> zero-extend through the x_u8__ temporary, because we *know* that the
> 32-bit (and 64-bit on x86_64) aliases of the original 8-bit register
> now hold the zero-extended 8-bit value. (Side note: MOVW also performs
> an insert, and the zero-extend trick is currently missing for this case.)

There is no zero-extension going on inside this code.
(There might be one that extends the result in the calling code.)
When the 'case 1' code is executed 'x' is unsigned char; the x_u8__
probably exists to make the code valid C when the size isn't 1
- the code is optimised away, but has to be valid enough to compile.

If there was a sign extension (and it went away) it would have been in the
bloat-o-meter output, but that just showed the one extra byte in the
'size 1' branch caused by the longer instruction.

Additionally, although you've only told gcc to generate asm that uses
%eax, the compiler doesn't know what you've done with that register.
All it can look at is the C var - so if the value did need promoting
it would still need to sign/zero extend it.

This is why the only change you see is the extra byte added because
'movzbl' is a longer instruction.

I think you might be able to get the effect you want.
1) In unsafe_get_user() make __gu_val 'unsigned long' except for
   64bit read on 32bit (when it needs to be unsigned long long).
2) In __get_user_size() use movzbl and movzwl for the 8/16 bit reads
   and 'k' for all except the 64bit (on 64bit).
3) Back in unsafe_get_user() only do the cast for signed types.
   (They won't happen often enough to use movsx.)
4) Fixup get_kernel_nofault() so it still works.
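
Something like this, as a rough sketch of 1)-3) (not even compile
tested; 2) happens inside __get_user_size()):

	#define unsafe_get_user(x, ptr, label)				\
	do {								\
		/* 1): always full width (long long for size 8 on 32bit) */ \
		unsigned long __gu_val;					\
		__get_user_size(__gu_val, (ptr), sizeof(*(ptr)), label); \
		/* 3): ideally only cast for signed types */		\
		(x) = (__force __typeof__(*(ptr)))__gu_val;		\
	} while (0)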

In the 'not asm goto' variants, if you initialise the 'err' variable
in the caller, then it doesn't need to be initialised in get_user_size()
and then you only need one copy of get_user_size().
In fact, you can have a single copy of unsafe_get_user() if you pass
both 'err_label' and '__gu_err' through to the asm code.
If __gu_err is never referenced, the final conditional test will be
optimised away.
The '.i' file will increase marginally, but the source will be better.

	David