[PATCHv8 02/17] x86/asm: Introduce inline memcpy and memset

Posted by Kirill A. Shutemov 3 months, 1 week ago
Extract memcpy and memset functions from copy_user_generic() and
__clear_user().

They can be used as inline memcpy and memset instead of the GCC builtins
whenever necessary. LASS requires them to handle text_poke.

Originally-by: Peter Zijlstra <peterz@infradead.org>
Link: https://lore.kernel.org/all/20241029184840.GJ14555@noisy.programming.kicks-ass.net/
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
---
 arch/x86/include/asm/string.h     | 46 +++++++++++++++++++++++++++++++
 arch/x86/include/asm/uaccess_64.h | 38 +++++++------------------
 arch/x86/lib/clear_page_64.S      | 13 +++++++--
 3 files changed, 67 insertions(+), 30 deletions(-)

diff --git a/arch/x86/include/asm/string.h b/arch/x86/include/asm/string.h
index c3c2c1914d65..17f6b5bfa8c1 100644
--- a/arch/x86/include/asm/string.h
+++ b/arch/x86/include/asm/string.h
@@ -1,6 +1,52 @@
 /* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_STRING_H
+#define _ASM_X86_STRING_H
+
+#include <asm/asm.h>
+#include <asm/alternative.h>
+#include <asm/cpufeatures.h>
+
 #ifdef CONFIG_X86_32
 # include <asm/string_32.h>
 #else
 # include <asm/string_64.h>
 #endif
+
+#ifdef CONFIG_X86_64
+#define ALT_64(orig, alt, feat) ALTERNATIVE(orig, alt, feat)
+#else
+#define ALT_64(orig, alt, feat) orig "\n"
+#endif
+
+static __always_inline void *__inline_memcpy(void *to, const void *from, size_t len)
+{
+	void *ret = to;
+
+	asm volatile("1:\n\t"
+		     ALT_64("rep movsb",
+			    "call rep_movs_alternative", ALT_NOT(X86_FEATURE_FSRM))
+		     "2:\n\t"
+		     _ASM_EXTABLE_UA(1b, 2b)
+		     : "+c" (len), "+D" (to), "+S" (from), ASM_CALL_CONSTRAINT
+		     : : "memory", _ASM_AX);
+
+	return ret + len;
+}
+
+static __always_inline void *__inline_memset(void *addr, int v, size_t len)
+{
+	void *ret = addr;
+
+	asm volatile("1:\n\t"
+		     ALT_64("rep stosb",
+			    "call rep_stos_alternative", ALT_NOT(X86_FEATURE_FSRM))
+		     "2:\n\t"
+		     _ASM_EXTABLE_UA(1b, 2b)
+		     : "+c" (len), "+D" (addr), ASM_CALL_CONSTRAINT
+		     : "a" ((uint8_t)v)
+		     : "memory", _ASM_SI, _ASM_DX);
+
+	return ret + len;
+}
+
+#endif /* _ASM_X86_STRING_H */
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index c8a5ae35c871..eb531e13e659 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -13,6 +13,7 @@
 #include <asm/page.h>
 #include <asm/percpu.h>
 #include <asm/runtime-const.h>
+#include <asm/string.h>
 
 /*
  * Virtual variable: there's no actual backing store for this,
@@ -118,21 +119,12 @@ rep_movs_alternative(void *to, const void *from, unsigned len);
 static __always_inline __must_check unsigned long
 copy_user_generic(void *to, const void *from, unsigned long len)
 {
+	void *ret;
+
 	stac();
-	/*
-	 * If CPU has FSRM feature, use 'rep movs'.
-	 * Otherwise, use rep_movs_alternative.
-	 */
-	asm volatile(
-		"1:\n\t"
-		ALTERNATIVE("rep movsb",
-			    "call rep_movs_alternative", ALT_NOT(X86_FEATURE_FSRM))
-		"2:\n"
-		_ASM_EXTABLE_UA(1b, 2b)
-		:"+c" (len), "+D" (to), "+S" (from), ASM_CALL_CONSTRAINT
-		: : "memory", "rax");
+	ret = __inline_memcpy(to, from, len);
 	clac();
-	return len;
+	return ret - to;
 }
 
 static __always_inline __must_check unsigned long
@@ -178,25 +170,15 @@ rep_stos_alternative(void __user *addr, unsigned long len);
 
 static __always_inline __must_check unsigned long __clear_user(void __user *addr, unsigned long size)
 {
+	void *ptr = (__force void *)addr;
+	void *ret;
+
 	might_fault();
 	stac();
-
-	/*
-	 * No memory constraint because it doesn't change any memory gcc
-	 * knows about.
-	 */
-	asm volatile(
-		"1:\n\t"
-		ALTERNATIVE("rep stosb",
-			    "call rep_stos_alternative", ALT_NOT(X86_FEATURE_FSRS))
-		"2:\n"
-	       _ASM_EXTABLE_UA(1b, 2b)
-	       : "+c" (size), "+D" (addr), ASM_CALL_CONSTRAINT
-	       : "a" (0));
-
+	ret = __inline_memset(ptr, 0, size);
 	clac();
 
-	return size;
+	return ret - ptr;
 }
 
 static __always_inline unsigned long clear_user(void __user *to, unsigned long n)
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
index a508e4a8c66a..47b613690f84 100644
--- a/arch/x86/lib/clear_page_64.S
+++ b/arch/x86/lib/clear_page_64.S
@@ -55,17 +55,26 @@ SYM_FUNC_END(clear_page_erms)
 EXPORT_SYMBOL_GPL(clear_page_erms)
 
 /*
- * Default clear user-space.
+ * Default memset.
  * Input:
  * rdi destination
+ * rsi scratch
  * rcx count
- * rax is zero
+ * al is value
  *
  * Output:
  * rcx: uncleared bytes or 0 if successful.
+ * rdx: clobbered
  */
 SYM_FUNC_START(rep_stos_alternative)
 	ANNOTATE_NOENDBR
+
+	movzbq %al, %rsi
+	movabs $0x0101010101010101, %rax
+
+	/* RDX:RAX = RAX * RSI */
+	mulq %rsi
+
 	cmpq $64,%rcx
 	jae .Lunrolled
 
-- 
2.47.2
Re: [PATCHv8 02/17] x86/asm: Introduce inline memcpy and memset
Posted by Dave Hansen 3 months ago
On 7/1/25 02:58, Kirill A. Shutemov wrote:
> Extract memcpy and memset functions from copy_user_generic() and
> __clear_user().
> 
> They can be used as inline memcpy and memset instead of the GCC builtins
> whenever necessary. LASS requires them to handle text_poke.

Why are we messing with the normal user copy functions? Code reuse is
great, but as you're discovering, the user copy code is highly
specialized and not that easy to reuse for other things.

Don't we just need a dirt simple chunk of code that does (logically):

	stac();
	asm("rep stosq...");
	clac();

Performance doesn't matter for text poking, right? It could be stosq or
anything else that you can inline. It could be a for() loop for all I
care as long as the compiler doesn't transform it into some out-of-line
memset. Right?
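
Roughly this, just to illustrate (untested, and fault handling deliberately
left out):

	static __always_inline void text_poke_memset(void *addr, int v, size_t len)
	{
		stac();
		asm volatile("rep stosb"
			     : "+c" (len), "+D" (addr)
			     : "a" (v)
			     : "memory");
		clac();
	}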
Re: [PATCHv8 02/17] x86/asm: Introduce inline memcpy and memset
Posted by David Laight 3 months ago
On Thu, 3 Jul 2025 10:13:44 -0700
Dave Hansen <dave.hansen@intel.com> wrote:

> On 7/1/25 02:58, Kirill A. Shutemov wrote:
> > Extract memcpy and memset functions from copy_user_generic() and
> > __clear_user().
> > 
> > They can be used as inline memcpy and memset instead of the GCC builtins
> > whenever necessary. LASS requires them to handle text_poke.  
> 
> Why are we messing with the normal user copy functions? Code reuse is
> great, but as you're discovering, the user copy code is highly
> specialized and not that easy to reuse for other things.
> 
> Don't we just need a dirt simple chunk of code that does (logically):
> 
> 	stac();
> 	asm("rep stosq...");
> 	clac();
> 
> Performance doesn't matter for text poking, right? It could be stosq or
> anything else that you can inline. It could be a for() loop for all I
> care as long as the compiler doesn't transform it into some out-of-line
> memset. Right?
> 

It doesn't even really matter if there is an out-of-line memset.
All you need to do is 'teach' objtool it isn't a problem.

Is this for the boot-time asm-alternatives?
In that case I wonder why a 'low' address is being used?
With LASS enabled, using a low address on a live kernel would make it
harder for another CPU to leverage the writable code page, but
that isn't a requirement of LASS.

If it is being used for later instruction patching you need the
very careful instruction sequences and cpu synchronisation.
In that case I suspect you need to add conditional stac/clac
to the existing patching code (and teach objtool it is all ok).

	David
Re: [PATCHv8 02/17] x86/asm: Introduce inline memcpy and memset
Posted by Kirill A. Shutemov 3 months ago
On Sun, Jul 06, 2025 at 10:13:42AM +0100, David Laight wrote:
> On Thu, 3 Jul 2025 10:13:44 -0700
> Dave Hansen <dave.hansen@intel.com> wrote:
> 
> > On 7/1/25 02:58, Kirill A. Shutemov wrote:
> > > Extract memcpy and memset functions from copy_user_generic() and
> > > __clear_user().
> > > 
> > > They can be used as inline memcpy and memset instead of the GCC builtins
> > > whenever necessary. LASS requires them to handle text_poke.  
> > 
> > Why are we messing with the normal user copy functions? Code reuse is
> > great, but as you're discovering, the user copy code is highly
> > specialized and not that easy to reuse for other things.
> > 
> > Don't we just need a dirt simple chunk of code that does (logically):
> > 
> > 	stac();
> > 	asm("rep stosq...");
> > 	clac();
> > 
> > Performance doesn't matter for text poking, right? It could be stosq or
> > anything else that you can inline. It could be a for() loop for all I
> > care as long as the compiler doesn't transform it into some out-of-line
> > memset. Right?
> > 
> 
> It doesn't even really matter if there is an out-of-line memset.
> All you need to do is 'teach' objtool it isn't a problem.

PeterZ was not a fan of the idea:

https://lore.kernel.org/all/20241029113611.GS14555@noisy.programming.kicks-ass.net/

> Is this for the boot-time asm-alternatives?

Not only boot-time. static_branches are switchable at runtime.
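
E.g. (made-up names):

	DEFINE_STATIC_KEY_FALSE(my_key);

	/* Enabling the key at runtime rewrites the branch sites via text_poke */
	static_branch_enable(&my_key);

	if (static_branch_unlikely(&my_key))
		do_something();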

> In that case I wonder why a 'low' address is being used?
> With LASS enabled, using a low address on a live kernel would make it
> harder for another CPU to leverage the writable code page, but
> that isn't a requirement of LASS.

Because the kernel side of the address space is shared across all CPUs and
we don't want kernel code to be writable by all CPUs.

> If it is being used for later instruction patching you need the
> very careful instruction sequences and cpu synchronisation.
> In that case I suspect you need to add conditional stac/clac
> to the existing patching code (and teach objtool it is all ok).

STAC/CLAC in text poke is conditional on LASS being present on the machine.
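
I.e. roughly (whatever the exact helpers end up looking like):

	if (cpu_feature_enabled(X86_FEATURE_LASS))
		stac();
	__inline_memcpy(dst, src, len);
	if (cpu_feature_enabled(X86_FEATURE_LASS))
		clac();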

-- 
  Kiryl Shutsemau / Kirill A. Shutemov
Re: [PATCHv8 02/17] x86/asm: Introduce inline memcpy and memset
Posted by David Laight 3 months ago
On Mon, 7 Jul 2025 11:02:06 +0300
"Kirill A. Shutemov" <kirill.shutemov@linux.intel.com> wrote:

> On Sun, Jul 06, 2025 at 10:13:42AM +0100, David Laight wrote:
> > On Thu, 3 Jul 2025 10:13:44 -0700
> > Dave Hansen <dave.hansen@intel.com> wrote:
> >   
> > > On 7/1/25 02:58, Kirill A. Shutemov wrote:  
> > > > Extract memcpy and memset functions from copy_user_generic() and
> > > > __clear_user().
> > > > 
> > > > They can be used as inline memcpy and memset instead of the GCC builtins
> > > > whenever necessary. LASS requires them to handle text_poke.    
> > > 
> > > Why are we messing with the normal user copy functions? Code reuse is
> > > great, but as you're discovering, the user copy code is highly
> > > specialized and not that easy to reuse for other things.
> > > 
> > > Don't we just need a dirt simple chunk of code that does (logically):
> > > 
> > > 	stac();
> > > 	asm("rep stosq...");
> > > 	clac();
> > > 
> > > Performance doesn't matter for text poking, right? It could be stosq or
> > > anything else that you can inline. It could be a for() loop for all I
> > > care as long as the compiler doesn't transform it into some out-of-line
> > > memset. Right?
> > >   
> > 
> > It doesn't even really matter if there is an out-of-line memset.
> > All you need to do is 'teach' objtool it isn't a problem.  
> 
> PeterZ was not a fan of the idea:
> 
> https://lore.kernel.org/all/20241029113611.GS14555@noisy.programming.kicks-ass.net/
> 
> > Is this for the boot-time asm-alternatives?  
> 
> Not only boot-time. static_branches are switchable at runtime.
> 
> > In that case I wonder why a 'low' address is being used?
> > With LASS enabled, using a low address on a live kernel would make it
> > harder for another CPU to leverage the writable code page, but
> > that isn't a requirement of LASS.  
> 
> Because the kernel side of the address space is shared across all CPUs and
> we don't want kernel code to be writable by all CPUs.

So, as I said, it isn't a requirement for LASS.
Just something that LASS lets you do.
Although I'm sure there will be some odd effect of putting a 'supervisor'
page in the middle of 'user' pages.

Isn't there also (something like) kmap_local_page() that updates the local
page tables but doesn't broadcast the change?

> 
> > If it is being used for later instruction patching you need the
> > very careful instruction sequences and cpu synchronisation.
> > In that case I suspect you need to add conditional stac/clac
> > to the existing patching code (and teach objtool it is all ok).  
> 
> STAC/CLAC in text poke is conditional on LASS being present on the machine.

So just change the code to use byte copy loops with a volatile
destination pointer and all will be fine.
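
Something like (untested):

	volatile char *d = dst;

	while (len--)
		*d++ = c;

The volatile store stops the compiler from turning the loop back into a
memset() call.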

	David
Re: [PATCHv8 02/17] x86/asm: Introduce inline memcpy and memset
Posted by Kirill A. Shutemov 3 months ago
On Thu, Jul 03, 2025 at 10:13:44AM -0700, Dave Hansen wrote:
> On 7/1/25 02:58, Kirill A. Shutemov wrote:
> > Extract memcpy and memset functions from copy_user_generic() and
> > __clear_user().
> > 
> > They can be used as inline memcpy and memset instead of the GCC builtins
> > whenever necessary. LASS requires them to handle text_poke.
> 
> Why are we messing with the normal user copy functions? Code reuse is
> great, but as you're discovering, the user copy code is highly
> specialized and not that easy to reuse for other things.
> 
> Don't we just need a dirt simple chunk of code that does (logically):
> 
> 	stac();
> 	asm("rep stosq...");
> 	clac();
> 
> Performance doesn't matter for text poking, right? It could be stosq or
> anything else that you can inline. It could be a for() loop for all I
> care as long as the compiler doesn't transform it into some out-of-line
> memset. Right?

Yeah, performance doesn't matter for text poking. And this approach
simplifies the code quite a bit. I will use direct asm() for text poke.

-- 
  Kiryl Shutsemau / Kirill A. Shutemov
Re: [PATCHv8 02/17] x86/asm: Introduce inline memcpy and memset
Posted by David Laight 3 months ago
On Tue,  1 Jul 2025 12:58:31 +0300
"Kirill A. Shutemov" <kirill.shutemov@linux.intel.com> wrote:

> Extract memcpy and memset functions from copy_user_generic() and
> __clear_user().
> 
> They can be used as inline memcpy and memset instead of the GCC builtins
> whenever necessary. LASS requires them to handle text_poke.

Except they contain the fault handlers so aren't generic calls.

> 
> Originally-by: Peter Zijlstra <peterz@infradead.org>
> Link: https://lore.kernel.org/all/20241029184840.GJ14555@noisy.programming.kicks-ass.net/
> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> ---
>  arch/x86/include/asm/string.h     | 46 +++++++++++++++++++++++++++++++
>  arch/x86/include/asm/uaccess_64.h | 38 +++++++------------------
>  arch/x86/lib/clear_page_64.S      | 13 +++++++--
>  3 files changed, 67 insertions(+), 30 deletions(-)
> 
> diff --git a/arch/x86/include/asm/string.h b/arch/x86/include/asm/string.h
> index c3c2c1914d65..17f6b5bfa8c1 100644
> --- a/arch/x86/include/asm/string.h
> +++ b/arch/x86/include/asm/string.h
> @@ -1,6 +1,52 @@
>  /* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _ASM_X86_STRING_H
> +#define _ASM_X86_STRING_H
> +
> +#include <asm/asm.h>
> +#include <asm/alternative.h>
> +#include <asm/cpufeatures.h>
> +
>  #ifdef CONFIG_X86_32
>  # include <asm/string_32.h>
>  #else
>  # include <asm/string_64.h>
>  #endif
> +
> +#ifdef CONFIG_X86_64
> +#define ALT_64(orig, alt, feat) ALTERNATIVE(orig, alt, feat)
> +#else
> +#define ALT_64(orig, alt, feat) orig "\n"
> +#endif
> +
> +static __always_inline void *__inline_memcpy(void *to, const void *from, size_t len)
> +{
> +	void *ret = to;
> +
> +	asm volatile("1:\n\t"
> +		     ALT_64("rep movsb",
> +			    "call rep_movs_alternative", ALT_NOT(X86_FEATURE_FSRM))
> +		     "2:\n\t"
> +		     _ASM_EXTABLE_UA(1b, 2b)
> +		     : "+c" (len), "+D" (to), "+S" (from), ASM_CALL_CONSTRAINT
> +		     : : "memory", _ASM_AX);
> +
> +	return ret + len;
> +}
> +
> +static __always_inline void *__inline_memset(void *addr, int v, size_t len)
> +{
> +	void *ret = addr;
> +
> +	asm volatile("1:\n\t"
> +		     ALT_64("rep stosb",
> +			    "call rep_stos_alternative", ALT_NOT(X86_FEATURE_FSRM))
> +		     "2:\n\t"
> +		     _ASM_EXTABLE_UA(1b, 2b)
> +		     : "+c" (len), "+D" (addr), ASM_CALL_CONSTRAINT
> +		     : "a" ((uint8_t)v)

You shouldn't need the (uint8_t) cast (should that be (u8) anyway).
At best it doesn't matter, at worst it will add code to mask with 0xff.

> +		     : "memory", _ASM_SI, _ASM_DX);
> +
> +	return ret + len;
> +}
> +
> +#endif /* _ASM_X86_STRING_H */
> diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
> index c8a5ae35c871..eb531e13e659 100644
> --- a/arch/x86/include/asm/uaccess_64.h
> +++ b/arch/x86/include/asm/uaccess_64.h
> @@ -13,6 +13,7 @@
>  #include <asm/page.h>
>  #include <asm/percpu.h>
>  #include <asm/runtime-const.h>
> +#include <asm/string.h>
>  
>  /*
>   * Virtual variable: there's no actual backing store for this,
> @@ -118,21 +119,12 @@ rep_movs_alternative(void *to, const void *from, unsigned len);
>  static __always_inline __must_check unsigned long
>  copy_user_generic(void *to, const void *from, unsigned long len)
>  {
> +	void *ret;
> +
>  	stac();
> -	/*
> -	 * If CPU has FSRM feature, use 'rep movs'.
> -	 * Otherwise, use rep_movs_alternative.
> -	 */
> -	asm volatile(
> -		"1:\n\t"
> -		ALTERNATIVE("rep movsb",
> -			    "call rep_movs_alternative", ALT_NOT(X86_FEATURE_FSRM))
> -		"2:\n"
> -		_ASM_EXTABLE_UA(1b, 2b)
> -		:"+c" (len), "+D" (to), "+S" (from), ASM_CALL_CONSTRAINT
> -		: : "memory", "rax");
> +	ret = __inline_memcpy(to, from, len);
>  	clac();
> -	return len;
> +	return ret - to;
>  }
>  
>  static __always_inline __must_check unsigned long
> @@ -178,25 +170,15 @@ rep_stos_alternative(void __user *addr, unsigned long len);
>  
>  static __always_inline __must_check unsigned long __clear_user(void __user *addr, unsigned long size)
>  {
> +	void *ptr = (__force void *)addr;
> +	void *ret;
> +
>  	might_fault();
>  	stac();
> -
> -	/*
> -	 * No memory constraint because it doesn't change any memory gcc
> -	 * knows about.
> -	 */
> -	asm volatile(
> -		"1:\n\t"
> -		ALTERNATIVE("rep stosb",
> -			    "call rep_stos_alternative", ALT_NOT(X86_FEATURE_FSRS))
> -		"2:\n"
> -	       _ASM_EXTABLE_UA(1b, 2b)
> -	       : "+c" (size), "+D" (addr), ASM_CALL_CONSTRAINT
> -	       : "a" (0));
> -
> +	ret = __inline_memset(ptr, 0, size);
>  	clac();
>  
> -	return size;
> +	return ret - ptr;
>  }
>  
>  static __always_inline unsigned long clear_user(void __user *to, unsigned long n)
> diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
> index a508e4a8c66a..47b613690f84 100644
> --- a/arch/x86/lib/clear_page_64.S
> +++ b/arch/x86/lib/clear_page_64.S
> @@ -55,17 +55,26 @@ SYM_FUNC_END(clear_page_erms)
>  EXPORT_SYMBOL_GPL(clear_page_erms)
>  
>  /*
> - * Default clear user-space.
> + * Default memset.
>   * Input:
>   * rdi destination
> + * rsi scratch
>   * rcx count
> - * rax is zero
> + * al is value
>   *
>   * Output:
>   * rcx: uncleared bytes or 0 if successful.
> + * rdx: clobbered
>   */
>  SYM_FUNC_START(rep_stos_alternative)
>  	ANNOTATE_NOENDBR
> +
> +	movzbq %al, %rsi
> +	movabs $0x0101010101010101, %rax
> +
> +	/* RDX:RAX = RAX * RSI */
> +	mulq %rsi

NAK - you can't do that here.
Neither %rsi nor %rdx can be trashed.
The function has a very explicit calling convention.

It is also almost certainly a waste of time.
Pretty much all the calls will be for a constant 0x00.
Rename it all memzero() ...

	David

> +
>  	cmpq $64,%rcx
>  	jae .Lunrolled
>
Re: [PATCHv8 02/17] x86/asm: Introduce inline memcpy and memset
Posted by Kirill A. Shutemov 3 months ago
On Thu, Jul 03, 2025 at 09:44:17AM +0100, David Laight wrote:
> On Tue,  1 Jul 2025 12:58:31 +0300
> "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com> wrote:
> 
> > Extract memcpy and memset functions from copy_user_generic() and
> > __clear_user().
> > 
> > They can be used as inline memcpy and memset instead of the GCC builtins
> > whenever necessary. LASS requires them to handle text_poke.
> 
> Except they contain the fault handlers so aren't generic calls.

That's true. I will add a comment to clarify it.

> > Originally-by: Peter Zijlstra <peterz@infradead.org>
> > Link: https://lore.kernel.org/all/20241029184840.GJ14555@noisy.programming.kicks-ass.net/
> > Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> > ---
> >  arch/x86/include/asm/string.h     | 46 +++++++++++++++++++++++++++++++
> >  arch/x86/include/asm/uaccess_64.h | 38 +++++++------------------
> >  arch/x86/lib/clear_page_64.S      | 13 +++++++--
> >  3 files changed, 67 insertions(+), 30 deletions(-)
> > 
> > diff --git a/arch/x86/include/asm/string.h b/arch/x86/include/asm/string.h
> > index c3c2c1914d65..17f6b5bfa8c1 100644
> > --- a/arch/x86/include/asm/string.h
> > +++ b/arch/x86/include/asm/string.h
> > @@ -1,6 +1,52 @@
> >  /* SPDX-License-Identifier: GPL-2.0 */
> > +#ifndef _ASM_X86_STRING_H
> > +#define _ASM_X86_STRING_H
> > +
> > +#include <asm/asm.h>
> > +#include <asm/alternative.h>
> > +#include <asm/cpufeatures.h>
> > +
> >  #ifdef CONFIG_X86_32
> >  # include <asm/string_32.h>
> >  #else
> >  # include <asm/string_64.h>
> >  #endif
> > +
> > +#ifdef CONFIG_X86_64
> > +#define ALT_64(orig, alt, feat) ALTERNATIVE(orig, alt, feat)
> > +#else
> > +#define ALT_64(orig, alt, feat) orig "\n"
> > +#endif
> > +
> > +static __always_inline void *__inline_memcpy(void *to, const void *from, size_t len)
> > +{
> > +	void *ret = to;
> > +
> > +	asm volatile("1:\n\t"
> > +		     ALT_64("rep movsb",
> > +			    "call rep_movs_alternative", ALT_NOT(X86_FEATURE_FSRM))
> > +		     "2:\n\t"
> > +		     _ASM_EXTABLE_UA(1b, 2b)
> > +		     : "+c" (len), "+D" (to), "+S" (from), ASM_CALL_CONSTRAINT
> > +		     : : "memory", _ASM_AX);
> > +
> > +	return ret + len;
> > +}
> > +
> > +static __always_inline void *__inline_memset(void *addr, int v, size_t len)
> > +{
> > +	void *ret = addr;
> > +
> > +	asm volatile("1:\n\t"
> > +		     ALT_64("rep stosb",
> > +			    "call rep_stos_alternative", ALT_NOT(X86_FEATURE_FSRM))
> > +		     "2:\n\t"
> > +		     _ASM_EXTABLE_UA(1b, 2b)
> > +		     : "+c" (len), "+D" (addr), ASM_CALL_CONSTRAINT
> > +		     : "a" ((uint8_t)v)
> 
> You shouldn't need the (uint8_t) cast (should that be (u8) anyway).
> At best it doesn't matter, at worst it will add code to mask with 0xff.

Right, will drop.

> > +		     : "memory", _ASM_SI, _ASM_DX);
> > +
> > +	return ret + len;
> > +}
> > +
> > +#endif /* _ASM_X86_STRING_H */
> > diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
> > index c8a5ae35c871..eb531e13e659 100644
> > --- a/arch/x86/include/asm/uaccess_64.h
> > +++ b/arch/x86/include/asm/uaccess_64.h
> > @@ -13,6 +13,7 @@
> >  #include <asm/page.h>
> >  #include <asm/percpu.h>
> >  #include <asm/runtime-const.h>
> > +#include <asm/string.h>
> >  
> >  /*
> >   * Virtual variable: there's no actual backing store for this,
> > @@ -118,21 +119,12 @@ rep_movs_alternative(void *to, const void *from, unsigned len);
> >  static __always_inline __must_check unsigned long
> >  copy_user_generic(void *to, const void *from, unsigned long len)
> >  {
> > +	void *ret;
> > +
> >  	stac();
> > -	/*
> > -	 * If CPU has FSRM feature, use 'rep movs'.
> > -	 * Otherwise, use rep_movs_alternative.
> > -	 */
> > -	asm volatile(
> > -		"1:\n\t"
> > -		ALTERNATIVE("rep movsb",
> > -			    "call rep_movs_alternative", ALT_NOT(X86_FEATURE_FSRM))
> > -		"2:\n"
> > -		_ASM_EXTABLE_UA(1b, 2b)
> > -		:"+c" (len), "+D" (to), "+S" (from), ASM_CALL_CONSTRAINT
> > -		: : "memory", "rax");
> > +	ret = __inline_memcpy(to, from, len);
> >  	clac();
> > -	return len;
> > +	return ret - to;
> >  }
> >  
> >  static __always_inline __must_check unsigned long
> > @@ -178,25 +170,15 @@ rep_stos_alternative(void __user *addr, unsigned long len);
> >  
> >  static __always_inline __must_check unsigned long __clear_user(void __user *addr, unsigned long size)
> >  {
> > +	void *ptr = (__force void *)addr;
> > +	void *ret;
> > +
> >  	might_fault();
> >  	stac();
> > -
> > -	/*
> > -	 * No memory constraint because it doesn't change any memory gcc
> > -	 * knows about.
> > -	 */
> > -	asm volatile(
> > -		"1:\n\t"
> > -		ALTERNATIVE("rep stosb",
> > -			    "call rep_stos_alternative", ALT_NOT(X86_FEATURE_FSRS))
> > -		"2:\n"
> > -	       _ASM_EXTABLE_UA(1b, 2b)
> > -	       : "+c" (size), "+D" (addr), ASM_CALL_CONSTRAINT
> > -	       : "a" (0));
> > -
> > +	ret = __inline_memset(ptr, 0, size);
> >  	clac();
> >  
> > -	return size;
> > +	return ret - ptr;
> >  }
> >  
> >  static __always_inline unsigned long clear_user(void __user *to, unsigned long n)
> > diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
> > index a508e4a8c66a..47b613690f84 100644
> > --- a/arch/x86/lib/clear_page_64.S
> > +++ b/arch/x86/lib/clear_page_64.S
> > @@ -55,17 +55,26 @@ SYM_FUNC_END(clear_page_erms)
> >  EXPORT_SYMBOL_GPL(clear_page_erms)
> >  
> >  /*
> > - * Default clear user-space.
> > + * Default memset.
> >   * Input:
> >   * rdi destination
> > + * rsi scratch
> >   * rcx count
> > - * rax is zero
> > + * al is value
> >   *
> >   * Output:
> >   * rcx: uncleared bytes or 0 if successful.
> > + * rdx: clobbered
> >   */
> >  SYM_FUNC_START(rep_stos_alternative)
> >  	ANNOTATE_NOENDBR
> > +
> > +	movzbq %al, %rsi
> > +	movabs $0x0101010101010101, %rax
> > +
> > +	/* RDX:RAX = RAX * RSI */
> > +	mulq %rsi
> 
> NAK - you can't do that here.
> Neither %rsi nor %rdx can be trashed.
> The function has a very explicit calling convention.

What calling convention? We change the only caller to conform to this.

> It is also almost certainly a waste of time.
> Pretty much all the calls will be for a constant 0x00.
> Rename it all memzero() ...

text_poke_memset() is not limited to zeroing.

-- 
  Kiryl Shutsemau / Kirill A. Shutemov
Re: [PATCHv8 02/17] x86/asm: Introduce inline memcpy and memset
Posted by David Laight 3 months ago
On Thu, 3 Jul 2025 13:39:57 +0300
"Kirill A. Shutemov" <kirill.shutemov@linux.intel.com> wrote:

> On Thu, Jul 03, 2025 at 09:44:17AM +0100, David Laight wrote:
> > On Tue,  1 Jul 2025 12:58:31 +0300
> > "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com> wrote:
> >   
> > > Extract memcpy and memset functions from copy_user_generic() and
> > > __clear_user().
> > > 
> > > They can be used as inline memcpy and memset instead of the GCC builtins
> > > whenever necessary. LASS requires them to handle text_poke.  
> > 
> > Except they contain the fault handlers so aren't generic calls.  
> 
> That's true. I will add a comment to clarify it.

They need renaming.

...
> > > diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
> > > index a508e4a8c66a..47b613690f84 100644
> > > --- a/arch/x86/lib/clear_page_64.S
> > > +++ b/arch/x86/lib/clear_page_64.S
> > > @@ -55,17 +55,26 @@ SYM_FUNC_END(clear_page_erms)
> > >  EXPORT_SYMBOL_GPL(clear_page_erms)
> > >  
> > >  /*
> > > - * Default clear user-space.
> > > + * Default memset.
> > >   * Input:
> > >   * rdi destination
> > > + * rsi scratch
> > >   * rcx count
> > > - * rax is zero
> > > + * al is value
> > >   *
> > >   * Output:
> > >   * rcx: uncleared bytes or 0 if successful.
> > > + * rdx: clobbered
> > >   */
> > >  SYM_FUNC_START(rep_stos_alternative)
> > >  	ANNOTATE_NOENDBR
> > > +
> > > +	movzbq %al, %rsi
> > > +	movabs $0x0101010101010101, %rax
> > > +
> > > +	/* RDX:RAX = RAX * RSI */
> > > +	mulq %rsi  
> > 
> > NAK - you can't do that here.
> > Neither %rsi nor %rdx can be trashed.
> > The function has a very explicit calling convention.  
> 
> What calling convention? We change the only caller to conform to this.

The one that is implicit in:

> > > +	asm volatile("1:\n\t"
> > > +		     ALT_64("rep stosb",
> > > +			    "call rep_stos_alternative", ALT_NOT(X86_FEATURE_FSRM))
> > > +		     "2:\n\t"
> > > +		     _ASM_EXTABLE_UA(1b, 2b)
> > > +		     : "+c" (len), "+D" (addr), ASM_CALL_CONSTRAINT
> > > +		     : "a" ((uint8_t)v)

The called function is only allowed to change the registers that
'rep stosb' uses - except it can access (but not change)
all of %rax - not just %al.

See: https://godbolt.org/z/3fnrT3x9r
In particular note that 'do_mset' must not change %rax.

This is very specific and is done so that the compiler can use
all the registers.
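
In other words the implied contract (with do_mset standing in for
rep_stos_alternative, and with no extra clobbers listed) is:

	asm volatile("call do_mset"
		     : "+c" (len), "+D" (dst), ASM_CALL_CONSTRAINT
		     : "a" (val)
		     : "memory");

i.e. the callee may modify %rcx, %rdi and the flags, may read but must
not write %rax, and has to preserve everything else, %rsi and %rdx
included.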

> > It is also almost certainly a waste of time.
> > Pretty much all the calls will be for a constant 0x00.
> > Rename it all memzero() ...  
> 
> text_poke_memset() is not limited to zeroing.

But you don't want the overhead of extending the constant
on all the calls - never mind reserving %rdx to do it.
Maybe define a function that requires the caller to have
done the 'dirty work' - so any code that wants memzero()
just passes zero.
Or do the multiply in the C code where it will get optimised
away for constant zero.
You do get the multiply for the 'rep stosb' case - but that
is always going to be true unless you complicate things further.  
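
E.g. the C side could just do (untested):

	unsigned long long val = (unsigned char)v * 0x0101010101010101ull;

which the compiler folds away completely when v is a constant 0.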

	David
Re: [PATCHv8 02/17] x86/asm: Introduce inline memcpy and memset
Posted by Kirill A. Shutemov 3 months ago
On Thu, Jul 03, 2025 at 01:15:52PM +0100, David Laight wrote:
> On Thu, 3 Jul 2025 13:39:57 +0300
> "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com> wrote:
> 
> > On Thu, Jul 03, 2025 at 09:44:17AM +0100, David Laight wrote:
> > > On Tue,  1 Jul 2025 12:58:31 +0300
> > > "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com> wrote:
> > >   
> > > > Extract memcpy and memset functions from copy_user_generic() and
> > > > __clear_user().
> > > > 
> > > > They can be used as inline memcpy and memset instead of the GCC builtins
> > > > whenever necessary. LASS requires them to handle text_poke.  
> > > 
> > > Except they contain the fault handlers so aren't generic calls.  
> > 
> > That's true. I will add a comment to clarify it.
> 
> They need renaming.

__inline_memcpy/memset_safe()?

> ...
> > > > diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
> > > > index a508e4a8c66a..47b613690f84 100644
> > > > --- a/arch/x86/lib/clear_page_64.S
> > > > +++ b/arch/x86/lib/clear_page_64.S
> > > > @@ -55,17 +55,26 @@ SYM_FUNC_END(clear_page_erms)
> > > >  EXPORT_SYMBOL_GPL(clear_page_erms)
> > > >  
> > > >  /*
> > > > - * Default clear user-space.
> > > > + * Default memset.
> > > >   * Input:
> > > >   * rdi destination
> > > > + * rsi scratch
> > > >   * rcx count
> > > > - * rax is zero
> > > > + * al is value
> > > >   *
> > > >   * Output:
> > > >   * rcx: uncleared bytes or 0 if successful.
> > > > + * rdx: clobbered
> > > >   */
> > > >  SYM_FUNC_START(rep_stos_alternative)
> > > >  	ANNOTATE_NOENDBR
> > > > +
> > > > +	movzbq %al, %rsi
> > > > +	movabs $0x0101010101010101, %rax
> > > > +
> > > > +	/* RDX:RAX = RAX * RSI */
> > > > +	mulq %rsi  
> > > 
> > > NAK - you can't do that here.
> > > Neither %rsi nor %rdx can be trashed.
> > > The function has a very explicit calling convention.  
> > 
> > What calling convention? We change the only caller to conform to this.  
> 
> The one that is implicit in:
> 
> > > > +	asm volatile("1:\n\t"
> > > > +		     ALT_64("rep stosb",
> > > > +			    "call rep_stos_alternative", ALT_NOT(X86_FEATURE_FSRM))
> > > > +		     "2:\n\t"
> > > > +		     _ASM_EXTABLE_UA(1b, 2b)
> > > > +		     : "+c" (len), "+D" (addr), ASM_CALL_CONSTRAINT
> > > > +		     : "a" ((uint8_t)v)
> 
> The called function is only allowed to change the registers that
> 'rep stosb' uses - except it can access (but not change)
> all of %rax - not just %al.
> 
> See: https://godbolt.org/z/3fnrT3x9r
> In particular note that 'do_mset' must not change %rax.
> 
> This is very specific and is done so that the compiler can use
> all the registers.

Okay, I see what you are saying.

> > > It is also almost certainly a waste of time.
> > > Pretty much all the calls will be for a constant 0x00.
> > > Rename it all memzero() ...  
> > 
> > text_poke_memset() is not limited to zeroing.
> 
> But you don't want the overhead of extending the constant
> on all the calls - never mind reserving %rdx to do it.
> Maybe define a function that requires the caller to have
> done the 'dirty work' - so any code that wants memzero()
> just passes zero.
> Or do the multiply in the C code where it will get optimised
> away for constant zero.
> You do get the multiply for the 'rep stosb' case - but that
> is always going to be true unless you complicate things further.  

The patch below seems to do the trick: the compiler optimizes out the
multiplication for v == 0.

It would be nice to avoid it for X86_FEATURE_FSRM, but we cannot use
cpu_feature_enabled() here as <asm/cpufeature.h> depends on
<asm/string.h>.

I cannot say I like the result.

Any suggestions?

diff --git a/arch/x86/include/asm/string.h b/arch/x86/include/asm/string.h
index becb9ee3bc8a..c7644a6f426b 100644
--- a/arch/x86/include/asm/string.h
+++ b/arch/x86/include/asm/string.h
@@ -35,16 +35,27 @@ static __always_inline void *__inline_memcpy(void *to, const void *from, size_t
 
 static __always_inline void *__inline_memset(void *addr, int v, size_t len)
 {
+	unsigned long val = v;
 	void *ret = addr;
 
+	if (IS_ENABLED(CONFIG_X86_64)) {
+		/*
+		 * Fill all bytes with the value in byte 0.
+		 *
+		 * To be used in rep_stos_alternative().
+		 */
+		val &= 0xff;
+		val *= 0x0101010101010101;
+	}
+
 	asm volatile("1:\n\t"
 		     ALT_64("rep stosb",
 			    "call rep_stos_alternative", ALT_NOT(X86_FEATURE_FSRM))
 		     "2:\n\t"
 		     _ASM_EXTABLE_UA(1b, 2b)
 		     : "+c" (len), "+D" (addr), ASM_CALL_CONSTRAINT
-		     : "a" (v)
-		     : "memory", _ASM_SI, _ASM_DX);
+		     : "a" (val)
+		     : "memory");
 
 	return ret + len;
 }
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
index 47b613690f84..3ef7d796deb3 100644
--- a/arch/x86/lib/clear_page_64.S
+++ b/arch/x86/lib/clear_page_64.S
@@ -58,23 +58,15 @@ EXPORT_SYMBOL_GPL(clear_page_erms)
  * Default memset.
  * Input:
  * rdi destination
- * rsi scratch
  * rcx count
  * al is value
  *
  * Output:
  * rcx: uncleared bytes or 0 if successful.
- * rdx: clobbered
  */
 SYM_FUNC_START(rep_stos_alternative)
 	ANNOTATE_NOENDBR
 
-	movzbq %al, %rsi
-	movabs $0x0101010101010101, %rax
-
-	/* RDX:RAX = RAX * RSI */
-	mulq %rsi
-
 	cmpq $64,%rcx
 	jae .Lunrolled
 
-- 
  Kiryl Shutsemau / Kirill A. Shutemov
Re: [PATCHv8 02/17] x86/asm: Introduce inline memcpy and memset
Posted by David Laight 3 months ago
On Thu, 3 Jul 2025 17:10:34 +0300
"Kirill A. Shutemov" <kirill.shutemov@linux.intel.com> wrote:

> On Thu, Jul 03, 2025 at 01:15:52PM +0100, David Laight wrote:
> > On Thu, 3 Jul 2025 13:39:57 +0300
> > "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com> wrote:
> >   
> > > On Thu, Jul 03, 2025 at 09:44:17AM +0100, David Laight wrote:  
> > > > On Tue,  1 Jul 2025 12:58:31 +0300
> > > > "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com> wrote:
> > > >     
> > > > > Extract memcpy and memset functions from copy_user_generic() and
> > > > > __clear_user().
> > > > > 
> > > > > They can be used as inline memcpy and memset instead of the GCC builtins
> > > > > whenever necessary. LASS requires them to handle text_poke.    
> > > > 
> > > > Except they contain the fault handlers so aren't generic calls.    
> > > 
> > > That's true. I will add a comment to clarify it.  
> > 
> > They need renaming.  
> 
> __inline_memcpy/memset_safe()?

'safe' against what :-)
They can't be used for user accesses without access_ok() and stac()/clac().
The get/put_user variants without access_ok() have _unsafe() suffix.

> 
> > ...  
> > > > > diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
> > > > > index a508e4a8c66a..47b613690f84 100644
> > > > > --- a/arch/x86/lib/clear_page_64.S
> > > > > +++ b/arch/x86/lib/clear_page_64.S
> > > > > @@ -55,17 +55,26 @@ SYM_FUNC_END(clear_page_erms)
> > > > >  EXPORT_SYMBOL_GPL(clear_page_erms)
> > > > >  
> > > > >  /*
> > > > > - * Default clear user-space.
> > > > > + * Default memset.
> > > > >   * Input:
> > > > >   * rdi destination
> > > > > + * rsi scratch
> > > > >   * rcx count
> > > > > - * rax is zero
> > > > > + * al is value
> > > > >   *
> > > > >   * Output:
> > > > >   * rcx: uncleared bytes or 0 if successful.
> > > > > + * rdx: clobbered
> > > > >   */
> > > > >  SYM_FUNC_START(rep_stos_alternative)
> > > > >  	ANNOTATE_NOENDBR
> > > > > +
> > > > > +	movzbq %al, %rsi
> > > > > +	movabs $0x0101010101010101, %rax
> > > > > +
> > > > > +	/* RDX:RAX = RAX * RSI */
> > > > > +	mulq %rsi    
> > > > 
> > > > NAK - you can't do that here.
> > > > Neither %rsi nor %rdx can be trashed.
> > > > The function has a very explicit calling convention.    
> > > 
> > > What calling convention? We change the only caller to conform to this.  
> > 
> > The one that is implicit in:
> >   
> > > > > +	asm volatile("1:\n\t"
> > > > > +		     ALT_64("rep stosb",
> > > > > +			    "call rep_stos_alternative", ALT_NOT(X86_FEATURE_FSRM))
> > > > > +		     "2:\n\t"
> > > > > +		     _ASM_EXTABLE_UA(1b, 2b)
> > > > > +		     : "+c" (len), "+D" (addr), ASM_CALL_CONSTRAINT
> > > > > +		     : "a" ((uint8_t)v)  
> > 
> > The called function is only allowed to change the registers that
> > 'rep stosb' uses - except it can access (but not change)
> > all of %rax - not just %al.
> > 
> > See: https://godbolt.org/z/3fnrT3x9r
> > In particular note that 'do_mset' must not change %rax.
> > 
> > This is very specific and is done so that the compiler can use
> > all the registers.  
> 
> Okay, I see what you are saying.
> 
> > > > It is also almost certainly a waste of time.
> > > > Pretty much all the calls will be for a constant 0x00.
> > > > Rename it all memzero() ...    
> > > 
> > > text_poke_memset() is not limited to zeroing.  
> > 
> > But you don't want the overhead of extending the constant
> > on all the calls - never mind reserving %rdx to do it.
> > Maybe define a function that requires the caller to have
> > done the 'dirty work' - so any code that wants memzero()
> > just passes zero.
> > Or do the multiply in the C code where it will get optimised
> > away for constant zero.
> > You do get the multiply for the 'rep stosb' case - but that
> > is always going to be true unless you complicate things further.    
> 
> The patch below seems to do the trick: the compiler optimizes out the
> multiplication for v == 0.
> 
> It would be nice to avoid it for X86_FEATURE_FSRM, but we cannot use
> cpu_feature_enabled() here as <asm/cpufeature.h> depends on
> <asm/string.h>.
> 
> I cannot say I like the result.
> 
> Any suggestions?
> 
> diff --git a/arch/x86/include/asm/string.h b/arch/x86/include/asm/string.h
> index becb9ee3bc8a..c7644a6f426b 100644
> --- a/arch/x86/include/asm/string.h
> +++ b/arch/x86/include/asm/string.h
> @@ -35,16 +35,27 @@ static __always_inline void *__inline_memcpy(void *to, const void *from, size_t
>  
>  static __always_inline void *__inline_memset(void *addr, int v, size_t len)
>  {
> +	unsigned long val = v;
>  	void *ret = addr;
>  
> +	if (IS_ENABLED(CONFIG_X86_64)) {
> +		/*
> +		 * Fill all bytes with the value in byte 0.
> +		 *
> +		 * To be used in rep_stos_alternative().
> +		 */
> +		val &= 0xff;
> +		val *= 0x0101010101010101;
> +	}

That won't compile for 32bit, and it needs the same thing done.
	val *= (unsigned long)0x0101010101010101ull;
should work.
I don't think you need the 'val &= 0xff', just rely on the caller
passing a valid value - nothing will break badly if it doesn't.

	David

> +
>  	asm volatile("1:\n\t"
>  		     ALT_64("rep stosb",
>  			    "call rep_stos_alternative", ALT_NOT(X86_FEATURE_FSRM))
>  		     "2:\n\t"
>  		     _ASM_EXTABLE_UA(1b, 2b)
>  		     : "+c" (len), "+D" (addr), ASM_CALL_CONSTRAINT
> -		     : "a" (v)
> -		     : "memory", _ASM_SI, _ASM_DX);
> +		     : "a" (val)
> +		     : "memory");
>  
>  	return ret + len;
>  }
> diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
> index 47b613690f84..3ef7d796deb3 100644
> --- a/arch/x86/lib/clear_page_64.S
> +++ b/arch/x86/lib/clear_page_64.S
> @@ -58,23 +58,15 @@ EXPORT_SYMBOL_GPL(clear_page_erms)
>   * Default memset.
>   * Input:
>   * rdi destination
> - * rsi scratch
>   * rcx count
>   * al is value
>   *
>   * Output:
>   * rcx: uncleared bytes or 0 if successful.
> - * rdx: clobbered
>   */
>  SYM_FUNC_START(rep_stos_alternative)
>  	ANNOTATE_NOENDBR
>  
> -	movzbq %al, %rsi
> -	movabs $0x0101010101010101, %rax
> -
> -	/* RDX:RAX = RAX * RSI */
> -	mulq %rsi
> -
>  	cmpq $64,%rcx
>  	jae .Lunrolled
>
Re: [PATCHv8 02/17] x86/asm: Introduce inline memcpy and memset
Posted by Vegard Nossum 3 months ago
On 03/07/2025 14:15, David Laight wrote:
> On Thu, 3 Jul 2025 13:39:57 +0300
> "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com> wrote:
>> On Thu, Jul 03, 2025 at 09:44:17AM +0100, David Laight wrote:
>>> On Tue,  1 Jul 2025 12:58:31 +0300
>>> "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com> wrote:
>>>> diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
>>>> index a508e4a8c66a..47b613690f84 100644
>>>> --- a/arch/x86/lib/clear_page_64.S
>>>> +++ b/arch/x86/lib/clear_page_64.S
>>>> @@ -55,17 +55,26 @@ SYM_FUNC_END(clear_page_erms)
>>>>   EXPORT_SYMBOL_GPL(clear_page_erms)
>>>>   
>>>>   /*
>>>> - * Default clear user-space.
>>>> + * Default memset.
>>>>    * Input:
>>>>    * rdi destination
>>>> + * rsi scratch
>>>>    * rcx count
>>>> - * rax is zero
>>>> + * al is value
>>>>    *
>>>>    * Output:
>>>>    * rcx: uncleared bytes or 0 if successful.
>>>> + * rdx: clobbered
>>>>    */
>>>>   SYM_FUNC_START(rep_stos_alternative)
>>>>   	ANNOTATE_NOENDBR
>>>> +
>>>> +	movzbq %al, %rsi
>>>> +	movabs $0x0101010101010101, %rax
>>>> +
>>>> +	/* RDX:RAX = RAX * RSI */
>>>> +	mulq %rsi
>>>
>>> NAK - you can't do that here.
>>> Neither %rsi nor %rdx can be trashed.
>>> The function has a very explicit calling convention.

That's why we have the clobbers... see below

>> What calling convention? We change the only caller to conform to this.
> 
> The one that is implicit in:
> 
>>>> +	asm volatile("1:\n\t"
>>>> +		     ALT_64("rep stosb",
>>>> +			    "call rep_stos_alternative", ALT_NOT(X86_FEATURE_FSRM))
>>>> +		     "2:\n\t"
>>>> +		     _ASM_EXTABLE_UA(1b, 2b)
>>>> +		     : "+c" (len), "+D" (addr), ASM_CALL_CONSTRAINT
>>>> +		     : "a" ((uint8_t)v)
> 
> The called function is only allowed to change the registers that
> 'rep stosb' uses - except it can access (but not change)
> all of %rax - not just %al.
> 
> See: https://godbolt.org/z/3fnrT3x9r
> In particular note that 'do_mset' must not change %rax.
> 
> This is very specific and is done so that the compiler can use
> all the registers.

I feel like you trimmed off the clobbers from the asm() in the context
above. For reference, it is:

+		     : "memory", _ASM_SI, _ASM_DX);

I'm not saying this can't be optimized, but that doesn't seem to be your
complaint -- you say "the called function is only allowed to change
...", but this is not true when we have the clobbers, right?

This is exactly what I fixed with my v7 fixlet to this patch:

https://lore.kernel.org/all/1b96b0ca-5c14-4271-86c1-c305bf052b16@oracle.com/


Vegard
Re: [PATCHv8 02/17] x86/asm: Introduce inline memcpy and memset
Posted by David Laight 3 months ago
On Thu, 3 Jul 2025 15:33:16 +0200
Vegard Nossum <vegard.nossum@oracle.com> wrote:

> On 03/07/2025 14:15, David Laight wrote:
> > On Thu, 3 Jul 2025 13:39:57 +0300
> > "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com> wrote:  
> >> On Thu, Jul 03, 2025 at 09:44:17AM +0100, David Laight wrote:  
> >>> On Tue,  1 Jul 2025 12:58:31 +0300
> >>> "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com> wrote:  
> >>>> diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
> >>>> index a508e4a8c66a..47b613690f84 100644
> >>>> --- a/arch/x86/lib/clear_page_64.S
> >>>> +++ b/arch/x86/lib/clear_page_64.S
> >>>> @@ -55,17 +55,26 @@ SYM_FUNC_END(clear_page_erms)
> >>>>   EXPORT_SYMBOL_GPL(clear_page_erms)
> >>>>   
> >>>>   /*
> >>>> - * Default clear user-space.
> >>>> + * Default memset.
> >>>>    * Input:
> >>>>    * rdi destination
> >>>> + * rsi scratch
> >>>>    * rcx count
> >>>> - * rax is zero
> >>>> + * al is value
> >>>>    *
> >>>>    * Output:
> >>>>    * rcx: uncleared bytes or 0 if successful.
> >>>> + * rdx: clobbered
> >>>>    */
> >>>>   SYM_FUNC_START(rep_stos_alternative)
> >>>>   	ANNOTATE_NOENDBR
> >>>> +
> >>>> +	movzbq %al, %rsi
> >>>> +	movabs $0x0101010101010101, %rax
> >>>> +
> >>>> +	/* RDX:RAX = RAX * RSI */
> >>>> +	mulq %rsi  
> >>>
> >>> NAK - you can't do that here.
> >>> Neither %rsi nor %rdx can be trashed.
> >>> The function has a very explicit calling convention.  
> 
> That's why we have the clobbers... see below
> 
> >> What calling convention? We change the only caller to confirm to this.  
> > 
> > The one that is implicit in:
> >   
> >>>> +	asm volatile("1:\n\t"
> >>>> +		     ALT_64("rep stosb",
> >>>> +			    "call rep_stos_alternative", ALT_NOT(X86_FEATURE_FSRM))
> >>>> +		     "2:\n\t"
> >>>> +		     _ASM_EXTABLE_UA(1b, 2b)
> >>>> +		     : "+c" (len), "+D" (addr), ASM_CALL_CONSTRAINT
> >>>> +		     : "a" ((uint8_t)v)  
> > 
> > The called function is only allowed to change the registers that
> > 'rep stosb' uses - except it can access (but not change)
> > all of %rax - not just %al.
> > 
> > See: https://godbolt.org/z/3fnrT3x9r
> > In particular note that 'do_mset' must not change %rax.
> > 
> > This is very specific and is done so that the compiler can use
> > all the registers.  
> 
> I feel like you trimmed off the clobbers from the asm() in the context
> above. For reference, it is:
> 
> +		     : "memory", _ASM_SI, _ASM_DX);

I'm sure they weren't there...

Enough clobbers will 'un-break' it - but that isn't the point.
Linus will reject the patch if he reads it.
The whole point of the function is that it is as direct a replacement
for 'rep stos/movsb' as possible.

> 
> I'm not saying this can't be optimized, but that doesn't seem to be your
> complaint -- you say "the called function is only allowed to change
> ...", but this is not true when we have the clobbers, right?

You can't change %rax either - not without a clobber.

Oh, and even with your version you only have clobbers for %rax and %rdx.
There is no need to use both %rsi and %rdx.

The performance is a different problem.
And the extra clobbers are likely to matter.
x86 really doesn't have many registers.

	David

> 
> This is exactly what I fixed with my v7 fixlet to this patch:
> 
> https://lore.kernel.org/all/1b96b0ca-5c14-4271-86c1-c305bf052b16@oracle.com/
> 
> 
> Vegard