Use inout "+" constraint modifier where appropriate, declare
temporary variables as unsigned long and rewrite parts of assembly
in plain C. The memcpy() function shrinks by 10 bytes, from:
00e778d0 <memcpy>:
e778d0: 55 push %ebp
e778d1: 89 e5 mov %esp,%ebp
e778d3: 83 ec 0c sub $0xc,%esp
e778d6: 89 5d f4 mov %ebx,-0xc(%ebp)
e778d9: 89 c3 mov %eax,%ebx
e778db: 89 c8 mov %ecx,%eax
e778dd: 89 75 f8 mov %esi,-0x8(%ebp)
e778e0: c1 e9 02 shr $0x2,%ecx
e778e3: 89 d6 mov %edx,%esi
e778e5: 89 7d fc mov %edi,-0x4(%ebp)
e778e8: 89 df mov %ebx,%edi
e778ea: f3 a5 rep movsl %ds:(%esi),%es:(%edi)
e778ec: 89 c1 mov %eax,%ecx
e778ee: 83 e1 03 and $0x3,%ecx
e778f1: 74 02 je e778f5 <memcpy+0x25>
e778f3: f3 a4 rep movsb %ds:(%esi),%es:(%edi)
e778f5: 8b 75 f8 mov -0x8(%ebp),%esi
e778f8: 89 d8 mov %ebx,%eax
e778fa: 8b 5d f4 mov -0xc(%ebp),%ebx
e778fd: 8b 7d fc mov -0x4(%ebp),%edi
e77900: 89 ec mov %ebp,%esp
e77902: 5d pop %ebp
e77903: c3 ret
to:
00e778b0 <memcpy>:
e778b0: 55 push %ebp
e778b1: 89 e5 mov %esp,%ebp
e778b3: 83 ec 08 sub $0x8,%esp
e778b6: 89 75 f8 mov %esi,-0x8(%ebp)
e778b9: 89 d6 mov %edx,%esi
e778bb: 89 ca mov %ecx,%edx
e778bd: 89 7d fc mov %edi,-0x4(%ebp)
e778c0: c1 e9 02 shr $0x2,%ecx
e778c3: 89 c7 mov %eax,%edi
e778c5: f3 a5 rep movsl %ds:(%esi),%es:(%edi)
e778c7: 83 e2 03 and $0x3,%edx
e778ca: 74 04 je e778d0 <memcpy+0x20>
e778cc: 89 d1 mov %edx,%ecx
e778ce: f3 a4 rep movsb %ds:(%esi),%es:(%edi)
e778d0: 8b 75 f8 mov -0x8(%ebp),%esi
e778d3: 8b 7d fc mov -0x4(%ebp),%edi
e778d6: 89 ec mov %ebp,%esp
e778d8: 5d pop %ebp
e778d9: c3 ret
due to a better register allocation, avoiding the call-saved
%ebx register.
No functional changes intended.
Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
---
arch/x86/include/asm/string_32.h | 22 ++++++++++++----------
1 file changed, 12 insertions(+), 10 deletions(-)
diff --git a/arch/x86/include/asm/string_32.h b/arch/x86/include/asm/string_32.h
index e9cce169bb4c..16cdda93e437 100644
--- a/arch/x86/include/asm/string_32.h
+++ b/arch/x86/include/asm/string_32.h
@@ -32,16 +32,18 @@ extern size_t strlen(const char *s);
static __always_inline void *__memcpy(void *to, const void *from, size_t n)
{
- int d0, d1, d2;
- asm volatile("rep movsl\n\t"
- "movl %4,%%ecx\n\t"
- "andl $3,%%ecx\n\t"
- "jz 1f\n\t"
- "rep movsb\n\t"
- "1:"
- : "=&c" (d0), "=&D" (d1), "=&S" (d2)
- : "0" (n / 4), "g" (n), "1" ((long)to), "2" ((long)from)
- : "memory");
+ unsigned long esi = (unsigned long)from;
+ unsigned long edi = (unsigned long)to;
+ unsigned long ecx = n >> 2;
+
+ asm volatile("rep movsl"
+ : "+D" (edi), "+S" (esi), "+c" (ecx)
+ : : "memory");
+ ecx = n & 3;
+ if (ecx)
+ asm volatile("rep movsb"
+ : "+D" (edi), "+S" (esi), "+c" (ecx)
+ : : "memory");
return to;
}
--
2.52.0
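For reference, the rewritten helper can be exercised standalone. A minimal
userspace sketch, assuming a 32-bit x86 target (gcc -m32); the test_memcpy()
name and the main() harness are illustrative only, not part of the patch:

/*
 * Mirrors the new __memcpy() from the patch: "rep movsl" copies the
 * whole 32-bit words, then "rep movsb" handles the 0..3 tail bytes.
 * The "+D"/"+S"/"+c" constraints tell the compiler those registers
 * are both read and modified by the asm.
 */
#include <stdio.h>
#include <string.h>

static inline void *test_memcpy(void *to, const void *from, size_t n)
{
	unsigned long esi = (unsigned long)from;
	unsigned long edi = (unsigned long)to;
	unsigned long ecx = n >> 2;

	asm volatile("rep movsl"
		     : "+D" (edi), "+S" (esi), "+c" (ecx)
		     : : "memory");
	ecx = n & 3;
	if (ecx)
		asm volatile("rep movsb"
			     : "+D" (edi), "+S" (esi), "+c" (ecx)
			     : : "memory");
	return to;
}

int main(void)
{
	char src[] = "0123456789abcdef";	/* 17 bytes incl. NUL: four words plus a tail */
	char dst[sizeof(src)];

	test_memcpy(dst, src, sizeof(src));
	printf("%s\n", memcmp(dst, src, sizeof(src)) ? "mismatch" : "ok");
	return 0;
}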
On Tue, 16 Dec 2025 11:37:33 +0100
Uros Bizjak <ubizjak@gmail.com> wrote:
> Use inout "+" constraint modifier where appropriate, declare
> temporary variables as unsigned long and rewrite parts of assembly
> in plain C. The memcpy() function shrinks by 10 bytes, from:
>
> 00e778d0 <memcpy>:
> e778d0: 55 push %ebp
> e778d1: 89 e5 mov %esp,%ebp
> e778d3: 83 ec 0c sub $0xc,%esp
> e778d6: 89 5d f4 mov %ebx,-0xc(%ebp)
> e778d9: 89 c3 mov %eax,%ebx
> e778db: 89 c8 mov %ecx,%eax
> e778dd: 89 75 f8 mov %esi,-0x8(%ebp)
> e778e0: c1 e9 02 shr $0x2,%ecx
> e778e3: 89 d6 mov %edx,%esi
> e778e5: 89 7d fc mov %edi,-0x4(%ebp)
> e778e8: 89 df mov %ebx,%edi
> e778ea: f3 a5 rep movsl %ds:(%esi),%es:(%edi)
> e778ec: 89 c1 mov %eax,%ecx
> e778ee: 83 e1 03 and $0x3,%ecx
> e778f1: 74 02 je e778f5 <memcpy+0x25>
> e778f3: f3 a4 rep movsb %ds:(%esi),%es:(%edi)
> e778f5: 8b 75 f8 mov -0x8(%ebp),%esi
> e778f8: 89 d8 mov %ebx,%eax
> e778fa: 8b 5d f4 mov -0xc(%ebp),%ebx
> e778fd: 8b 7d fc mov -0x4(%ebp),%edi
> e77900: 89 ec mov %ebp,%esp
> e77902: 5d pop %ebp
> e77903: c3 ret
>
> to:
>
> 00e778b0 <memcpy>:
> e778b0: 55 push %ebp
> e778b1: 89 e5 mov %esp,%ebp
> e778b3: 83 ec 08 sub $0x8,%esp
> e778b6: 89 75 f8 mov %esi,-0x8(%ebp)
> e778b9: 89 d6 mov %edx,%esi
> e778bb: 89 ca mov %ecx,%edx
> e778bd: 89 7d fc mov %edi,-0x4(%ebp)
> e778c0: c1 e9 02 shr $0x2,%ecx
> e778c3: 89 c7 mov %eax,%edi
> e778c5: f3 a5 rep movsl %ds:(%esi),%es:(%edi)
> e778c7: 83 e2 03 and $0x3,%edx
> e778ca: 74 04 je e778d0 <memcpy+0x20>
> e778cc: 89 d1 mov %edx,%ecx
> e778ce: f3 a4 rep movsb %ds:(%esi),%es:(%edi)
> e778d0: 8b 75 f8 mov -0x8(%ebp),%esi
> e778d3: 8b 7d fc mov -0x4(%ebp),%edi
> e778d6: 89 ec mov %ebp,%esp
> e778d8: 5d pop %ebp
> e778d9: c3 ret
>
> due to a better register allocation, avoiding the call-saved
> %ebx register.
That might be semi-random.
>
> No functional changes intended.
>
> Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Cc: Ingo Molnar <mingo@kernel.org>
> Cc: Borislav Petkov <bp@alien8.de>
> Cc: Dave Hansen <dave.hansen@linux.intel.com>
> Cc: "H. Peter Anvin" <hpa@zytor.com>
> ---
> arch/x86/include/asm/string_32.h | 22 ++++++++++++----------
> 1 file changed, 12 insertions(+), 10 deletions(-)
>
> diff --git a/arch/x86/include/asm/string_32.h b/arch/x86/include/asm/string_32.h
> index e9cce169bb4c..16cdda93e437 100644
> --- a/arch/x86/include/asm/string_32.h
> +++ b/arch/x86/include/asm/string_32.h
> @@ -32,16 +32,18 @@ extern size_t strlen(const char *s);
>
> static __always_inline void *__memcpy(void *to, const void *from, size_t n)
> {
> - int d0, d1, d2;
> - asm volatile("rep movsl\n\t"
> - "movl %4,%%ecx\n\t"
> - "andl $3,%%ecx\n\t"
> - "jz 1f\n\t"
> - "rep movsb\n\t"
> - "1:"
> - : "=&c" (d0), "=&D" (d1), "=&S" (d2)
> - : "0" (n / 4), "g" (n), "1" ((long)to), "2" ((long)from)
> - : "memory");
> + unsigned long esi = (unsigned long)from;
> + unsigned long edi = (unsigned long)to;
There is no need to cast to 'unsigned long'; the same code should be
generated if 'void *' is used.
> + unsigned long ecx = n >> 2;
> +
> + asm volatile("rep movsl"
> + : "+D" (edi), "+S" (esi), "+c" (ecx)
> + : : "memory");
> + ecx = n & 3;
> + if (ecx)
> + asm volatile("rep movsb"
> + : "+D" (edi), "+S" (esi), "+c" (ecx)
> + : : "memory");
> return to;
> }
>
This version seems to generate better code still:
see https://godbolt.org/z/78cq97PPj
void *__memcpy(void *to, const void *from, unsigned long n)
{
	unsigned long ecx = n >> 2;

	asm volatile("rep movsl"
		     : "+D" (to), "+S" (from), "+c" (ecx)
		     : : "memory");
	ecx = n & 3;
	if (ecx)
		asm volatile("rep movsb"
			     : "+D" (to), "+S" (from), "+c" (ecx)
			     : : "memory");
	return (char *)to - n;
}
Note that the performance is horrid on a lot of CPUs.
(In particular, the copy of the remaining 1-3 bytes.)
David
On Tue, Dec 16, 2025 at 2:14 PM David Laight
<david.laight.linux@gmail.com> wrote:
> > 00e778b0 <memcpy>:
> > e778b0: 55 push %ebp
> > e778b1: 89 e5 mov %esp,%ebp
> > e778b3: 83 ec 08 sub $0x8,%esp
> > e778b6: 89 75 f8 mov %esi,-0x8(%ebp)
> > e778b9: 89 d6 mov %edx,%esi
> > e778bb: 89 ca mov %ecx,%edx
> > e778bd: 89 7d fc mov %edi,-0x4(%ebp)
> > e778c0: c1 e9 02 shr $0x2,%ecx
> > e778c3: 89 c7 mov %eax,%edi
> > e778c5: f3 a5 rep movsl %ds:(%esi),%es:(%edi)
> > e778c7: 83 e2 03 and $0x3,%edx
> > e778ca: 74 04 je e778d0 <memcpy+0x20>
> > e778cc: 89 d1 mov %edx,%ecx
> > e778ce: f3 a4 rep movsb %ds:(%esi),%es:(%edi)
> > e778d0: 8b 75 f8 mov -0x8(%ebp),%esi
> > e778d3: 8b 7d fc mov -0x4(%ebp),%edi
> > e778d6: 89 ec mov %ebp,%esp
> > e778d8: 5d pop %ebp
> > e778d9: c3 ret
> >
> > due to a better register allocation, avoiding the call-saved
> > %ebx register.
>
> That might be semi-random.
Not really, the compiler has more freedom to allocate more optimal registers.
> > + unsigned long ecx = n >> 2;
> > +
> > + asm volatile("rep movsl"
> > + : "+D" (edi), "+S" (esi), "+c" (ecx)
> > + : : "memory");
> > + ecx = n & 3;
> > + if (ecx)
> > + asm volatile("rep movsb"
> > + : "+D" (edi), "+S" (esi), "+c" (ecx)
> > + : : "memory");
> > return to;
> > }
> >
> This version seems to generate better code still:
> see https://godbolt.org/z/78cq97PPj
>
> void *__memcpy(void *to, const void *from, unsigned long n)
> {
> unsigned long ecx = n >> 2;
>
> asm volatile("rep movsl"
> : "+D" (to), "+S" (from), "+c" (ecx)
> : : "memory");
> ecx = n & 3;
> if (ecx)
> asm volatile("rep movsb"
> : "+D" (to), "+S" (from), "+c" (ecx)
> : : "memory");
> return (char *)to - n;
I don't think that additional subtraction outweighs a move from EAX to
a temporary.
BR,
Uros.
On Tue, 16 Dec 2025 17:30:55 +0100
Uros Bizjak <ubizjak@gmail.com> wrote:
> On Tue, Dec 16, 2025 at 2:14 PM David Laight
> <david.laight.linux@gmail.com> wrote:
>
> > > 00e778b0 <memcpy>:
> > > e778b0: 55 push %ebp
> > > e778b1: 89 e5 mov %esp,%ebp
> > > e778b3: 83 ec 08 sub $0x8,%esp
> > > e778b6: 89 75 f8 mov %esi,-0x8(%ebp)
> > > e778b9: 89 d6 mov %edx,%esi
> > > e778bb: 89 ca mov %ecx,%edx
> > > e778bd: 89 7d fc mov %edi,-0x4(%ebp)
> > > e778c0: c1 e9 02 shr $0x2,%ecx
> > > e778c3: 89 c7 mov %eax,%edi
> > > e778c5: f3 a5 rep movsl %ds:(%esi),%es:(%edi)
> > > e778c7: 83 e2 03 and $0x3,%edx
> > > e778ca: 74 04 je e778d0 <memcpy+0x20>
> > > e778cc: 89 d1 mov %edx,%ecx
> > > e778ce: f3 a4 rep movsb %ds:(%esi),%es:(%edi)
> > > e778d0: 8b 75 f8 mov -0x8(%ebp),%esi
> > > e778d3: 8b 7d fc mov -0x4(%ebp),%edi
> > > e778d6: 89 ec mov %ebp,%esp
> > > e778d8: 5d pop %ebp
> > > e778d9: c3 ret
> > >
> > > due to a better register allocation, avoiding the call-saved
> > > %ebx register.
> >
> > That might be semi-random.
>
> Not really, the compiler has more freedom to allocate more optimal registers.
>
> > > + unsigned long ecx = n >> 2;
> > > +
> > > + asm volatile("rep movsl"
> > > + : "+D" (edi), "+S" (esi), "+c" (ecx)
> > > + : : "memory");
> > > + ecx = n & 3;
> > > + if (ecx)
> > > + asm volatile("rep movsb"
> > > + : "+D" (edi), "+S" (esi), "+c" (ecx)
> > > + : : "memory");
> > > return to;
> > > }
> > >
>
> > This version seems to generate better code still:
> > see https://godbolt.org/z/78cq97PPj
> >
> > void *__memcpy(void *to, const void *from, unsigned long n)
> > {
> > unsigned long ecx = n >> 2;
> >
> > asm volatile("rep movsl"
> > : "+D" (to), "+S" (from), "+c" (ecx)
> > : : "memory");
> > ecx = n & 3;
> > if (ecx)
> > asm volatile("rep movsb"
> > : "+D" (to), "+S" (from), "+c" (ecx)
> > : : "memory");
> > return (char *)to - n;
>
> I don't think that additional subtraction outweighs a move from EAX to
> a temporary.
It saves a read from stack, and/or a register spill to stack
(at least as I compiled the code...).
The extra ALU operation is much cheaper - especially since it will
sit in the 'out of order' instruction queue with nothing waiting
for the result (unlike the write to, and read back from, the stack).
Since pretty much no code uses the result of memcpy(), you could
have an inline wrapper that just preserves the 'to' address and calls
a 'void' function to do the copy itself.
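A sketch of that wrapper idea (copy_bytes() and memcpy_keep_to() are names
invented for illustration, not an existing kernel interface):

#include <stddef.h>

/* Out-of-line copy routine that deliberately returns nothing, so it
 * never needs to keep the destination pointer live across the copy. */
void copy_bytes(void *to, const void *from, size_t n);

/* Inline wrapper: the caller already holds 'to' in a register, so
 * returning it here costs nothing extra. */
static inline void *memcpy_keep_to(void *to, const void *from, size_t n)
{
	copy_bytes(to, from, n);
	return to;
}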
But the real performance killer is the setup time for 'rep movs'.
That will outweigh any minor faffing about by orders of magnitude.
Oh, and start trying to select between algorithms based on length
and/or alignment and you get killed by the mispredicted branch
penalty - that seems to be about 20 clocks on my zen-5.
I haven't measured the setup cost for 'rep movs', it is on my
'round tuit' list.
David
>
> BR,
> Uros.
On 12/16/25 08:30, Uros Bizjak wrote:
> I don't think that additional subtraction outweighs a move from EAX to
> a temporary.
There are basically two ways this can end up:
1. None of these small changes (like an extra subtraction) matters in
the end, and this is a debate about nothing. The simplest code wins.
2. It matters and can be shown with some data.
Unfortunately, I see a lot more gazing into crystal balls than data
collection right now.
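For anyone who wants to collect that data, a rough userspace sketch of the
sort of measurement that would settle it; this is illustrative only, not a
proposed methodology - it just times many short memcpy() calls of one length:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#define ITERS	10000000UL

int main(int argc, char **argv)
{
	static char src[256], dst[256];
	size_t len = argc > 1 ? strtoul(argv[1], NULL, 0) : 7;
	struct timespec t0, t1;

	if (len > sizeof(src))
		len = sizeof(src);

	clock_gettime(CLOCK_MONOTONIC, &t0);
	for (unsigned long i = 0; i < ITERS; i++) {
		memcpy(dst, src, len);
		/* keep the compiler from optimising the copy away */
		asm volatile("" : : "r" (dst) : "memory");
	}
	clock_gettime(CLOCK_MONOTONIC, &t1);

	double ns = (t1.tv_sec - t0.tv_sec) * 1e9 + (t1.tv_nsec - t0.tv_nsec);
	printf("len %zu: %.2f ns/copy\n", len, ns / ITERS);
	return 0;
}

Running it for a few lengths (1, 3, 7, 64, ...) on the CPUs of interest would
show whether the tail-byte copy, or the register shuffling around it, is
visible at all.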