Use inout "+" constraint modifier where appropriate, declare
temporary variables as unsigned long and rewrite parts of assembly
in plain C. The memcpy() function shrinks by 10 bytes, from:
00e778d0 <memcpy>:
e778d0: 55 push %ebp
e778d1: 89 e5 mov %esp,%ebp
e778d3: 83 ec 0c sub $0xc,%esp
e778d6: 89 5d f4 mov %ebx,-0xc(%ebp)
e778d9: 89 c3 mov %eax,%ebx
e778db: 89 c8 mov %ecx,%eax
e778dd: 89 75 f8 mov %esi,-0x8(%ebp)
e778e0: c1 e9 02 shr $0x2,%ecx
e778e3: 89 d6 mov %edx,%esi
e778e5: 89 7d fc mov %edi,-0x4(%ebp)
e778e8: 89 df mov %ebx,%edi
e778ea: f3 a5 rep movsl %ds:(%esi),%es:(%edi)
e778ec: 89 c1 mov %eax,%ecx
e778ee: 83 e1 03 and $0x3,%ecx
e778f1: 74 02 je e778f5 <memcpy+0x25>
e778f3: f3 a4 rep movsb %ds:(%esi),%es:(%edi)
e778f5: 8b 75 f8 mov -0x8(%ebp),%esi
e778f8: 89 d8 mov %ebx,%eax
e778fa: 8b 5d f4 mov -0xc(%ebp),%ebx
e778fd: 8b 7d fc mov -0x4(%ebp),%edi
e77900: 89 ec mov %ebp,%esp
e77902: 5d pop %ebp
e77903: c3 ret
to:
00e778b0 <memcpy>:
e778b0: 55 push %ebp
e778b1: 89 e5 mov %esp,%ebp
e778b3: 83 ec 08 sub $0x8,%esp
e778b6: 89 75 f8 mov %esi,-0x8(%ebp)
e778b9: 89 d6 mov %edx,%esi
e778bb: 89 ca mov %ecx,%edx
e778bd: 89 7d fc mov %edi,-0x4(%ebp)
e778c0: c1 e9 02 shr $0x2,%ecx
e778c3: 89 c7 mov %eax,%edi
e778c5: f3 a5 rep movsl %ds:(%esi),%es:(%edi)
e778c7: 83 e2 03 and $0x3,%edx
e778ca: 74 04 je e778d0 <memcpy+0x20>
e778cc: 89 d1 mov %edx,%ecx
e778ce: f3 a4 rep movsb %ds:(%esi),%es:(%edi)
e778d0: 8b 75 f8 mov -0x8(%ebp),%esi
e778d3: 8b 7d fc mov -0x4(%ebp),%edi
e778d6: 89 ec mov %ebp,%esp
e778d8: 5d pop %ebp
e778d9: c3 ret
due to a better register allocation, avoiding the call-saved
%ebx register.
No functional changes intended.
Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
---
arch/x86/include/asm/string_32.h | 22 ++++++++++++----------
1 file changed, 12 insertions(+), 10 deletions(-)
diff --git a/arch/x86/include/asm/string_32.h b/arch/x86/include/asm/string_32.h
index e9cce169bb4c..16cdda93e437 100644
--- a/arch/x86/include/asm/string_32.h
+++ b/arch/x86/include/asm/string_32.h
@@ -32,16 +32,18 @@ extern size_t strlen(const char *s);
static __always_inline void *__memcpy(void *to, const void *from, size_t n)
{
- int d0, d1, d2;
- asm volatile("rep movsl\n\t"
- "movl %4,%%ecx\n\t"
- "andl $3,%%ecx\n\t"
- "jz 1f\n\t"
- "rep movsb\n\t"
- "1:"
- : "=&c" (d0), "=&D" (d1), "=&S" (d2)
- : "0" (n / 4), "g" (n), "1" ((long)to), "2" ((long)from)
- : "memory");
+ unsigned long esi = (unsigned long)from;
+ unsigned long edi = (unsigned long)to;
+ unsigned long ecx = n >> 2;
+
+ asm volatile("rep movsl"
+ : "+D" (edi), "+S" (esi), "+c" (ecx)
+ : : "memory");
+ ecx = n & 3;
+ if (ecx)
+ asm volatile("rep movsb"
+ : "+D" (edi), "+S" (esi), "+c" (ecx)
+ : : "memory");
return to;
}
--
2.52.0
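For reference, the rewritten helper can be exercised standalone. A minimal
userspace sketch, assuming a 32-bit x86 target (gcc -m32); the test_memcpy()
name and the main() harness are illustrative only, not part of the patch:

/*
 * Mirrors the new __memcpy() from the patch: "rep movsl" copies the
 * whole 32-bit words, then "rep movsb" handles the 0..3 tail bytes.
 * The "+D"/"+S"/"+c" constraints tell the compiler those registers
 * are both read and modified by the asm.
 */
#include <stdio.h>
#include <string.h>

static inline void *test_memcpy(void *to, const void *from, size_t n)
{
	unsigned long esi = (unsigned long)from;
	unsigned long edi = (unsigned long)to;
	unsigned long ecx = n >> 2;

	asm volatile("rep movsl"
		     : "+D" (edi), "+S" (esi), "+c" (ecx)
		     : : "memory");
	ecx = n & 3;
	if (ecx)
		asm volatile("rep movsb"
			     : "+D" (edi), "+S" (esi), "+c" (ecx)
			     : : "memory");
	return to;
}

int main(void)
{
	char src[] = "0123456789abcdef";	/* 17 bytes incl. NUL: four words plus a tail */
	char dst[sizeof(src)];

	test_memcpy(dst, src, sizeof(src));
	printf("%s\n", memcmp(dst, src, sizeof(src)) ? "mismatch" : "ok");
	return 0;
}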
On Tue, 16 Dec 2025 11:37:33 +0100
Uros Bizjak <ubizjak@gmail.com> wrote:
> Use inout "+" constraint modifier where appropriate, declare
> temporary variables as unsigned long and rewrite parts of assembly
> in plain C. The memcpy() function shrinks by 10 bytes, from:
>
> 00e778d0 <memcpy>:
> e778d0: 55 push %ebp
> e778d1: 89 e5 mov %esp,%ebp
> e778d3: 83 ec 0c sub $0xc,%esp
> e778d6: 89 5d f4 mov %ebx,-0xc(%ebp)
> e778d9: 89 c3 mov %eax,%ebx
> e778db: 89 c8 mov %ecx,%eax
> e778dd: 89 75 f8 mov %esi,-0x8(%ebp)
> e778e0: c1 e9 02 shr $0x2,%ecx
> e778e3: 89 d6 mov %edx,%esi
> e778e5: 89 7d fc mov %edi,-0x4(%ebp)
> e778e8: 89 df mov %ebx,%edi
> e778ea: f3 a5 rep movsl %ds:(%esi),%es:(%edi)
> e778ec: 89 c1 mov %eax,%ecx
> e778ee: 83 e1 03 and $0x3,%ecx
> e778f1: 74 02 je e778f5 <memcpy+0x25>
> e778f3: f3 a4 rep movsb %ds:(%esi),%es:(%edi)
> e778f5: 8b 75 f8 mov -0x8(%ebp),%esi
> e778f8: 89 d8 mov %ebx,%eax
> e778fa: 8b 5d f4 mov -0xc(%ebp),%ebx
> e778fd: 8b 7d fc mov -0x4(%ebp),%edi
> e77900: 89 ec mov %ebp,%esp
> e77902: 5d pop %ebp
> e77903: c3 ret
>
> to:
>
> 00e778b0 <memcpy>:
> e778b0: 55 push %ebp
> e778b1: 89 e5 mov %esp,%ebp
> e778b3: 83 ec 08 sub $0x8,%esp
> e778b6: 89 75 f8 mov %esi,-0x8(%ebp)
> e778b9: 89 d6 mov %edx,%esi
> e778bb: 89 ca mov %ecx,%edx
> e778bd: 89 7d fc mov %edi,-0x4(%ebp)
> e778c0: c1 e9 02 shr $0x2,%ecx
> e778c3: 89 c7 mov %eax,%edi
> e778c5: f3 a5 rep movsl %ds:(%esi),%es:(%edi)
> e778c7: 83 e2 03 and $0x3,%edx
> e778ca: 74 04 je e778d0 <memcpy+0x20>
> e778cc: 89 d1 mov %edx,%ecx
> e778ce: f3 a4 rep movsb %ds:(%esi),%es:(%edi)
> e778d0: 8b 75 f8 mov -0x8(%ebp),%esi
> e778d3: 8b 7d fc mov -0x4(%ebp),%edi
> e778d6: 89 ec mov %ebp,%esp
> e778d8: 5d pop %ebp
> e778d9: c3 ret
>
> due to a better register allocation, avoiding the call-saved
> %ebx register.
That might be semi-random.
>
> No functional changes intended.
>
> Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Cc: Ingo Molnar <mingo@kernel.org>
> Cc: Borislav Petkov <bp@alien8.de>
> Cc: Dave Hansen <dave.hansen@linux.intel.com>
> Cc: "H. Peter Anvin" <hpa@zytor.com>
> ---
> arch/x86/include/asm/string_32.h | 22 ++++++++++++----------
> 1 file changed, 12 insertions(+), 10 deletions(-)
>
> diff --git a/arch/x86/include/asm/string_32.h b/arch/x86/include/asm/string_32.h
> index e9cce169bb4c..16cdda93e437 100644
> --- a/arch/x86/include/asm/string_32.h
> +++ b/arch/x86/include/asm/string_32.h
> @@ -32,16 +32,18 @@ extern size_t strlen(const char *s);
>
> static __always_inline void *__memcpy(void *to, const void *from, size_t n)
> {
> - int d0, d1, d2;
> - asm volatile("rep movsl\n\t"
> - "movl %4,%%ecx\n\t"
> - "andl $3,%%ecx\n\t"
> - "jz 1f\n\t"
> - "rep movsb\n\t"
> - "1:"
> - : "=&c" (d0), "=&D" (d1), "=&S" (d2)
> - : "0" (n / 4), "g" (n), "1" ((long)to), "2" ((long)from)
> - : "memory");
> + unsigned long esi = (unsigned long)from;
> + unsigned long edi = (unsigned long)to;
There is no need to cast to 'unsigned long'; the same code should be
generated if 'void *' is used.
> + unsigned long ecx = n >> 2;
> +
> + asm volatile("rep movsl"
> + : "+D" (edi), "+S" (esi), "+c" (ecx)
> + : : "memory");
> + ecx = n & 3;
> + if (ecx)
> + asm volatile("rep movsb"
> + : "+D" (edi), "+S" (esi), "+c" (ecx)
> + : : "memory");
> return to;
> }
>
This version seems to generate better code still:
see https://godbolt.org/z/78cq97PPj
void *__memcpy(void *to, const void *from, unsigned long n)
{
	unsigned long ecx = n >> 2;

	asm volatile("rep movsl"
		     : "+D" (to), "+S" (from), "+c" (ecx)
		     : : "memory");
	ecx = n & 3;
	if (ecx)
		asm volatile("rep movsb"
			     : "+D" (to), "+S" (from), "+c" (ecx)
			     : : "memory");
	return (char *)to - n;
}
Note that the performance is horrid on a lot of CPUs.
(In particular, the copy of the remaining 1-3 bytes.)
David
On Tue, Dec 16, 2025 at 2:14 PM David Laight
<david.laight.linux@gmail.com> wrote:
> > 00e778b0 <memcpy>:
> > e778b0: 55 push %ebp
> > e778b1: 89 e5 mov %esp,%ebp
> > e778b3: 83 ec 08 sub $0x8,%esp
> > e778b6: 89 75 f8 mov %esi,-0x8(%ebp)
> > e778b9: 89 d6 mov %edx,%esi
> > e778bb: 89 ca mov %ecx,%edx
> > e778bd: 89 7d fc mov %edi,-0x4(%ebp)
> > e778c0: c1 e9 02 shr $0x2,%ecx
> > e778c3: 89 c7 mov %eax,%edi
> > e778c5: f3 a5 rep movsl %ds:(%esi),%es:(%edi)
> > e778c7: 83 e2 03 and $0x3,%edx
> > e778ca: 74 04 je e778d0 <memcpy+0x20>
> > e778cc: 89 d1 mov %edx,%ecx
> > e778ce: f3 a4 rep movsb %ds:(%esi),%es:(%edi)
> > e778d0: 8b 75 f8 mov -0x8(%ebp),%esi
> > e778d3: 8b 7d fc mov -0x4(%ebp),%edi
> > e778d6: 89 ec mov %ebp,%esp
> > e778d8: 5d pop %ebp
> > e778d9: c3 ret
> >
> > due to a better register allocation, avoiding the call-saved
> > %ebx register.
>
> That might be semi-random.
Not really, the compiler has more freedom to allocate more optimal registers.
> > + unsigned long ecx = n >> 2;
> > +
> > + asm volatile("rep movsl"
> > + : "+D" (edi), "+S" (esi), "+c" (ecx)
> > + : : "memory");
> > + ecx = n & 3;
> > + if (ecx)
> > + asm volatile("rep movsb"
> > + : "+D" (edi), "+S" (esi), "+c" (ecx)
> > + : : "memory");
> > return to;
> > }
> >
> This version seems to generate better code still:
> see https://godbolt.org/z/78cq97PPj
>
> void *__memcpy(void *to, const void *from, unsigned long n)
> {
> unsigned long ecx = n >> 2;
>
> asm volatile("rep movsl"
> : "+D" (to), "+S" (from), "+c" (ecx)
> : : "memory");
> ecx = n & 3;
> if (ecx)
> asm volatile("rep movsb"
> : "+D" (to), "+S" (from), "+c" (ecx)
> : : "memory");
> return (char *)to - n;
I don't think that additional subtraction outweighs a move from EAX to
a temporary.
BR,
Uros.
On Tue, 16 Dec 2025 17:30:55 +0100
Uros Bizjak <ubizjak@gmail.com> wrote:
> On Tue, Dec 16, 2025 at 2:14 PM David Laight
> <david.laight.linux@gmail.com> wrote:
>
> > > 00e778b0 <memcpy>:
> > > e778b0: 55 push %ebp
> > > e778b1: 89 e5 mov %esp,%ebp
> > > e778b3: 83 ec 08 sub $0x8,%esp
> > > e778b6: 89 75 f8 mov %esi,-0x8(%ebp)
> > > e778b9: 89 d6 mov %edx,%esi
> > > e778bb: 89 ca mov %ecx,%edx
> > > e778bd: 89 7d fc mov %edi,-0x4(%ebp)
> > > e778c0: c1 e9 02 shr $0x2,%ecx
> > > e778c3: 89 c7 mov %eax,%edi
> > > e778c5: f3 a5 rep movsl %ds:(%esi),%es:(%edi)
> > > e778c7: 83 e2 03 and $0x3,%edx
> > > e778ca: 74 04 je e778d0 <memcpy+0x20>
> > > e778cc: 89 d1 mov %edx,%ecx
> > > e778ce: f3 a4 rep movsb %ds:(%esi),%es:(%edi)
> > > e778d0: 8b 75 f8 mov -0x8(%ebp),%esi
> > > e778d3: 8b 7d fc mov -0x4(%ebp),%edi
> > > e778d6: 89 ec mov %ebp,%esp
> > > e778d8: 5d pop %ebp
> > > e778d9: c3 ret
> > >
> > > due to a better register allocation, avoiding the call-saved
> > > %ebx register.
> >
> > That might be semi-random.
>
> Not really, the compiler has more freedom to allocate more optimal registers.
>
> > > + unsigned long ecx = n >> 2;
> > > +
> > > + asm volatile("rep movsl"
> > > + : "+D" (edi), "+S" (esi), "+c" (ecx)
> > > + : : "memory");
> > > + ecx = n & 3;
> > > + if (ecx)
> > > + asm volatile("rep movsb"
> > > + : "+D" (edi), "+S" (esi), "+c" (ecx)
> > > + : : "memory");
> > > return to;
> > > }
> > >
>
> > This version seems to generate better code still:
> > see https://godbolt.org/z/78cq97PPj
> >
> > void *__memcpy(void *to, const void *from, unsigned long n)
> > {
> > unsigned long ecx = n >> 2;
> >
> > asm volatile("rep movsl"
> > : "+D" (to), "+S" (from), "+c" (ecx)
> > : : "memory");
> > ecx = n & 3;
> > if (ecx)
> > asm volatile("rep movsb"
> > : "+D" (to), "+S" (from), "+c" (ecx)
> > : : "memory");
> > return (char *)to - n;
>
> I don't think that additional subtraction outweighs a move from EAX to
> a temporary.
It saves a read from stack, and/or a register spill to stack
(at least as I compiled the code...).
The extra ALU operation is much cheaper - especially since it will
sit in the 'out of order' instruction queue with nothing waiting
for the result (unlike the write to, and read back from, the stack).
Since pretty much no code uses the result of memcpy(), you could
have an inline wrapper that just preserves the 'to' address and calls
a 'void' function to do the copy itself.
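A sketch of that wrapper idea (copy_bytes() and memcpy_keep_to() are names
invented for illustration, not an existing kernel interface):

#include <stddef.h>

/* Out-of-line copy routine that deliberately returns nothing, so it
 * never needs to keep the destination pointer live across the copy. */
void copy_bytes(void *to, const void *from, size_t n);

/* Inline wrapper: the caller already holds 'to' in a register, so
 * returning it here costs nothing extra. */
static inline void *memcpy_keep_to(void *to, const void *from, size_t n)
{
	copy_bytes(to, from, n);
	return to;
}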
But the real performance killer is the setup time for 'rep movs'.
That will outweigh any minor faffing about by orders of magnitude.
Oh, and start trying to select between algorithms based on length
and/or alignment and you get killed by the mispredicted branch
penalty - that seems to be about 20 clocks on my zen-5.
I haven't measured the setup cost for 'rep movs', it is on my
'round tuit' list.
David
>
> BR,
> Uros.
On 12/16/25 08:30, Uros Bizjak wrote:
> I don't think that additional subtraction outweighs a move from EAX to
> a temporary.
There are basically two ways this can end up:
1. None of these small changes (like an extra subtraction) matters in
the end, and this is a debate about nothing. The simplest code wins.
2. It matters and can be shown with some data.
Unfortunately, I see a lot more gazing into crystal balls than data
collection right now.
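For anyone who wants to collect that data, a rough userspace sketch of the
sort of measurement that would settle it; this is illustrative only, not a
proposed methodology - it just times many short memcpy() calls of one length:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#define ITERS	10000000UL

int main(int argc, char **argv)
{
	static char src[256], dst[256];
	size_t len = argc > 1 ? strtoul(argv[1], NULL, 0) : 7;
	struct timespec t0, t1;

	if (len > sizeof(src))
		len = sizeof(src);

	clock_gettime(CLOCK_MONOTONIC, &t0);
	for (unsigned long i = 0; i < ITERS; i++) {
		memcpy(dst, src, len);
		/* keep the compiler from optimising the copy away */
		asm volatile("" : : "r" (dst) : "memory");
	}
	clock_gettime(CLOCK_MONOTONIC, &t1);

	double ns = (t1.tv_sec - t0.tv_sec) * 1e9 + (t1.tv_nsec - t0.tv_nsec);
	printf("len %zu: %.2f ns/copy\n", len, ns / ITERS);
	return 0;
}

Running it for a few lengths (1, 3, 7, 64, ...) on the CPUs of interest would
show whether the tail-byte copy, or the register shuffling around it, is
visible at all.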