[RFC PATCH v2 1/4] tools/nolibc: x86-64: Use `rep movsb` for `memcpy()` and `memmove()`

[RFC PATCH v2 1/4] tools/nolibc: x86-64: Use `rep movsb` for `memcpy()` and `memmove()`
Posted by Ammar Faizi 2 years, 3 months ago
Simplify memcpy() and memmove() on the x86-64 arch.

The x86-64 arch has a 'rep movsb' instruction, which can perform
memcpy() in a single instruction, given:

    %rdi = destination
    %rsi = source
    %rcx = length

It can also handle the overlapping case by setting DF=1 (backward
copy), which allows it to serve as the memmove() implementation.
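
For reference, the overlap check used by the new assembly below is the
unsigned comparison `(dst - src) < len`. A minimal C sketch of the same
logic (illustration only; the function name is hypothetical and this is
not part of the patch):
```
#include <stddef.h>

static void *memmove_sketch(void *dst, const void *src, size_t len)
{
	char *d = dst;
	const char *s = src;

	/*
	 * Viewed as an unsigned number, (d - s) < len holds exactly
	 * when dst lies within [src, src + len), the only case that
	 * needs a backward copy ('std' + 'rep movsb' in the asm).
	 */
	if ((size_t)(d - s) < len) {
		while (len--)			/* copy from the last byte down */
			d[len] = s[len];
	} else {
		size_t i;

		for (i = 0; i < len; i++)	/* plain forward copy */
			d[i] = s[i];
	}
	return dst;
}
```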

Before this patch:
```
  00000000000010ab <memmove>:
    10ab: 48 89 f8              mov    %rdi,%rax
    10ae: 31 c9                 xor    %ecx,%ecx
    10b0: 48 39 f7              cmp    %rsi,%rdi
    10b3: 48 83 d1 ff           adc    $0xffffffffffffffff,%rcx
    10b7: 48 85 d2              test   %rdx,%rdx
    10ba: 74 25                 je     10e1 <memmove+0x36>
    10bc: 48 83 c9 01           or     $0x1,%rcx
    10c0: 48 39 f0              cmp    %rsi,%rax
    10c3: 48 c7 c7 ff ff ff ff  mov    $0xffffffffffffffff,%rdi
    10ca: 48 0f 43 fa           cmovae %rdx,%rdi
    10ce: 48 01 cf              add    %rcx,%rdi
    10d1: 44 8a 04 3e           mov    (%rsi,%rdi,1),%r8b
    10d5: 44 88 04 38           mov    %r8b,(%rax,%rdi,1)
    10d9: 48 01 cf              add    %rcx,%rdi
    10dc: 48 ff ca              dec    %rdx
    10df: 75 f0                 jne    10d1 <memmove+0x26>
    10e1: c3                    ret

  00000000000010e2 <memcpy>:
    10e2: 48 89 f8              mov    %rdi,%rax
    10e5: 48 85 d2              test   %rdx,%rdx
    10e8: 74 12                 je     10fc <memcpy+0x1a>
    10ea: 31 c9                 xor    %ecx,%ecx
    10ec: 40 8a 3c 0e           mov    (%rsi,%rcx,1),%dil
    10f0: 40 88 3c 08           mov    %dil,(%rax,%rcx,1)
    10f4: 48 ff c1              inc    %rcx
    10f7: 48 39 ca              cmp    %rcx,%rdx
    10fa: 75 f0                 jne    10ec <memcpy+0xa>
    10fc: c3                    ret
```

After this patch:
```
  0000000000401040 <memmove>:
    401040: 48 89 d1              mov    %rdx,%rcx
    401043: 48 89 fa              mov    %rdi,%rdx
    401046: 48 89 f8              mov    %rdi,%rax
    401049: 48 29 f2              sub    %rsi,%rdx
    40104c: 48 39 ca              cmp    %rcx,%rdx
    40104f: 73 0f                 jae    401060 <memmove+0x20>
    401051: 48 8d 7c 0f ff        lea    -0x1(%rdi,%rcx,1),%rdi
    401056: 48 8d 74 0e ff        lea    -0x1(%rsi,%rcx,1),%rsi
    40105b: fd                    std
    40105c: f3 a4                 rep movsb %ds:(%rsi),%es:(%rdi)
    40105e: fc                    cld
    40105f: c3                    ret
    401060: f3 a4                 rep movsb %ds:(%rsi),%es:(%rdi)
    401062: c3                    ret

  0000000000401063 <memcpy>:
    401063: 48 89 f8              mov    %rdi,%rax
    401066: 48 89 d1              mov    %rdx,%rcx
    401069: f3 a4                 rep movsb %ds:(%rsi),%es:(%rdi)
    40106b: c3                    ret
```

Link: https://lore.kernel.org/lkml/5a821292d96a4dbc84c96ccdc6b5b666@AcuMS.aculab.com
Suggested-by: David Laight <David.Laight@aculab.com>
Signed-off-by: Ammar Faizi <ammarfaizi2@gnuweeb.org>
---
 tools/include/nolibc/arch-x86_64.h | 35 ++++++++++++++++++++++++++++++
 tools/include/nolibc/string.h      |  4 ++++
 2 files changed, 39 insertions(+)

diff --git a/tools/include/nolibc/arch-x86_64.h b/tools/include/nolibc/arch-x86_64.h
index e5ccb926c90306b6..297ffa364b2312eb 100644
--- a/tools/include/nolibc/arch-x86_64.h
+++ b/tools/include/nolibc/arch-x86_64.h
@@ -156,21 +156,56 @@
 
 /* startup code */
 /*
  * x86-64 System V ABI mandates:
  * 1) %rsp must be 16-byte aligned right before the function call.
  * 2) The deepest stack frame should be zero (the %rbp).
  *
  */
 void __attribute__((weak, noreturn, optimize("Os", "omit-frame-pointer"))) __no_stack_protector _start(void)
 {
 	__asm__ volatile (
 		"xor  %ebp, %ebp\n"       /* zero the stack frame                            */
 		"mov  %rsp, %rdi\n"       /* save stack pointer to %rdi, as arg1 of _start_c */
 		"and  $-16, %rsp\n"       /* %rsp must be 16-byte aligned before call        */
 		"call _start_c\n"         /* transfer to c runtime                           */
 		"hlt\n"                   /* ensure it does not return                       */
 	);
 	__builtin_unreachable();
 }
 
+#define NOLIBC_ARCH_HAS_MEMMOVE
+void *memmove(void *dst, const void *src, size_t len);
+
+#define NOLIBC_ARCH_HAS_MEMCPY
+void *memcpy(void *dst, const void *src, size_t len);
+
+__asm__ (
+".section .text.nolibc_memmove\n"
+".weak memmove\n"
+"memmove:\n"
+	"movq %rdx, %rcx\n"
+	"movq %rdi, %rdx\n"
+	"movq %rdi, %rax\n"
+	"subq %rsi, %rdx\n"
+	"cmpq %rcx, %rdx\n"
+	"jnb  .Lforward_copy\n"
+	"leaq -1(%rdi, %rcx, 1), %rdi\n"
+	"leaq -1(%rsi, %rcx, 1), %rsi\n"
+	"std\n"
+	"rep movsb\n"
+	"cld\n"
+	"retq\n"
+".Lforward_copy:\n"
+	"rep movsb\n"
+	"retq\n"
+
+".section .text.nolibc_memcpy\n"
+".weak memcpy\n"
+"memcpy:\n"
+	"movq %rdi, %rax\n"
+	"movq %rdx, %rcx\n"
+	"rep movsb\n"
+	"retq\n"
+);
+
 #endif /* _NOLIBC_ARCH_X86_64_H */
diff --git a/tools/include/nolibc/string.h b/tools/include/nolibc/string.h
index 0c2e06c7c4772bc6..6eca267ec6fa7177 100644
--- a/tools/include/nolibc/string.h
+++ b/tools/include/nolibc/string.h
@@ -32,70 +32,74 @@ void *_nolibc_memcpy_up(void *dst, const void *src, size_t len)
 {
 	size_t pos = 0;
 
 	while (pos < len) {
 		((char *)dst)[pos] = ((const char *)src)[pos];
 		pos++;
 	}
 	return dst;
 }
 
 static __attribute__((unused))
 void *_nolibc_memcpy_down(void *dst, const void *src, size_t len)
 {
 	while (len) {
 		len--;
 		((char *)dst)[len] = ((const char *)src)[len];
 	}
 	return dst;
 }
 
+#ifndef NOLIBC_ARCH_HAS_MEMMOVE
 /* might be ignored by the compiler without -ffreestanding, then found as
  * missing.
  */
 __attribute__((weak,unused,section(".text.nolibc_memmove")))
 void *memmove(void *dst, const void *src, size_t len)
 {
 	size_t dir, pos;
 
 	pos = len;
 	dir = -1;
 
 	if (dst < src) {
 		pos = -1;
 		dir = 1;
 	}
 
 	while (len) {
 		pos += dir;
 		((char *)dst)[pos] = ((const char *)src)[pos];
 		len--;
 	}
 	return dst;
 }
+#endif /* #ifndef NOLIBC_ARCH_HAS_MEMMOVE */
 
+#ifndef NOLIBC_ARCH_HAS_MEMCPY
 /* must be exported, as it's used by libgcc on ARM */
 __attribute__((weak,unused,section(".text.nolibc_memcpy")))
 void *memcpy(void *dst, const void *src, size_t len)
 {
 	return _nolibc_memcpy_up(dst, src, len);
 }
+#endif /* #ifndef NOLIBC_ARCH_HAS_MEMCPY */
 
 /* might be ignored by the compiler without -ffreestanding, then found as
  * missing.
  */
 __attribute__((weak,unused,section(".text.nolibc_memset")))
 void *memset(void *dst, int b, size_t len)
 {
 	char *p = dst;
 
 	while (len--) {
 		/* prevent gcc from recognizing memset() here */
 		__asm__ volatile("");
 		*(p++) = b;
 	}
 	return dst;
 }
 
 static __attribute__((unused))
 char *strchr(const char *s, int c)
 {
-- 
Ammar Faizi
Re: [RFC PATCH v2 1/4] tools/nolibc: x86-64: Use `rep movsb` for `memcpy()` and `memmove()`
Posted by Alviro Iskandar Setiawan 2 years, 3 months ago
On Sat, Sep 2, 2023 at 12:51 PM Ammar Faizi wrote:
> +__asm__ (
> +".section .text.nolibc_memmove\n"
> +".weak memmove\n"
> +"memmove:\n"
> +       "movq %rdx, %rcx\n"
> +       "movq %rdi, %rdx\n"
> +       "movq %rdi, %rax\n"
> +       "subq %rsi, %rdx\n"
> +       "cmpq %rcx, %rdx\n"
> +       "jnb  .Lforward_copy\n"
> +       "leaq -1(%rdi, %rcx, 1), %rdi\n"
> +       "leaq -1(%rsi, %rcx, 1), %rsi\n"
> +       "std\n"
> +       "rep movsb\n"
> +       "cld\n"
> +       "retq\n"
> +".Lforward_copy:\n"
> +       "rep movsb\n"
> +       "retq\n"
> +
> +".section .text.nolibc_memcpy\n"
> +".weak memcpy\n"
> +"memcpy:\n"
> +       "movq %rdi, %rax\n"
> +       "movq %rdx, %rcx\n"
> +       "rep movsb\n"
> +       "retq\n"
> +);

Btw, sir, this can be simplified further by merging the forward copy
path; using only two "rep movsb" for both memmove() and memcpy()
should be enough?
```
__asm__ (
".section .text.nolibc_memmove_memcpy\n"
".weak memmove\n"
".weak memcpy\n"
"memmove:\n"
        "movq %rdx, %rcx\n"
        "movq %rdi, %rdx\n"
        "movq %rdi, %rax\n"
        "subq %rsi, %rdx\n"
        "cmpq %rcx, %rdx\n"
        "jnb __nolibc_forward_copy\n"
        "leaq -1(%rdi, %rcx, 1), %rdi\n"
        "leaq -1(%rsi, %rcx, 1), %rsi\n"
        "std\n"
        "rep movsb\n"
        "cld\n"
        "retq\n"

"memcpy:\n"
        "movq %rdi, %rax\n"
        "movq %rdx, %rcx\n"
"__nolibc_forward_copy:\n"
        "rep movsb\n"
        "retq\n"
);
```
Thought?

-- Viro
Re: [RFC PATCH v2 1/4] tools/nolibc: x86-64: Use `rep movsb` for `memcpy()` and `memmove()`
Posted by Ammar Faizi 2 years, 3 months ago
On Sat, Sep 02, 2023 at 01:07:50PM +0700, Alviro Iskandar Setiawan wrote:
> Btw, sir, this can be simplified further by merging the forward copy
> path; using only two "rep movsb" for both memmove() and memcpy()
> should be enough?
> ```
> __asm__ (
> ".section .text.nolibc_memmove_memcpy\n"
> ".weak memmove\n"
> ".weak memcpy\n"
> "memmove:\n"
>         "movq %rdx, %rcx\n"
>         "movq %rdi, %rdx\n"
>         "movq %rdi, %rax\n"
>         "subq %rsi, %rdx\n"
>         "cmpq %rcx, %rdx\n"
>         "jnb __nolibc_forward_copy\n"
>         "leaq -1(%rdi, %rcx, 1), %rdi\n"
>         "leaq -1(%rsi, %rcx, 1), %rsi\n"
>         "std\n"
>         "rep movsb\n"
>         "cld\n"
>         "retq\n"
> 
> "memcpy:\n"
>         "movq %rdi, %rax\n"
>         "movq %rdx, %rcx\n"
> "__nolibc_forward_copy:\n"
>         "rep movsb\n"
>         "retq\n"
> );
> ```

Looks good. I'll apply that change.

-- 
Ammar Faizi
Re: [RFC PATCH v2 1/4] tools/nolibc: x86-64: Use `rep movsb` for `memcpy()` and `memmove()`
Posted by Willy Tarreau 2 years, 3 months ago
On Sat, Sep 02, 2023 at 01:11:06PM +0700, Ammar Faizi wrote:
> On Sat, Sep 02, 2023 at 01:07:50PM +0700, Alviro Iskandar Setiawan wrote:
> > Btw, sir, this can be simplified further by merging the forward copy
> > path; using only two "rep movsb" for both memmove() and memcpy()
> > should be enough?
> > ```
> > __asm__ (
> > ".section .text.nolibc_memmove_memcpy\n"
> > ".weak memmove\n"
> > ".weak memcpy\n"
> > "memmove:\n"
> >         "movq %rdx, %rcx\n"
> >         "movq %rdi, %rdx\n"
> >         "movq %rdi, %rax\n"
> >         "subq %rsi, %rdx\n"
> >         "cmpq %rcx, %rdx\n"
> >         "jnb __nolibc_forward_copy\n"
> >         "leaq -1(%rdi, %rcx, 1), %rdi\n"
> >         "leaq -1(%rsi, %rcx, 1), %rsi\n"
> >         "std\n"
> >         "rep movsb\n"
> >         "cld\n"
> >         "retq\n"
> > 
> > "memcpy:\n"
> >         "movq %rdi, %rax\n"
> >         "movq %rdx, %rcx\n"
> > "__nolibc_forward_copy:\n"
> >         "rep movsb\n"
> >         "retq\n"
> > );
> > ```
> 
> Looks good. I'll apply that change.

Note that in this case we simply don't need a special
version of memcpy(); memmove() is always OK for this,
so you can simplify this further by starting with:

  memcpy:
  memmove:
        ...

Willy
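
For illustration, folding in both suggestions (the shared forward-copy
path and the aliased entry points) might look like the sketch below.
This is hypothetical and untested, not the actual follow-up patch:
```
__asm__ (
".section .text.nolibc_memmove_memcpy\n"
".weak memmove\n"
".weak memcpy\n"
"memmove:\n"			/* both entries share one body;         */
"memcpy:\n"			/* the forward path is a valid memcpy() */
	"movq %rdx, %rcx\n"
	"movq %rdi, %rdx\n"
	"movq %rdi, %rax\n"
	"subq %rsi, %rdx\n"
	"cmpq %rcx, %rdx\n"
	"jnb  .Lforward_copy\n"
	"leaq -1(%rdi, %rcx, 1), %rdi\n"
	"leaq -1(%rsi, %rcx, 1), %rsi\n"
	"std\n"
	"rep movsb\n"
	"cld\n"
	"retq\n"
".Lforward_copy:\n"
	"rep movsb\n"
	"retq\n"
);
```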
Re: [RFC PATCH v2 1/4] tools/nolibc: x86-64: Use `rep movsb` for `memcpy()` and `memmove()`
Posted by Ammar Faizi 2 years, 3 months ago
On Sat, Sep 02, 2023 at 08:22:37AM +0200, Willy Tarreau wrote:
> Note that in this case we simply don't need a special
> version of memcpy(); memmove() is always OK for this,
> so you can simplify this further by starting with:
> 
>   memcpy:
>   memmove:
>        ...

Ok, I'll do that.

-- 
Ammar Faizi
Re: [RFC PATCH v2 1/4] tools/nolibc: x86-64: Use `rep movsb` for `memcpy()` and `memmove()`
Posted by Alviro Iskandar Setiawan 2 years, 3 months ago
On Sat, Sep 2, 2023 at 1:38 PM Ammar Faizi wrote:
> Ok, I'll do that.

Another micro-optimization. Since the likely case is the forward copy,
make it the case that doesn't take the jump.

Pseudo C:
if (unlikely(dst - src < n))
        backward_copy();
else
        forward_copy();

-- Viro
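
In assembly terms, that reordering lets the common forward copy fall
through with no taken branch, jumping only for the overlapping case.
A hypothetical sketch (section/weak directives omitted for brevity):
```
__asm__ (
"memmove:\n"
"memcpy:\n"
	"movq %rdx, %rcx\n"
	"movq %rdi, %rdx\n"
	"movq %rdi, %rax\n"
	"subq %rsi, %rdx\n"
	"cmpq %rcx, %rdx\n"
	"jb   .Lbackward_copy\n"	/* rare: dst overlaps the source tail  */
	"rep movsb\n"			/* common path: falls straight through */
	"retq\n"
".Lbackward_copy:\n"
	"leaq -1(%rdi, %rcx, 1), %rdi\n"
	"leaq -1(%rsi, %rcx, 1), %rsi\n"
	"std\n"
	"rep movsb\n"
	"cld\n"
	"retq\n"
);
```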
Re: [RFC PATCH v2 1/4] tools/nolibc: x86-64: Use `rep movsb` for `memcpy()` and `memmove()`
Posted by Ammar Faizi 2 years, 3 months ago
On 2023/09/02 7:29 PM, Alviro Iskandar Setiawan wrote:
> On Sat, Sep 2, 2023 at 1:38 PM Ammar Faizi wrote:
>> Ok, I'll do that.
> 
> Another micro-optimization. Since the likely case is the forward copy,
> make it the case that doesn't take the jump.
> 
> Pseudo C:
> if (unlikely(dst - src < n))
>          backward_copy();
> else
>          forward_copy();

Point taken.

-- 
Ammar Faizi

RE: [RFC PATCH v2 1/4] tools/nolibc: x86-64: Use `rep movsb` for `memcpy()` and `memmove()`
Posted by David Laight 2 years, 3 months ago
From: Ammar Faizi
> Sent: 02 September 2023 13:37
> On 2023/09/02 7:29 PM, Alviro Iskandar Setiawan wrote:
> > On Sat, Sep 2, 2023 at 1:38 PM Ammar Faizi wrote:
> >> Ok, I'll do that.
> >
> > Another micro-optimization. Since the likely case is the forward copy,
> > make it the case that doesn't take the jump.
> >
> > Pseudo C:
> > if (unlikely(dst - src < n))
> >          backward_copy();
> > else
> >          forward_copy();
> 
> Point taken.

I believe it makes almost no difference.
I'm sure I read that modern (Intel at least) CPUs never do
static branch prediction.
So with a 'cold cache' there is a 50% chance of the branch being
taken; nothing the compiler can do will affect it.
OTOH having
	if (likely(dst - src >= n))
		forwards();
	else
		backwards();
(and in that order) probably makes the code a bit easier to read.

	David
