[RFC PATCH v1 2/5] tools/nolibc: x86-64: Use `rep stosb` for `memset()`

Ammar Faizi posted 5 patches 2 years, 3 months ago
There is a newer version of this series
[RFC PATCH v1 2/5] tools/nolibc: x86-64: Use `rep stosb` for `memset()`
Posted by Ammar Faizi 2 years, 3 months ago
Simplify memset() on the x86-64 arch.

The x86-64 arch has a 'rep stosb' instruction, which can perform
memset() using only a single instruction, given:

    %al  = value (just like the second argument of memset())
    %rdi = destination
    %rcx = length

Before this patch:
```
  00000000000010c9 <memset>:
    10c9: 48 89 f8              mov    %rdi,%rax
    10cc: 48 85 d2              test   %rdx,%rdx
    10cf: 74 0e                 je     10df <memset+0x16>
    10d1: 31 c9                 xor    %ecx,%ecx
    10d3: 40 88 34 08           mov    %sil,(%rax,%rcx,1)
    10d7: 48 ff c1              inc    %rcx
    10da: 48 39 ca              cmp    %rcx,%rdx
    10dd: 75 f4                 jne    10d3 <memset+0xa>
    10df: c3                    ret
```

After this patch:
```
  00000000000010b1 <memset>:
    10b1: 48 89 f0              mov    %rsi,%rax
    10b4: 48 89 d1              mov    %rdx,%rcx
    10b7: 48 89 fa              mov    %rdi,%rdx
    10ba: f3 aa                 rep stos %al,%es:(%rdi)
    10bc: 48 89 d0              mov    %rdx,%rax
    10bf: c3                    ret
```

Signed-off-by: Ammar Faizi <ammarfaizi2@gnuweeb.org>
---
 tools/include/nolibc/arch-x86_64.h | 13 +++++++++++++
 tools/include/nolibc/string.h      |  2 ++
 2 files changed, 15 insertions(+)

diff --git a/tools/include/nolibc/arch-x86_64.h b/tools/include/nolibc/arch-x86_64.h
index c5162170a2ccdff1..42f2674ad1ecdd64 100644
--- a/tools/include/nolibc/arch-x86_64.h
+++ b/tools/include/nolibc/arch-x86_64.h
@@ -179,6 +179,9 @@ void *memmove(void *dst, const void *src, size_t len);
 #define NOLIBC_ARCH_HAS_MEMCPY
 void *memcpy(void *dst, const void *src, size_t len);
 
+#define NOLIBC_ARCH_HAS_MEMSET
+void *memset(void *dst, int c, size_t len);
+
 __asm__ (
 ".section .text.nolibc_memmove\n"
 ".weak memmove\n"
@@ -199,6 +202,16 @@ __asm__ (
 	"movq %rdx, %rcx\n"
 	"rep movsb\n"
 	"retq\n"
+
+".section .text.nolibc_memset\n"
+".weak memset\n"
+"memset:\n"
+	"movq %rsi, %rax\n"
+	"movq %rdx, %rcx\n"
+	"movq %rdi, %rdx\n"
+	"rep stosb\n"
+	"movq %rdx, %rax\n"
+	"retq\n"
 );
 
 #endif /* _NOLIBC_ARCH_X86_64_H */
diff --git a/tools/include/nolibc/string.h b/tools/include/nolibc/string.h
index 6eca267ec6fa7177..1bad6121ef8c4ab5 100644
--- a/tools/include/nolibc/string.h
+++ b/tools/include/nolibc/string.h
@@ -84,6 +84,7 @@ void *memcpy(void *dst, const void *src, size_t len)
 }
 #endif /* #ifndef NOLIBC_ARCH_HAS_MEMCPY */
 
+#ifndef NOLIBC_ARCH_HAS_MEMSET
 /* might be ignored by the compiler without -ffreestanding, then found as
  * missing.
  */
@@ -99,6 +100,7 @@ void *memset(void *dst, int b, size_t len)
 	}
 	return dst;
 }
+#endif /* #ifndef NOLIBC_ARCH_HAS_MEMSET */
 
 static __attribute__((unused))
 char *strchr(const char *s, int c)
-- 
Ammar Faizi
Re: [RFC PATCH v1 2/5] tools/nolibc: x86-64: Use `rep stosb` for `memset()`
Posted by Alviro Iskandar Setiawan 2 years, 3 months ago
On Wed, Aug 30, 2023 at 8:57 PM Ammar Faizi wrote:
>   00000000000010b1 <memset>:
>     10b1: 48 89 f0              mov    %rsi,%rax
>     10b4: 48 89 d1              mov    %rdx,%rcx
>     10b7: 48 89 fa              mov    %rdi,%rdx
>     10ba: f3 aa                 rep stos %al,%es:(%rdi)
>     10bc: 48 89 d0              mov    %rdx,%rax
>     10bf: c3                    ret

Just a small idea to shrink this further: "mov %rdi, %rdx" and "mov %rdx,
%rax" can be replaced with "push %rdi" and "pop %rax" (each is just one
byte instead of three). So we can save 4 more bytes.

0000000000001500 <memset>:
    1500: 48 89 f0     mov    %rsi,%rax
    1503: 48 89 d1     mov    %rdx,%rcx
    1506: 57           push   %rdi
    1507: f3 aa        rep stos %al,%es:(%rdi)
    1509: 58           pop    %rax
    150a: c3           ret

But I know you don't like it because it costs extra memory access.

-- Viro
Re: [RFC PATCH v1 2/5] tools/nolibc: x86-64: Use `rep stosb` for `memset()`
Posted by Ammar Faizi 2 years, 3 months ago
On Wed, Aug 30, 2023 at 09:24:45PM +0700, Alviro Iskandar Setiawan wrote:
> Just a small idea to shrink this more, "mov %rdi, %rdx" and "mov %rdx,
> %rax" can be replaced with "push %rdi" and "pop %rax" (they are just a
> byte). So we can save 4 bytes more.
> 
> 0000000000001500 <memset>:
>     1500: 48 89 f0     mov    %rsi,%rax
>     1503: 48 89 d1     mov    %rdx,%rcx
>     1506: 57           push   %rdi
>     1507: f3 aa        rep stos %al,%es:(%rdi)
>     1509: 58           pop    %rax
>     150a: c3           ret
> 
> But I know you don't like it because it costs extra memory access.

Yes, that's an extra memory access. But I believe it doesn't hurt
someone targeting -Os. In many cases, the compilers use push/pop to
align the stack before a 'call' instruction. If they want to avoid extra
memory access, they could have used "subq $8, %rsp" and "addq $8, %rsp".

For example: https://godbolt.org/z/Tzc1xWGEn

C code:
```
int fx(int b);
int fy(int a)
{
    return 1 + fx(a);
}
```

Targeting -Os, both clang and gcc compile it to:
```
fy:
    pushq   %rax
    call    fx
    popq    %rdx
    incl    %eax
    ret
```

Targeting -O2:
```
fy:
    subq    $8, %rsp
    call    fx
    addq    $8, %rsp
    addl    $1, %eax
    ret
```

That pushq/popq pair doesn't actually preserve anything; it's only there
to keep %rsp 16-byte aligned at the 'call'. IOW, sometimes having extra
memory access to get a smaller code size is acceptable.

-- 
Ammar Faizi
Re: [RFC PATCH v1 2/5] tools/nolibc: x86-64: Use `rep stosb` for `memset()`
Posted by Willy Tarreau 2 years, 3 months ago
On Wed, Aug 30, 2023 at 10:09:51PM +0700, Ammar Faizi wrote:
> On Wed, Aug 30, 2023 at 09:24:45PM +0700, Alviro Iskandar Setiawan wrote:
> > Just a small idea to shrink this more, "mov %rdi, %rdx" and "mov %rdx,
> > %rax" can be replaced with "push %rdi" and "pop %rax" (they are just a
> > byte). So we can save 4 bytes more.
> > 
> > 0000000000001500 <memset>:
> >     1500: 48 89 f0     mov    %rsi,%rax
> >     1503: 48 89 d1     mov    %rdx,%rcx
> >     1506: 57           push   %rdi
> >     1507: f3 aa        rep stos %al,%es:(%rdi)
> >     1509: 58           pop    %rax
> >     150a: c3           ret
> > 
> > But I know you don't like it because it costs extra memory access.
> 
> Yes, that's an extra memory access. But I believe it doesn't hurt
> someone targeting -Os. In many cases, the compilers use push/pop to
> align the stack before a 'call' instruction. If they want to avoid extra
> memory access, they could have used "subq $8, %rsp" and "addq $8, %rsp".

Then "xchg %esi, %eax" is just one byte with no memory access ;-)

Willy
Re: [RFC PATCH v1 2/5] tools/nolibc: x86-64: Use `rep stosb` for `memset()`
Posted by Ammar Faizi 2 years, 3 months ago
On Wed, Aug 30, 2023 at 05:23:22PM +0200, Willy Tarreau wrote:
> Then "xchg %esi, %eax" is just one byte with no memory access ;-)

Perfect!

Now I got this, shorter than "movl %esi, %eax":
```
0000000000001500 <memset>:
    1500: 96          xchg   %eax,%esi
    1501: 48 89 d1    mov    %rdx,%rcx
    1504: 57          push   %rdi
    1505: f3 aa       rep stos %al,%es:(%rdi)
    1507: 58          pop    %rax
    1508: c3          ret
```

Unfortunately, the xchg trick doesn't yield smaller machine code for
%rdx, %rcx. Lol.

-- 
Ammar Faizi
Re: [RFC PATCH v1 2/5] tools/nolibc: x86-64: Use `rep stosb` for `memset()`
Posted by Willy Tarreau 2 years, 3 months ago
On Wed, Aug 30, 2023 at 10:44:53PM +0700, Ammar Faizi wrote:
> On Wed, Aug 30, 2023 at 05:23:22PM +0200, Willy Tarreau wrote:
> > Then "xchg %esi, %eax" is just one byte with no memory access ;-)
> 
> Perfect!
> 
> Now I got this, shorter than "movl %esi, %eax":
> ```
> 0000000000001500 <memset>:
>     1500: 96          xchg   %eax,%esi
>     1501: 48 89 d1    mov    %rdx,%rcx
>     1504: 57          push   %rdi
>     1505: f3 aa       rep stos %al,%es:(%rdi)
>     1507: 58          pop    %rax
>     1508: c3          ret
> ```
> 
> Unfortunately, the xchg trick doesn't yield smaller machine code for
> %rdx, %rcx. Lol.

That's expected: historically "xchg ax, regX" was a single-byte 0x9X
on 8086, then it was extended to 32-bit keeping the same encoding, like many
instructions (note that NOP is encoded as xchg ax,ax). It remains short
when you can sacrifice the other register, or restore it later using yet
another xchg. For rcx/rdx a push/pop could do it as they should also be
a single-byte 0x5X even in long mode unless I'm mistaken. Thus if you
absolutely want to squeeze that 9th byte to end up with an 8-byte function
you could probably do:

    xchg %eax, %esi      1
    push %rdx            1
    pop %rcx             1
    push %rdi            1
    rep stosb            2
    pop %rax             1
    ret                  1
    ------------- Total: 8 bytes :-)

Willy
Re: [RFC PATCH v1 2/5] tools/nolibc: x86-64: Use `rep stosb` for `memset()`
Posted by Ammar Faizi 2 years, 3 months ago
On Wed, Aug 30, 2023 at 05:51:52PM +0200, Willy Tarreau wrote:
> Normal, that's because historically "xchg ax, regX" was a single-byte 0x9X
> on 8086, then it turned to 32-bit keeping the same encoding, like many
> instructions (note that NOP is encoded as xchg ax,ax). It remains short
> when you can sacrifice the other register, or restore it later using yet
> another xchg. For rcx/rdx a push/pop could do it as they should also be
> a single-byte 0x5X even in long mode unless I'm mistaken. Thus if you
> absolutely want to squeeze that 9th byte to end up with a 8-byte function
> you could probably do:
> 
>     xchg %eax, %esi      1
>     push %rdx            1
>     pop %rcx             1
>     push %rdi            1
>     rep movsb            2 [sic]
>     pop %rax             1
>     ret                  1
>     ------------- Total: 8 bytes :-)

Fun!

We're not doing a code golf game, though. So, I think I will leave the
"mov %rdx, %rcx" as is. Otherwise, I would be tempted to do that all
over the place.

-- 
Ammar Faizi
Re: [RFC PATCH v1 2/5] tools/nolibc: x86-64: Use `rep stosb` for `memset()`
Posted by Alviro Iskandar Setiawan 2 years, 3 months ago
On Wed, Aug 30, 2023 at 11:08 PM Ammar Faizi wrote:
> On Wed, Aug 30, 2023 at 05:51:52PM +0200, Willy Tarreau wrote:
> >     xchg %eax, %esi      1
> >     push %rdx            1
> >     pop %rcx             1
> >     push %rdi            1
> >     rep movsb            2 [sic]
> >     pop %rax             1
> >     ret                  1
> >     ------------- Total: 8 bytes :-)

That's beautiful!

-- Viro
Re: [RFC PATCH v1 2/5] tools/nolibc: x86-64: Use `rep stosb` for `memset()`
Posted by Alviro Iskandar Setiawan 2 years, 3 months ago
On Wed, Aug 30, 2023 at 8:57 PM Ammar Faizi wrote:
> +".section .text.nolibc_memset\n"
> +".weak memset\n"
> +"memset:\n"
> +       "movq %rsi, %rax\n"
> +       "movq %rdx, %rcx\n"
> +       "movq %rdi, %rdx\n"
> +       "rep stosb\n"
> +       "movq %rdx, %rax\n"
> +       "retq\n"

The first instruction could be:

   movl %esi, %eax

That's smaller. Also, the second argument of memset() is an int
anyway, so there is no need to have a full 64-bit copy of %rsi in
%rax.

-- Viro
Re: [RFC PATCH v1 2/5] tools/nolibc: x86-64: Use `rep stosb` for `memset()`
Posted by Ammar Faizi 2 years, 3 months ago
On Wed, Aug 30, 2023 at 09:08:05PM +0700, Alviro Iskandar Setiawan wrote:
> The first instruction could be:
> 
>    movl %esi, %eax
> 
> That's smaller. Also, the second argument of memset() is an int
> anyway, so there is no need to have a full 64-bit copy of %rsi in
> %rax.

Agree, noted.

-- 
Ammar Faizi