[PATCH v5 2/6] x86: re-work memset()

Posted by Jan Beulich 4 months, 3 weeks ago
Move the function to its own assembly file. Having it in C just for the
entire body to be an asm() isn't really helpful. Then have two flavors:
A "basic" version using qword steps for the bulk of the operation, and an
ERMS version for modern hardware, to be substituted in via alternatives
patching.

For RET to be usable in an alternative's replacement code, extend the
CALL/JMP patching to cover the case of "JMP __x86_return_thunk" coming
last in replacement code.
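
To illustrate the fixup (a minimal C sketch of the arithmetic the
alternative.c hunk below performs; the function name is made up and the
parameter names merely mirror that hunk - this is not the actual code): a
near JMP is 0xe9 followed by a rel32 taken relative to the end of the
instruction, so a displacement computed for the replacement site has to be
re-biased by the distance the bytes move when copied to the original site.

    #include <stdint.h>

    /* Hypothetical sketch, not the Xen implementation. */
    static void fixup_tail_jmp(uint8_t *buf, unsigned int repl_len,
                               const uint8_t *repl, const uint8_t *orig,
                               const void *return_thunk)
    {
        int32_t *disp = (int32_t *)(buf + repl_len - 4); /* trailing rel32 */

        /* Target as resolved at the replacement site: end of insn + rel32. */
        if ( (long)repl + repl_len + *disp == (long)return_thunk )
            *disp += repl - orig; /* now resolves at the original site too */
    }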

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
We may want to consider branching over the REP STOSQ as well, if the
number of qwords turns out to be zero.
We may also want to consider using non-REP STOS{L,W,B} for the tail.
---
v5: Re-base.
v4: Use %r8 instead of %rsi in a few places.
v3: Re-base.

--- a/xen/arch/x86/Makefile
+++ b/xen/arch/x86/Makefile
@@ -47,6 +47,7 @@ obj-$(CONFIG_RETURN_THUNK) += indirect-t
 obj-$(CONFIG_PV) += ioport_emulate.o
 obj-y += irq.o
 obj-$(CONFIG_KEXEC) += machine_kexec.o
+obj-y += memset.o
 obj-y += mm.o x86_64/mm.o
 obj-$(CONFIG_VM_EVENT) += monitor.o
 obj-y += mpparse.o
--- a/xen/arch/x86/alternative.c
+++ b/xen/arch/x86/alternative.c
@@ -346,6 +346,12 @@ static int init_or_livepatch _apply_alte
         /* 0xe8/0xe9 are relative branches; fix the offset. */
         if ( a->repl_len >= 5 && (*buf & 0xfe) == 0xe8 )
             *(int32_t *)(buf + 1) += repl - orig;
+        else if ( IS_ENABLED(CONFIG_RETURN_THUNK) &&
+                  a->repl_len > 5 && buf[a->repl_len - 5] == 0xe9 &&
+                  ((long)repl + a->repl_len +
+                   *(int32_t *)(buf + a->repl_len - 4) ==
+                   (long)__x86_return_thunk) )
+            *(int32_t *)(buf + a->repl_len - 4) += repl - orig;
 
         a->priv = 1;
 
--- /dev/null
+++ b/xen/arch/x86/memset.S
@@ -0,0 +1,30 @@
+#include <asm/asm_defns.h>
+
+.macro memset
+        and     $7, %edx
+        shr     $3, %rcx
+        movzbl  %sil, %esi
+        mov     $0x0101010101010101, %rax
+        imul    %rsi, %rax
+        mov     %rdi, %r8
+        rep stosq
+        or      %edx, %ecx
+        jz      0f
+        rep stosb
+0:
+        mov     %r8, %rax
+        RET
+.endm
+
+.macro memset_erms
+        mov     %esi, %eax
+        mov     %rdi, %r8
+        rep stosb
+        mov     %r8, %rax
+        RET
+.endm
+
+FUNC(memset)
+        mov     %rdx, %rcx
+        ALTERNATIVE memset, memset_erms, X86_FEATURE_ERMS
+END(memset)
--- a/xen/arch/x86/string.c
+++ b/xen/arch/x86/string.c
@@ -22,19 +22,6 @@ void *(memcpy)(void *dest, const void *s
     return dest;
 }
 
-void *(memset)(void *s, int c, size_t n)
-{
-    long d0, d1;
-
-    asm volatile (
-        "rep stosb"
-        : "=&c" (d0), "=&D" (d1)
-        : "a" (c), "1" (s), "0" (n)
-        : "memory");
-
-    return s;
-}
-
 void *(memmove)(void *dest, const void *src, size_t n)
 {
     long d0, d1, d2;
Re: [PATCH v5 2/6] x86: re-work memset()
Posted by Teddy Astie 4 months, 3 weeks ago
On 05/06/2025 at 12:26, Jan Beulich wrote:
> Move the function to its own assembly file. Having it in C just for the
> entire body to be an asm() isn't really helpful. Then have two flavors:
> A "basic" version using qword steps for the bulk of the operation, and an
> ERMS version for modern hardware, to be substituted in via alternatives
> patching.
> 
> For RET to be usable in an alternative's replacement code, extend the
> CALL/JMP patching to cover the case of "JMP __x86_return_thunk" coming
> last in replacement code.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> We may want to consider branching over the REP STOSQ as well, if the
> number of qwords turns out to be zero.
> We may also want to consider using non-REP STOS{L,W,B} for the tail.
> ---
> v5: Re-base.
> v4: Use %r8 instead of %rsi in a few places.
> v3: Re-base.
> 
> --- a/xen/arch/x86/Makefile
> +++ b/xen/arch/x86/Makefile
> @@ -47,6 +47,7 @@ obj-$(CONFIG_RETURN_THUNK) += indirect-t
>   obj-$(CONFIG_PV) += ioport_emulate.o
>   obj-y += irq.o
>   obj-$(CONFIG_KEXEC) += machine_kexec.o
> +obj-y += memset.o
>   obj-y += mm.o x86_64/mm.o
>   obj-$(CONFIG_VM_EVENT) += monitor.o
>   obj-y += mpparse.o
> --- a/xen/arch/x86/alternative.c
> +++ b/xen/arch/x86/alternative.c
> @@ -346,6 +346,12 @@ static int init_or_livepatch _apply_alte
>           /* 0xe8/0xe9 are relative branches; fix the offset. */
>           if ( a->repl_len >= 5 && (*buf & 0xfe) == 0xe8 )
>               *(int32_t *)(buf + 1) += repl - orig;
> +        else if ( IS_ENABLED(CONFIG_RETURN_THUNK) &&
> +                  a->repl_len > 5 && buf[a->repl_len - 5] == 0xe9 &&
> +                  ((long)repl + a->repl_len +
> +                   *(int32_t *)(buf + a->repl_len - 4) ==
> +                   (long)__x86_return_thunk) )
> +            *(int32_t *)(buf + a->repl_len - 4) += repl - orig;

That looks a bit confusing to me; a comment explaining what instruction
transform it's looking to make would probably help.

>   
>           a->priv = 1;
>   
> --- /dev/null
> +++ b/xen/arch/x86/memset.S
> @@ -0,0 +1,30 @@
> +#include <asm/asm_defns.h>
> +

It would be nice to have a reminder of the calling convention (i.e. which
register maps to which memset parameter), as it will definitely help
future readers.

If I understand correctly:
- rdi: destination (s)
- rsi: byte to write (c)
- rdx: number of bytes to write (n)
- and rcx is loaded from rdx (mov %rdx, %rcx) before either the memset or
  the memset_erms body runs, as sketched in C just below
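
To double-check that reading, here is a C sketch of what I believe the basic
flavor does (illustrative only; the name is hypothetical and this is not the
actual code):

    #include <stddef.h>
    #include <stdint.h>

    void *memset_sketch(void *s, int c, size_t n)
    {
        uint64_t splat = (uint8_t)c * 0x0101010101010101ULL; /* the imul */
        uint64_t *p = s;
        size_t qwords = n >> 3;                  /* shr $3, %rcx */
        size_t tail = n & 7;                     /* and $7, %edx */

        while ( qwords-- )                       /* rep stosq */
            *p++ = splat;
        for ( uint8_t *q = (void *)p; tail--; )  /* rep stosb */
            *q++ = c;

        return s;                                /* %r8 preserved %rdi */
    }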

> +.macro memset
> +        and     $7, %edx
> +        shr     $3, %rcx
> +        movzbl  %sil, %esi
> +        mov     $0x0101010101010101, %rax
> +        imul    %rsi, %rax
> +        mov     %rdi, %r8
> +        rep stosq
> +        or      %edx, %ecx
> +        jz      0f
> +        rep stosb
> +0:
> +        mov     %r8, %rax
> +        RET
> +.endm
> +
> +.macro memset_erms
> +        mov     %esi, %eax
> +        mov     %rdi, %r8
> +        rep stosb
> +        mov     %r8, %rax
> +        RET
> +.endm
> +

Overall, I am a bit confused by the mixing of 32-bit (edx, esi, ...) and
64-bit (rax, ...) registers. But it looks OK to me.

> +FUNC(memset)
> +        mov     %rdx, %rcx
> +        ALTERNATIVE memset, memset_erms, X86_FEATURE_ERMS
> +END(memset)
> --- a/xen/arch/x86/string.c
> +++ b/xen/arch/x86/string.c
> @@ -22,19 +22,6 @@ void *(memcpy)(void *dest, const void *s
>       return dest;
>   }
>   
> -void *(memset)(void *s, int c, size_t n)
> -{
> -    long d0, d1;
> -
> -    asm volatile (
> -        "rep stosb"
> -        : "=&c" (d0), "=&D" (d1)
> -        : "a" (c), "1" (s), "0" (n)
> -        : "memory");
> -
> -    return s;
> -}
> -
>   void *(memmove)(void *dest, const void *src, size_t n)
>   {
>       long d0, d1, d2;
> 
> 

Teddy


Teddy Astie | Vates XCP-ng Developer

XCP-ng & Xen Orchestra - Vates solutions

web: https://vates.tech
Re: [PATCH v5 2/6] x86: re-work memset()
Posted by Jan Beulich 4 months, 3 weeks ago
On 05.06.2025 18:59, Teddy Astie wrote:
> On 05/06/2025 at 12:26, Jan Beulich wrote:
>> --- a/xen/arch/x86/alternative.c
>> +++ b/xen/arch/x86/alternative.c
>> @@ -346,6 +346,12 @@ static int init_or_livepatch _apply_alte
>>           /* 0xe8/0xe9 are relative branches; fix the offset. */
>>           if ( a->repl_len >= 5 && (*buf & 0xfe) == 0xe8 )
>>               *(int32_t *)(buf + 1) += repl - orig;
>> +        else if ( IS_ENABLED(CONFIG_RETURN_THUNK) &&
>> +                  a->repl_len > 5 && buf[a->repl_len - 5] == 0xe9 &&
>> +                  ((long)repl + a->repl_len +
>> +                   *(int32_t *)(buf + a->repl_len - 4) ==
>> +                   (long)__x86_return_thunk) )
>> +            *(int32_t *)(buf + a->repl_len - 4) += repl - orig;
> 
> That looks a bit confusing to me; a comment explaining what instruction
> transform it's looking to make would probably help.

It's still the same comment ahead of the if() that applies here. This will all
become easier with Andrew's decode-light, at which point we'll be able to spot
CALL/JMP anywhere in a blob.

>> --- /dev/null
>> +++ b/xen/arch/x86/memset.S
>> @@ -0,0 +1,30 @@
>> +#include <asm/asm_defns.h>
>> +
> 
> It would be nice to have a reminder of the calling convention (i.e. which
> register maps to which memset parameter), as it will definitely help
> future readers.
> 
> If I understand correctly:
> - rdi: destination (s)
> - rsi: byte to write (c)
> - rdx: number of bytes to write (n)

I don't think the (default) ABI needs re-stating for every function in every
assembly file.

>> +.macro memset
>> +        and     $7, %edx
>> +        shr     $3, %rcx
>> +        movzbl  %sil, %esi
>> +        mov     $0x0101010101010101, %rax
>> +        imul    %rsi, %rax
>> +        mov     %rdi, %r8
>> +        rep stosq
>> +        or      %edx, %ecx
>> +        jz      0f
>> +        rep stosb
>> +0:
>> +        mov     %r8, %rax
>> +        RET
>> +.endm
>> +
>> +.macro memset_erms
>> +        mov     %esi, %eax
>> +        mov     %rdi, %r8
>> +        rep stosb
>> +        mov     %r8, %rax
>> +        RET
>> +.endm
>> +
> 
> Overall, I am a bit confused by the mixing of 32-bit (edx, esi, ...) and
> 64-bit (rax, ...) registers. But it looks OK to me.

Since 64-bit forms require REX, 32-bit ones are used wherever that's sufficient
for the purpose.
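
For example (standard x86-64 encodings, shown purely for illustration):

    and     $7, %edx        # 83 e2 07       (3 bytes)
    and     $7, %rdx        # 48 83 e2 07    (REX.W prefix, 4 bytes)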

Jan