[PATCH v3] target/s390x: Leverage 128-bit vector type in V{L, ST}L opcodes

Philippe Mathieu-Daudé posted 1 patch 1 month ago
Failed in applying to current master (apply log)
target/s390x/tcg/vec.h        | 12 ++++++++++++
target/s390x/tcg/vec_helper.c | 17 ++++++-----------
2 files changed, 18 insertions(+), 11 deletions(-)
[PATCH v3] target/s390x: Leverage 128-bit vector type in V{L, ST}L opcodes
Posted by Philippe Mathieu-Daudé 1 month ago
The access is unaligned (MO_UNALN), we can easily replace the 2
consecutive 64-bit calls by a single 128-bit one.
Considering atomicity of the new 128-bit access, the both opcodes
VLL (Vector Load with Length) and VSTL (Vector STore with Length)
access byte granularity, so use MO_ATOM_NONE.

Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org>
---
Based-on: <20260423135035.50126-1-philmd@linaro.org>

v3:
- do not increase addr (Ilya)
- MO_ATOM_NONE (Richard)
---
 target/s390x/tcg/vec.h        | 12 ++++++++++++
 target/s390x/tcg/vec_helper.c | 17 ++++++-----------
 2 files changed, 18 insertions(+), 11 deletions(-)

diff --git a/target/s390x/tcg/vec.h b/target/s390x/tcg/vec.h
index 8d095efcfc6..34413f7e029 100644
--- a/target/s390x/tcg/vec.h
+++ b/target/s390x/tcg/vec.h
@@ -13,8 +13,10 @@
 #define S390X_VEC_H
 
 #include "tcg/tcg.h"
+#include "qemu/int128.h"
 
 typedef union S390Vector {
+    Int128 quadword;
     uint64_t doubleword[2];
     uint32_t word[4];
     uint16_t halfword[8];
@@ -72,6 +74,11 @@ static inline uint64_t s390_vec_read_element64(const S390Vector *v, uint8_t enr)
     return v->doubleword[enr];
 }
 
+static inline Int128 s390_vec_read_element128(const S390Vector *v)
+{
+    return v->quadword;
+}
+
 static inline uint64_t s390_vec_read_element(const S390Vector *v, uint8_t enr,
                                              uint8_t es)
 {
@@ -117,6 +124,11 @@ static inline void s390_vec_write_element64(S390Vector *v, uint8_t enr,
     v->doubleword[enr] = data;
 }
 
+static inline void s390_vec_write_element128(S390Vector *v, Int128 data)
+{
+    v->quadword = data;
+}
+
 static inline void s390_vec_write_element(S390Vector *v, uint8_t enr,
                                           uint8_t es, uint64_t data)
 {
diff --git a/target/s390x/tcg/vec_helper.c b/target/s390x/tcg/vec_helper.c
index 98eecd9fde6..a86819f61b6 100644
--- a/target/s390x/tcg/vec_helper.c
+++ b/target/s390x/tcg/vec_helper.c
@@ -48,14 +48,11 @@ void HELPER(vll)(CPUS390XState *env, void *v1, uint64_t addr, uint64_t bytes)
     MemOpIdx oi;
 
     if (likely(bytes >= 16)) {
-        uint64_t t0, t1;
+        Int128 t;
 
-        oi = make_memop_idx(MO_BE | MO_64 | MO_UNALN, mmu_idx);
-        t0 = cpu_ldq_mmu(env, addr, oi, ra);
-        addr = wrap_address(env, addr + 8);
-        t1 = cpu_ldq_mmu(env, addr, oi, ra);
-        s390_vec_write_element64(v1, 0, t0);
-        s390_vec_write_element64(v1, 1, t1);
+        oi = make_memop_idx(MO_128 | MO_UNALN | MO_ATOM_NONE | MO_BE, mmu_idx);
+        t = cpu_ld16_mmu(env, addr, oi, ra);
+        s390_vec_write_element128(v1, t);
     } else {
         S390Vector tmp = {};
 
@@ -206,10 +203,8 @@ void HELPER(vstl)(CPUS390XState *env, const void *v1, uint64_t addr,
     probe_write_access(env, addr, MIN(bytes, 16), GETPC());
 
     if (likely(bytes >= 16)) {
-        oi = make_memop_idx(MO_BE | MO_64 | MO_UNALN, mmu_idx);
-        cpu_stq_mmu(env, addr, s390_vec_read_element64(v1, 0), oi, ra);
-        addr = wrap_address(env, addr + 8);
-        cpu_stq_mmu(env, addr, s390_vec_read_element64(v1, 1), oi, ra);
+        oi = make_memop_idx(MO_128 | MO_UNALN | MO_ATOM_NONE | MO_BE, mmu_idx);
+        cpu_st16_mmu(env, addr, s390_vec_read_element128(v1), oi, ra);
     } else {
         oi = make_memop_idx(MO_8, mmu_idx);
 
-- 
2.53.0


Re: [PATCH v3] target/s390x: Leverage 128-bit vector type in V{L, ST}L opcodes
Posted by Ilya Leoshkevich 1 month ago
On 4/24/26 11:43, Philippe Mathieu-Daudé wrote:
> The access is unaligned (MO_UNALN), we can easily replace the 2
> consecutive 64-bit calls by a single 128-bit one.
> Considering atomicity of the new 128-bit access, the both opcodes
> VLL (Vector Load with Length) and VSTL (Vector STore with Length)
> access byte granularity, so use MO_ATOM_NONE.
>
> Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org>
> ---
> Based-on: <20260423135035.50126-1-philmd@linaro.org>
>
> v3:
> - do not increase addr (Ilya)
> - MO_ATOM_NONE (Richard)
> ---
>   target/s390x/tcg/vec.h        | 12 ++++++++++++
>   target/s390x/tcg/vec_helper.c | 17 ++++++-----------
>   2 files changed, 18 insertions(+), 11 deletions(-)
>
> diff --git a/target/s390x/tcg/vec.h b/target/s390x/tcg/vec.h
> index 8d095efcfc6..34413f7e029 100644
> --- a/target/s390x/tcg/vec.h
> +++ b/target/s390x/tcg/vec.h
> @@ -13,8 +13,10 @@
>   #define S390X_VEC_H
>   
>   #include "tcg/tcg.h"
> +#include "qemu/int128.h"
>   
>   typedef union S390Vector {
> +    Int128 quadword;
>       uint64_t doubleword[2];
>       uint32_t word[4];
>       uint16_t halfword[8];
> @@ -72,6 +74,11 @@ static inline uint64_t s390_vec_read_element64(const S390Vector *v, uint8_t enr)
>       return v->doubleword[enr];
>   }
>   
> +static inline Int128 s390_vec_read_element128(const S390Vector *v)
> +{
> +    return v->quadword;
> +}
> +
>   static inline uint64_t s390_vec_read_element(const S390Vector *v, uint8_t enr,
>                                                uint8_t es)
>   {
> @@ -117,6 +124,11 @@ static inline void s390_vec_write_element64(S390Vector *v, uint8_t enr,
>       v->doubleword[enr] = data;
>   }
>   
> +static inline void s390_vec_write_element128(S390Vector *v, Int128 data)
> +{
> +    v->quadword = data;
> +}
> +
>   static inline void s390_vec_write_element(S390Vector *v, uint8_t enr,
>                                             uint8_t es, uint64_t data)
>   {
> diff --git a/target/s390x/tcg/vec_helper.c b/target/s390x/tcg/vec_helper.c
> index 98eecd9fde6..a86819f61b6 100644
> --- a/target/s390x/tcg/vec_helper.c
> +++ b/target/s390x/tcg/vec_helper.c
> @@ -48,14 +48,11 @@ void HELPER(vll)(CPUS390XState *env, void *v1, uint64_t addr, uint64_t bytes)
>       MemOpIdx oi;
>   
>       if (likely(bytes >= 16)) {
> -        uint64_t t0, t1;
> +        Int128 t;
>   
> -        oi = make_memop_idx(MO_BE | MO_64 | MO_UNALN, mmu_idx);
> -        t0 = cpu_ldq_mmu(env, addr, oi, ra);
> -        addr = wrap_address(env, addr + 8);
> -        t1 = cpu_ldq_mmu(env, addr, oi, ra);
> -        s390_vec_write_element64(v1, 0, t0);
> -        s390_vec_write_element64(v1, 1, t1);
> +        oi = make_memop_idx(MO_128 | MO_UNALN | MO_ATOM_NONE | MO_BE, mmu_idx);
> +        t = cpu_ld16_mmu(env, addr, oi, ra);
> +        s390_vec_write_element128(v1, t);
>       } else {
>           S390Vector tmp = {};
>   
> @@ -206,10 +203,8 @@ void HELPER(vstl)(CPUS390XState *env, const void *v1, uint64_t addr,
>       probe_write_access(env, addr, MIN(bytes, 16), GETPC());
>   
>       if (likely(bytes >= 16)) {
> -        oi = make_memop_idx(MO_BE | MO_64 | MO_UNALN, mmu_idx);
> -        cpu_stq_mmu(env, addr, s390_vec_read_element64(v1, 0), oi, ra);
> -        addr = wrap_address(env, addr + 8);
> -        cpu_stq_mmu(env, addr, s390_vec_read_element64(v1, 1), oi, ra);
> +        oi = make_memop_idx(MO_128 | MO_UNALN | MO_ATOM_NONE | MO_BE, mmu_idx);
> +        cpu_st16_mmu(env, addr, s390_vec_read_element128(v1), oi, ra);
>       } else {
>           oi = make_memop_idx(MO_8, mmu_idx);
>   


In general LGTM, but I just tried this on top of 
20260423135035.50126-1-philmd@linaro.org and I'm consistently getting:


make[2]: *** [Makefile:236: run-precise-smc-softmmu] Error 1


The test uses VSTL, so it's definitely related.



Re: [PATCH v3] target/s390x: Leverage 128-bit vector type in V{L, ST}L opcodes
Posted by Philippe Mathieu-Daudé 1 month ago
On 25/4/26 00:05, Ilya Leoshkevich wrote:
> 
> On 4/24/26 11:43, Philippe Mathieu-Daudé wrote:
>> The access is unaligned (MO_UNALN), we can easily replace the 2
>> consecutive 64-bit calls by a single 128-bit one.
>> Considering atomicity of the new 128-bit access, the both opcodes
>> VLL (Vector Load with Length) and VSTL (Vector STore with Length)
>> access byte granularity, so use MO_ATOM_NONE.
>>
>> Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org>
>> ---
>> Based-on: <20260423135035.50126-1-philmd@linaro.org>
>>
>> v3:
>> - do not increase addr (Ilya)
>> - MO_ATOM_NONE (Richard)
>> ---
>>   target/s390x/tcg/vec.h        | 12 ++++++++++++
>>   target/s390x/tcg/vec_helper.c | 17 ++++++-----------
>>   2 files changed, 18 insertions(+), 11 deletions(-)
>>
>> diff --git a/target/s390x/tcg/vec.h b/target/s390x/tcg/vec.h
>> index 8d095efcfc6..34413f7e029 100644
>> --- a/target/s390x/tcg/vec.h
>> +++ b/target/s390x/tcg/vec.h
>> @@ -13,8 +13,10 @@
>>   #define S390X_VEC_H
>>   #include "tcg/tcg.h"
>> +#include "qemu/int128.h"
>>   typedef union S390Vector {
>> +    Int128 quadword;
>>       uint64_t doubleword[2];
>>       uint32_t word[4];
>>       uint16_t halfword[8];
>> @@ -72,6 +74,11 @@ static inline uint64_t 
>> s390_vec_read_element64(const S390Vector *v, uint8_t enr)
>>       return v->doubleword[enr];
>>   }
>> +static inline Int128 s390_vec_read_element128(const S390Vector *v)
>> +{
>> +    return v->quadword;
>> +}
>> +
>>   static inline uint64_t s390_vec_read_element(const S390Vector *v, 
>> uint8_t enr,
>>                                                uint8_t es)
>>   {
>> @@ -117,6 +124,11 @@ static inline void 
>> s390_vec_write_element64(S390Vector *v, uint8_t enr,
>>       v->doubleword[enr] = data;
>>   }
>> +static inline void s390_vec_write_element128(S390Vector *v, Int128 data)
>> +{
>> +    v->quadword = data;
>> +}
>> +
>>   static inline void s390_vec_write_element(S390Vector *v, uint8_t enr,
>>                                             uint8_t es, uint64_t data)
>>   {
>> diff --git a/target/s390x/tcg/vec_helper.c b/target/s390x/tcg/ 
>> vec_helper.c
>> index 98eecd9fde6..a86819f61b6 100644
>> --- a/target/s390x/tcg/vec_helper.c
>> +++ b/target/s390x/tcg/vec_helper.c
>> @@ -48,14 +48,11 @@ void HELPER(vll)(CPUS390XState *env, void *v1, 
>> uint64_t addr, uint64_t bytes)
>>       MemOpIdx oi;
>>       if (likely(bytes >= 16)) {
>> -        uint64_t t0, t1;
>> +        Int128 t;
>> -        oi = make_memop_idx(MO_BE | MO_64 | MO_UNALN, mmu_idx);
>> -        t0 = cpu_ldq_mmu(env, addr, oi, ra);
>> -        addr = wrap_address(env, addr + 8);
>> -        t1 = cpu_ldq_mmu(env, addr, oi, ra);
>> -        s390_vec_write_element64(v1, 0, t0);
>> -        s390_vec_write_element64(v1, 1, t1);
>> +        oi = make_memop_idx(MO_128 | MO_UNALN | MO_ATOM_NONE | MO_BE, 
>> mmu_idx);
>> +        t = cpu_ld16_mmu(env, addr, oi, ra);
>> +        s390_vec_write_element128(v1, t);
>>       } else {
>>           S390Vector tmp = {};
>> @@ -206,10 +203,8 @@ void HELPER(vstl)(CPUS390XState *env, const void 
>> *v1, uint64_t addr,
>>       probe_write_access(env, addr, MIN(bytes, 16), GETPC());
>>       if (likely(bytes >= 16)) {
>> -        oi = make_memop_idx(MO_BE | MO_64 | MO_UNALN, mmu_idx);
>> -        cpu_stq_mmu(env, addr, s390_vec_read_element64(v1, 0), oi, ra);
>> -        addr = wrap_address(env, addr + 8);
>> -        cpu_stq_mmu(env, addr, s390_vec_read_element64(v1, 1), oi, ra);
>> +        oi = make_memop_idx(MO_128 | MO_UNALN | MO_ATOM_NONE | MO_BE, 
>> mmu_idx);
>> +        cpu_st16_mmu(env, addr, s390_vec_read_element128(v1), oi, ra);
>>       } else {
>>           oi = make_memop_idx(MO_8, mmu_idx);
> 
> 
> In general LGTM, but I just tried this on top of 20260423135035.50126-1- 
> philmd@linaro.org and I'm consistently getting:
> 
> 
> make[2]: *** [Makefile:236: run-precise-smc-softmmu] Error 1
> 
> 
> The test uses VSTL, so it's definitely related.

Is this on a big-endian host? (Asking because this is the single
configuration I didn't tested)

Re: [PATCH v3] target/s390x: Leverage 128-bit vector type in V{L, ST}L opcodes
Posted by Ilya Leoshkevich 1 month ago
On 4/25/26 11:09, Philippe Mathieu-Daudé wrote:
> On 25/4/26 00:05, Ilya Leoshkevich wrote:
>>
>> On 4/24/26 11:43, Philippe Mathieu-Daudé wrote:
>>> The access is unaligned (MO_UNALN), we can easily replace the 2
>>> consecutive 64-bit calls by a single 128-bit one.
>>> Considering atomicity of the new 128-bit access, the both opcodes
>>> VLL (Vector Load with Length) and VSTL (Vector STore with Length)
>>> access byte granularity, so use MO_ATOM_NONE.
>>>
>>> Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org>
>>> ---
>>> Based-on: <20260423135035.50126-1-philmd@linaro.org>
>>>
>>> v3:
>>> - do not increase addr (Ilya)
>>> - MO_ATOM_NONE (Richard)
>>> ---
>>>   target/s390x/tcg/vec.h        | 12 ++++++++++++
>>>   target/s390x/tcg/vec_helper.c | 17 ++++++-----------
>>>   2 files changed, 18 insertions(+), 11 deletions(-)
>>>
>>> diff --git a/target/s390x/tcg/vec.h b/target/s390x/tcg/vec.h
>>> index 8d095efcfc6..34413f7e029 100644
>>> --- a/target/s390x/tcg/vec.h
>>> +++ b/target/s390x/tcg/vec.h
>>> @@ -13,8 +13,10 @@
>>>   #define S390X_VEC_H
>>>   #include "tcg/tcg.h"
>>> +#include "qemu/int128.h"
>>>   typedef union S390Vector {
>>> +    Int128 quadword;
>>>       uint64_t doubleword[2];
>>>       uint32_t word[4];
>>>       uint16_t halfword[8];
>>> @@ -72,6 +74,11 @@ static inline uint64_t 
>>> s390_vec_read_element64(const S390Vector *v, uint8_t enr)
>>>       return v->doubleword[enr];
>>>   }
>>> +static inline Int128 s390_vec_read_element128(const S390Vector *v)
>>> +{
>>> +    return v->quadword;
>>> +}
>>> +
>>>   static inline uint64_t s390_vec_read_element(const S390Vector *v, 
>>> uint8_t enr,
>>>                                                uint8_t es)
>>>   {
>>> @@ -117,6 +124,11 @@ static inline void 
>>> s390_vec_write_element64(S390Vector *v, uint8_t enr,
>>>       v->doubleword[enr] = data;
>>>   }
>>> +static inline void s390_vec_write_element128(S390Vector *v, Int128 
>>> data)
>>> +{
>>> +    v->quadword = data;
>>> +}
>>> +
>>>   static inline void s390_vec_write_element(S390Vector *v, uint8_t enr,
>>>                                             uint8_t es, uint64_t data)
>>>   {
>>> diff --git a/target/s390x/tcg/vec_helper.c b/target/s390x/tcg/ 
>>> vec_helper.c
>>> index 98eecd9fde6..a86819f61b6 100644
>>> --- a/target/s390x/tcg/vec_helper.c
>>> +++ b/target/s390x/tcg/vec_helper.c
>>> @@ -48,14 +48,11 @@ void HELPER(vll)(CPUS390XState *env, void *v1, 
>>> uint64_t addr, uint64_t bytes)
>>>       MemOpIdx oi;
>>>       if (likely(bytes >= 16)) {
>>> -        uint64_t t0, t1;
>>> +        Int128 t;
>>> -        oi = make_memop_idx(MO_BE | MO_64 | MO_UNALN, mmu_idx);
>>> -        t0 = cpu_ldq_mmu(env, addr, oi, ra);
>>> -        addr = wrap_address(env, addr + 8);
>>> -        t1 = cpu_ldq_mmu(env, addr, oi, ra);
>>> -        s390_vec_write_element64(v1, 0, t0);
>>> -        s390_vec_write_element64(v1, 1, t1);
>>> +        oi = make_memop_idx(MO_128 | MO_UNALN | MO_ATOM_NONE | 
>>> MO_BE, mmu_idx);
>>> +        t = cpu_ld16_mmu(env, addr, oi, ra);
>>> +        s390_vec_write_element128(v1, t);
>>>       } else {
>>>           S390Vector tmp = {};
>>> @@ -206,10 +203,8 @@ void HELPER(vstl)(CPUS390XState *env, const 
>>> void *v1, uint64_t addr,
>>>       probe_write_access(env, addr, MIN(bytes, 16), GETPC());
>>>       if (likely(bytes >= 16)) {
>>> -        oi = make_memop_idx(MO_BE | MO_64 | MO_UNALN, mmu_idx);
>>> -        cpu_stq_mmu(env, addr, s390_vec_read_element64(v1, 0), oi, 
>>> ra);
>>> -        addr = wrap_address(env, addr + 8);
>>> -        cpu_stq_mmu(env, addr, s390_vec_read_element64(v1, 1), oi, 
>>> ra);
>>> +        oi = make_memop_idx(MO_128 | MO_UNALN | MO_ATOM_NONE | 
>>> MO_BE, mmu_idx);
>>> +        cpu_st16_mmu(env, addr, s390_vec_read_element128(v1), oi, ra);
>>>       } else {
>>>           oi = make_memop_idx(MO_8, mmu_idx);
>>
>>
>> In general LGTM, but I just tried this on top of 
>> 20260423135035.50126-1- philmd@linaro.org and I'm consistently getting:
>>
>>
>> make[2]: *** [Makefile:236: run-precise-smc-softmmu] Error 1
>>
>>
>> The test uses VSTL, so it's definitely related.
>
> Is this on a big-endian host? (Asking because this is the single
> configuration I didn't tested)


No, that's my x86_64 laptop.

Configure flags: '--target-list=s390x-linux-user,s390x-softmmu' 
'--disable-docs'



$ git log -1
commit 2fc3bdf329c54489438408161ebc907a6fa62329 (HEAD)
Author: Philippe Mathieu-Daudé <philmd@linaro.org>
Date:   Fri Apr 24 12:43:25 2026 +0200

     target/s390x: Leverage 128-bit vector type in V{L, ST}L opcodes


$ uname -a && make -j24 && make -j24 check-tcg ; echo $?
Linux heavy 6.19.10-100.fc42.x86_64 #1 SMP PREEMPT_DYNAMIC Wed Mar 25 
17:18:37 UTC 2026 x86_64 GNU/Linux

[...]

make[2]: *** [Makefile:236: run-precise-smc-softmmu] Error 1


$ git revert HEAD

$ uname -a && make -j24 && make -j24 check-tcg ; echo $?
[...]

0


Re: [PATCH v3] target/s390x: Leverage 128-bit vector type in V{L, ST}L opcodes
Posted by Richard Henderson 1 month ago
On 4/24/26 20:43, Philippe Mathieu-Daudé wrote:
> The access is unaligned (MO_UNALN), we can easily replace the 2
> consecutive 64-bit calls by a single 128-bit one.
> Considering atomicity of the new 128-bit access, the both opcodes
> VLL (Vector Load with Length) and VSTL (Vector STore with Length)
> access byte granularity, so use MO_ATOM_NONE.
> 
> Signed-off-by: Philippe Mathieu-Daudé<philmd@linaro.org>
> ---
> Based-on:<20260423135035.50126-1-philmd@linaro.org>
> 
> v3:
> - do not increase addr (Ilya)
> - MO_ATOM_NONE (Richard)
> ---
>   target/s390x/tcg/vec.h        | 12 ++++++++++++
>   target/s390x/tcg/vec_helper.c | 17 ++++++-----------
>   2 files changed, 18 insertions(+), 11 deletions(-)

Reviewed-by: Richard Henderson <richard.henderson@linaro.org>

r~