[PATCH v2] target/s390x: Leverage 128-bit vector type in VLL / VSTL opcodes

Philippe Mathieu-Daudé posted 1 patch 1 month, 1 week ago
Failed in applying to current master (apply log)
target/s390x/tcg/vec.h        | 12 ++++++++++++
target/s390x/tcg/vec_helper.c | 19 ++++++++-----------
2 files changed, 20 insertions(+), 11 deletions(-)
[PATCH v2] target/s390x: Leverage 128-bit vector type in VLL / VSTL opcodes
Posted by Philippe Mathieu-Daudé 1 month, 1 week ago
The access is unaligned, we can easily replace the 2
consecutive 64-bit calls by a single 128-bit one.

Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org>
---
Based-on: <20260423135035.50126-1-philmd@linaro.org>
---
 target/s390x/tcg/vec.h        | 12 ++++++++++++
 target/s390x/tcg/vec_helper.c | 19 ++++++++-----------
 2 files changed, 20 insertions(+), 11 deletions(-)

diff --git a/target/s390x/tcg/vec.h b/target/s390x/tcg/vec.h
index 8d095efcfc6..34413f7e029 100644
--- a/target/s390x/tcg/vec.h
+++ b/target/s390x/tcg/vec.h
@@ -13,8 +13,10 @@
 #define S390X_VEC_H
 
 #include "tcg/tcg.h"
+#include "qemu/int128.h"
 
 typedef union S390Vector {
+    Int128 quadword;
     uint64_t doubleword[2];
     uint32_t word[4];
     uint16_t halfword[8];
@@ -72,6 +74,11 @@ static inline uint64_t s390_vec_read_element64(const S390Vector *v, uint8_t enr)
     return v->doubleword[enr];
 }
 
+static inline Int128 s390_vec_read_element128(const S390Vector *v)
+{
+    return v->quadword;
+}
+
 static inline uint64_t s390_vec_read_element(const S390Vector *v, uint8_t enr,
                                              uint8_t es)
 {
@@ -117,6 +124,11 @@ static inline void s390_vec_write_element64(S390Vector *v, uint8_t enr,
     v->doubleword[enr] = data;
 }
 
+static inline void s390_vec_write_element128(S390Vector *v, Int128 data)
+{
+    v->quadword = data;
+}
+
 static inline void s390_vec_write_element(S390Vector *v, uint8_t enr,
                                           uint8_t es, uint64_t data)
 {
diff --git a/target/s390x/tcg/vec_helper.c b/target/s390x/tcg/vec_helper.c
index 98eecd9fde6..feb99f51c78 100644
--- a/target/s390x/tcg/vec_helper.c
+++ b/target/s390x/tcg/vec_helper.c
@@ -48,14 +48,12 @@ void HELPER(vll)(CPUS390XState *env, void *v1, uint64_t addr, uint64_t bytes)
     MemOpIdx oi;
 
     if (likely(bytes >= 16)) {
-        uint64_t t0, t1;
+        Int128 t;
 
-        oi = make_memop_idx(MO_BE | MO_64 | MO_UNALN, mmu_idx);
-        t0 = cpu_ldq_mmu(env, addr, oi, ra);
-        addr = wrap_address(env, addr + 8);
-        t1 = cpu_ldq_mmu(env, addr, oi, ra);
-        s390_vec_write_element64(v1, 0, t0);
-        s390_vec_write_element64(v1, 1, t1);
+        oi = make_memop_idx(MO_BE | MO_128 | MO_UNALN, mmu_idx);
+        addr = wrap_address(env, addr + 16);
+        t = cpu_ld16_mmu(env, addr, oi, ra);
+        s390_vec_write_element128(v1, t);
     } else {
         S390Vector tmp = {};
 
@@ -206,10 +204,9 @@ void HELPER(vstl)(CPUS390XState *env, const void *v1, uint64_t addr,
     probe_write_access(env, addr, MIN(bytes, 16), GETPC());
 
     if (likely(bytes >= 16)) {
-        oi = make_memop_idx(MO_BE | MO_64 | MO_UNALN, mmu_idx);
-        cpu_stq_mmu(env, addr, s390_vec_read_element64(v1, 0), oi, ra);
-        addr = wrap_address(env, addr + 8);
-        cpu_stq_mmu(env, addr, s390_vec_read_element64(v1, 1), oi, ra);
+        oi = make_memop_idx(MO_BE | MO_128 | MO_UNALN, mmu_idx);
+        addr = wrap_address(env, addr + 16);
+        cpu_st16_mmu(env, addr, s390_vec_read_element128(v1), oi, ra);
     } else {
         oi = make_memop_idx(MO_8, mmu_idx);
 
-- 
2.53.0


Re: [PATCH v2] target/s390x: Leverage 128-bit vector type in VLL / VSTL opcodes
Posted by Richard Henderson 1 month ago
On 4/24/26 00:15, Philippe Mathieu-Daudé wrote:
> --- a/target/s390x/tcg/vec_helper.c
> +++ b/target/s390x/tcg/vec_helper.c
> @@ -48,14 +48,12 @@ void HELPER(vll)(CPUS390XState *env, void *v1, uint64_t addr, uint64_t bytes)
>       MemOpIdx oi;
>   
>       if (likely(bytes >= 16)) {
> -        uint64_t t0, t1;
> +        Int128 t;
>   
> -        oi = make_memop_idx(MO_BE | MO_64 | MO_UNALN, mmu_idx);
> -        t0 = cpu_ldq_mmu(env, addr, oi, ra);
> -        addr = wrap_address(env, addr + 8);
> -        t1 = cpu_ldq_mmu(env, addr, oi, ra);
> -        s390_vec_write_element64(v1, 0, t0);
> -        s390_vec_write_element64(v1, 1, t1);
> +        oi = make_memop_idx(MO_BE | MO_128 | MO_UNALN, mmu_idx);
> +        addr = wrap_address(env, addr + 16);
> +        t = cpu_ld16_mmu(env, addr, oi, ra);
> +        s390_vec_write_element128(v1, t);
>       } else {
>           S390Vector tmp = {};
>   
> @@ -206,10 +204,9 @@ void HELPER(vstl)(CPUS390XState *env, const void *v1, uint64_t addr,
>       probe_write_access(env, addr, MIN(bytes, 16), GETPC());
>   
>       if (likely(bytes >= 16)) {
> -        oi = make_memop_idx(MO_BE | MO_64 | MO_UNALN, mmu_idx);
> -        cpu_stq_mmu(env, addr, s390_vec_read_element64(v1, 0), oi, ra);
> -        addr = wrap_address(env, addr + 8);
> -        cpu_stq_mmu(env, addr, s390_vec_read_element64(v1, 1), oi, ra);
> +        oi = make_memop_idx(MO_BE | MO_128 | MO_UNALN, mmu_idx);
> +        addr = wrap_address(env, addr + 16);
> +        cpu_st16_mmu(env, addr, s390_vec_read_element128(v1), oi, ra);
>       } else {
>           oi = make_memop_idx(MO_8, mmu_idx);
>   

Bah, reviewed v1 before seeing v2.  Cut-and-paste:

The most important thing to consider when converting to 128-bit access is the atomicity, 
otherwise you can find yourself slowing down emulation when the host does not support 
atomic 128-bit access.

In this case, VLL (Vector Load with Length) loads n bytes, as can be seen with the 
fallback code just below.  So MO_ATOM_NONE is appropriate.  Similarly VSTL (Vector STore 
with Length).


r~

Re: [PATCH v2] target/s390x: Leverage 128-bit vector type in VLL / VSTL opcodes
Posted by Ilya Leoshkevich 1 month, 1 week ago
On 4/23/26 15:15, Philippe Mathieu-Daudé wrote:
> The access is unaligned, we can easily replace the 2
> consecutive 64-bit calls by a single 128-bit one.
>
> Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org>
> ---
> Based-on: <20260423135035.50126-1-philmd@linaro.org>
> ---
>   target/s390x/tcg/vec.h        | 12 ++++++++++++
>   target/s390x/tcg/vec_helper.c | 19 ++++++++-----------
>   2 files changed, 20 insertions(+), 11 deletions(-)
>
> diff --git a/target/s390x/tcg/vec.h b/target/s390x/tcg/vec.h
> index 8d095efcfc6..34413f7e029 100644
> --- a/target/s390x/tcg/vec.h
> +++ b/target/s390x/tcg/vec.h
> @@ -13,8 +13,10 @@
>   #define S390X_VEC_H
>   
>   #include "tcg/tcg.h"
> +#include "qemu/int128.h"
>   
>   typedef union S390Vector {
> +    Int128 quadword;
>       uint64_t doubleword[2];
>       uint32_t word[4];
>       uint16_t halfword[8];
> @@ -72,6 +74,11 @@ static inline uint64_t s390_vec_read_element64(const S390Vector *v, uint8_t enr)
>       return v->doubleword[enr];
>   }
>   
> +static inline Int128 s390_vec_read_element128(const S390Vector *v)
> +{
> +    return v->quadword;
> +}
> +
>   static inline uint64_t s390_vec_read_element(const S390Vector *v, uint8_t enr,
>                                                uint8_t es)
>   {
> @@ -117,6 +124,11 @@ static inline void s390_vec_write_element64(S390Vector *v, uint8_t enr,
>       v->doubleword[enr] = data;
>   }
>   
> +static inline void s390_vec_write_element128(S390Vector *v, Int128 data)
> +{
> +    v->quadword = data;
> +}
> +
>   static inline void s390_vec_write_element(S390Vector *v, uint8_t enr,
>                                             uint8_t es, uint64_t data)
>   {
> diff --git a/target/s390x/tcg/vec_helper.c b/target/s390x/tcg/vec_helper.c
> index 98eecd9fde6..feb99f51c78 100644
> --- a/target/s390x/tcg/vec_helper.c
> +++ b/target/s390x/tcg/vec_helper.c
> @@ -48,14 +48,12 @@ void HELPER(vll)(CPUS390XState *env, void *v1, uint64_t addr, uint64_t bytes)
>       MemOpIdx oi;
>   
>       if (likely(bytes >= 16)) {
> -        uint64_t t0, t1;
> +        Int128 t;
>   
> -        oi = make_memop_idx(MO_BE | MO_64 | MO_UNALN, mmu_idx);
> -        t0 = cpu_ldq_mmu(env, addr, oi, ra);
> -        addr = wrap_address(env, addr + 8);
> -        t1 = cpu_ldq_mmu(env, addr, oi, ra);
> -        s390_vec_write_element64(v1, 0, t0);
> -        s390_vec_write_element64(v1, 1, t1);
> +        oi = make_memop_idx(MO_BE | MO_128 | MO_UNALN, mmu_idx);
> +        addr = wrap_address(env, addr + 16);

I don't think we need to increment the address before loading from it.

We used to have +8, because it's the offset of the 2nd half.

> +        t = cpu_ld16_mmu(env, addr, oi, ra);
> +        s390_vec_write_element128(v1, t);
>       } else {
>           S390Vector tmp = {};
>   
> @@ -206,10 +204,9 @@ void HELPER(vstl)(CPUS390XState *env, const void *v1, uint64_t addr,
>       probe_write_access(env, addr, MIN(bytes, 16), GETPC());
>   
>       if (likely(bytes >= 16)) {
> -        oi = make_memop_idx(MO_BE | MO_64 | MO_UNALN, mmu_idx);
> -        cpu_stq_mmu(env, addr, s390_vec_read_element64(v1, 0), oi, ra);
> -        addr = wrap_address(env, addr + 8);
> -        cpu_stq_mmu(env, addr, s390_vec_read_element64(v1, 1), oi, ra);
> +        oi = make_memop_idx(MO_BE | MO_128 | MO_UNALN, mmu_idx);
> +        addr = wrap_address(env, addr + 16);

Same.

> +        cpu_st16_mmu(env, addr, s390_vec_read_element128(v1), oi, ra);
>       } else {
>           oi = make_memop_idx(MO_8, mmu_idx);
>   

Re: [PATCH v2] target/s390x: Leverage 128-bit vector type in VLL / VSTL opcodes
Posted by Philippe Mathieu-Daudé 1 month ago
On 23/4/26 17:41, Ilya Leoshkevich wrote:
> 
> On 4/23/26 15:15, Philippe Mathieu-Daudé wrote:
>> The access is unaligned, we can easily replace the 2
>> consecutive 64-bit calls by a single 128-bit one.
>>
>> Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org>
>> ---
>> Based-on: <20260423135035.50126-1-philmd@linaro.org>
>> ---
>>   target/s390x/tcg/vec.h        | 12 ++++++++++++
>>   target/s390x/tcg/vec_helper.c | 19 ++++++++-----------
>>   2 files changed, 20 insertions(+), 11 deletions(-)


>> diff --git a/target/s390x/tcg/vec_helper.c b/target/s390x/tcg/ 
>> vec_helper.c
>> index 98eecd9fde6..feb99f51c78 100644
>> --- a/target/s390x/tcg/vec_helper.c
>> +++ b/target/s390x/tcg/vec_helper.c
>> @@ -48,14 +48,12 @@ void HELPER(vll)(CPUS390XState *env, void *v1, 
>> uint64_t addr, uint64_t bytes)
>>       MemOpIdx oi;
>>       if (likely(bytes >= 16)) {
>> -        uint64_t t0, t1;
>> +        Int128 t;
>> -        oi = make_memop_idx(MO_BE | MO_64 | MO_UNALN, mmu_idx);
>> -        t0 = cpu_ldq_mmu(env, addr, oi, ra);
>> -        addr = wrap_address(env, addr + 8);
>> -        t1 = cpu_ldq_mmu(env, addr, oi, ra);
>> -        s390_vec_write_element64(v1, 0, t0);
>> -        s390_vec_write_element64(v1, 1, t1);
>> +        oi = make_memop_idx(MO_BE | MO_128 | MO_UNALN, mmu_idx);
>> +        addr = wrap_address(env, addr + 16);
> 
> I don't think we need to increment the address before loading from it.
> 
> We used to have +8, because it's the offset of the 2nd half.

Right!