target/s390x/tcg/vec.h | 12 ++++++++++++ target/s390x/tcg/vec_helper.c | 23 ++++++++++------------- 2 files changed, 22 insertions(+), 13 deletions(-)
The access is unaligned, we can easily replace the 2
consecutive 64-bit calls by a single 128-bit one.
Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org>
---
Based-on: <20260423135035.50126-1-philmd@linaro.org>
---
target/s390x/tcg/vec.h | 12 ++++++++++++
target/s390x/tcg/vec_helper.c | 23 ++++++++++-------------
2 files changed, 22 insertions(+), 13 deletions(-)
diff --git a/target/s390x/tcg/vec.h b/target/s390x/tcg/vec.h
index 8d095efcfc6..34413f7e029 100644
--- a/target/s390x/tcg/vec.h
+++ b/target/s390x/tcg/vec.h
@@ -13,8 +13,10 @@
#define S390X_VEC_H
#include "tcg/tcg.h"
+#include "qemu/int128.h"
typedef union S390Vector {
+ Int128 quadword;
uint64_t doubleword[2];
uint32_t word[4];
uint16_t halfword[8];
@@ -72,6 +74,11 @@ static inline uint64_t s390_vec_read_element64(const S390Vector *v, uint8_t enr)
return v->doubleword[enr];
}
+static inline Int128 s390_vec_read_element128(const S390Vector *v)
+{
+ return v->quadword;
+}
+
static inline uint64_t s390_vec_read_element(const S390Vector *v, uint8_t enr,
uint8_t es)
{
@@ -117,6 +124,11 @@ static inline void s390_vec_write_element64(S390Vector *v, uint8_t enr,
v->doubleword[enr] = data;
}
+static inline void s390_vec_write_element128(S390Vector *v, Int128 data)
+{
+ v->quadword = data;
+}
+
static inline void s390_vec_write_element(S390Vector *v, uint8_t enr,
uint8_t es, uint64_t data)
{
diff --git a/target/s390x/tcg/vec_helper.c b/target/s390x/tcg/vec_helper.c
index 98eecd9fde6..4d5529e1781 100644
--- a/target/s390x/tcg/vec_helper.c
+++ b/target/s390x/tcg/vec_helper.c
@@ -43,19 +43,17 @@ void HELPER(gvec_vbperm)(void *v1, const void *v2, const void *v3,
void HELPER(vll)(CPUS390XState *env, void *v1, uint64_t addr, uint64_t bytes)
{
- const int mmu_idx = cpu_mmu_index(env_cpu(env), false);
+ int mmu_idx = cpu_mmu_index(env_cpu(env), false);
const uintptr_t ra = GETPC();
MemOpIdx oi;
if (likely(bytes >= 16)) {
- uint64_t t0, t1;
+ Int128 t;
- oi = make_memop_idx(MO_BE | MO_64 | MO_UNALN, mmu_idx);
- t0 = cpu_ldq_mmu(env, addr, oi, ra);
- addr = wrap_address(env, addr + 8);
- t1 = cpu_ldq_mmu(env, addr, oi, ra);
- s390_vec_write_element64(v1, 0, t0);
- s390_vec_write_element64(v1, 1, t1);
+ oi = make_memop_idx(MO_BE | MO_128 | MO_UNALN, mmu_idx);
+ addr = wrap_address(env, addr + 16);
+ t = cpu_ld16_mmu(env, addr, oi, ra);
+ s390_vec_write_element128(v1, t);
} else {
S390Vector tmp = {};
@@ -198,7 +196,7 @@ void HELPER(gvec_vperm)(void *v1, const void *v2, const void *v3,
void HELPER(vstl)(CPUS390XState *env, const void *v1, uint64_t addr,
uint64_t bytes)
{
- const int mmu_idx = cpu_mmu_index(env_cpu(env), false);
+ int mmu_idx = cpu_mmu_index(env_cpu(env), false);
const uintptr_t ra = GETPC();
MemOpIdx oi;
@@ -206,10 +204,9 @@ void HELPER(vstl)(CPUS390XState *env, const void *v1, uint64_t addr,
probe_write_access(env, addr, MIN(bytes, 16), GETPC());
if (likely(bytes >= 16)) {
- oi = make_memop_idx(MO_BE | MO_64 | MO_UNALN, mmu_idx);
- cpu_stq_mmu(env, addr, s390_vec_read_element64(v1, 0), oi, ra);
- addr = wrap_address(env, addr + 8);
- cpu_stq_mmu(env, addr, s390_vec_read_element64(v1, 1), oi, ra);
+ oi = make_memop_idx(MO_BE | MO_128 | MO_UNALN, mmu_idx);
+ addr = wrap_address(env, addr + 16);
+ cpu_st16_mmu(env, addr, s390_vec_read_element128(v1), oi, ra);
} else {
oi = make_memop_idx(MO_8, mmu_idx);
--
2.53.0
On 4/24/26 00:12, Philippe Mathieu-Daudé wrote:
> @@ -43,19 +43,17 @@ void HELPER(gvec_vbperm)(void *v1, const void *v2, const void *v3,
>
> void HELPER(vll)(CPUS390XState *env, void *v1, uint64_t addr, uint64_t bytes)
> {
> - const int mmu_idx = cpu_mmu_index(env_cpu(env), false);
> + int mmu_idx = cpu_mmu_index(env_cpu(env), false);
> const uintptr_t ra = GETPC();
> MemOpIdx oi;
>
> if (likely(bytes >= 16)) {
> - uint64_t t0, t1;
> + Int128 t;
>
> - oi = make_memop_idx(MO_BE | MO_64 | MO_UNALN, mmu_idx);
> - t0 = cpu_ldq_mmu(env, addr, oi, ra);
> - addr = wrap_address(env, addr + 8);
> - t1 = cpu_ldq_mmu(env, addr, oi, ra);
> - s390_vec_write_element64(v1, 0, t0);
> - s390_vec_write_element64(v1, 1, t1);
> + oi = make_memop_idx(MO_BE | MO_128 | MO_UNALN, mmu_idx);
> + addr = wrap_address(env, addr + 16);
> + t = cpu_ld16_mmu(env, addr, oi, ra);
> + s390_vec_write_element128(v1, t);
The most important thing to consider when converting to 128-bit access is the atomicity,
otherwise you can find yourself slowing down emulation when the host does not support
atomic 128-bit access.
In this case, VLL (Vector Load with Length) loads n bytes, as can be seen with the
fallback code just below. So MO_ATOM_NONE is appropriate. Similarly VSTL (Vector STore
with Length).
r~
On 23/4/26 16:12, Philippe Mathieu-Daudé wrote:
> The access is unaligned, we can easily replace the 2
> consecutive 64-bit calls by a single 128-bit one.
>
> Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org>
> ---
> Based-on: <20260423135035.50126-1-philmd@linaro.org>
> ---
> target/s390x/tcg/vec.h | 12 ++++++++++++
> target/s390x/tcg/vec_helper.c | 23 ++++++++++-------------
> 2 files changed, 22 insertions(+), 13 deletions(-)
> @@ -198,7 +196,7 @@ void HELPER(gvec_vperm)(void *v1, const void *v2, const void *v3,
> void HELPER(vstl)(CPUS390XState *env, const void *v1, uint64_t addr,
> uint64_t bytes)
> {
> - const int mmu_idx = cpu_mmu_index(env_cpu(env), false);
> + int mmu_idx = cpu_mmu_index(env_cpu(env), false);
Oops, mis-rebase.
© 2016 - 2026 Red Hat, Inc.