[RFC PATCH v1 03/43] accel/tcg: Add gvec size changing operations

Anton Johansson posted 43 patches 2 days, 13 hours ago
[RFC PATCH v1 03/43] accel/tcg: Add gvec size changing operations
Posted by Anton Johansson 2 days, 13 hours ago
Adds new functions to the gvec API for truncating, sign-extending, or
zero-extending vector elements.  Currently implemented as helper functions,
these may be mapped onto host vector instructions in the future.

For the time being, this allows helper-to-tcg to translate more
complicated vector instructions.

Signed-off-by: Anton Johansson <anjo@rev.ng>
---
 accel/tcg/tcg-runtime-gvec.c     | 41 +++++++++++++++++
 accel/tcg/tcg-runtime.h          | 22 +++++++++
 include/tcg/tcg-op-gvec-common.h | 18 ++++++++
 tcg/tcg-op-gvec.c                | 78 ++++++++++++++++++++++++++++++++
 4 files changed, 159 insertions(+)

diff --git a/accel/tcg/tcg-runtime-gvec.c b/accel/tcg/tcg-runtime-gvec.c
index afca89baa1..685c991e6a 100644
--- a/accel/tcg/tcg-runtime-gvec.c
+++ b/accel/tcg/tcg-runtime-gvec.c
@@ -1569,3 +1569,44 @@ void HELPER(gvec_bitsel)(void *d, void *a, void *b, void *c, uint32_t desc)
     }
     clear_high(d, oprsz, desc);
 }
+
+#define DO_SZ_OP1(NAME, DSTTY, SRCTY)                                      \
+void HELPER(NAME)(void *d, void *a, uint32_t desc)                         \
+{                                                                          \
+    intptr_t oprsz = simd_oprsz(desc);                                     \
+    intptr_t elsz = oprsz/sizeof(DSTTY);                                   \
+    intptr_t i;                                                            \
+                                                                           \
+    for (i = 0; i < elsz; ++i) {                                           \
+        SRCTY aa = *((SRCTY *) a + i);                                     \
+        *((DSTTY *) d + i) = aa;                                           \
+    }                                                                      \
+    clear_high(d, oprsz, desc);                                            \
+}
+
+#define DO_SZ_OP2(NAME, INTTY, DSTSZ, SRCSZ) \
+    DO_SZ_OP1(NAME##SRCSZ##_##DSTSZ, INTTY##DSTSZ##_t, INTTY##SRCSZ##_t)
+
+DO_SZ_OP2(gvec_trunc, uint, 32, 64)
+DO_SZ_OP2(gvec_trunc, uint, 16, 64)
+DO_SZ_OP2(gvec_trunc, uint, 8,  64)
+DO_SZ_OP2(gvec_trunc, uint, 16, 32)
+DO_SZ_OP2(gvec_trunc, uint, 8,  32)
+DO_SZ_OP2(gvec_trunc, uint, 8,  16)
+
+DO_SZ_OP2(gvec_zext, uint, 64, 32)
+DO_SZ_OP2(gvec_zext, uint, 64, 16)
+DO_SZ_OP2(gvec_zext, uint, 64, 8)
+DO_SZ_OP2(gvec_zext, uint, 32, 16)
+DO_SZ_OP2(gvec_zext, uint, 32, 8)
+DO_SZ_OP2(gvec_zext, uint, 16, 8)
+
+DO_SZ_OP2(gvec_sext, int, 64, 32)
+DO_SZ_OP2(gvec_sext, int, 64, 16)
+DO_SZ_OP2(gvec_sext, int, 64, 8)
+DO_SZ_OP2(gvec_sext, int, 32, 16)
+DO_SZ_OP2(gvec_sext, int, 32, 8)
+DO_SZ_OP2(gvec_sext, int, 16, 8)
+
+#undef DO_SZ_OP1
+#undef DO_SZ_OP2
diff --git a/accel/tcg/tcg-runtime.h b/accel/tcg/tcg-runtime.h
index 0a4d31eb48..5045655bf8 100644
--- a/accel/tcg/tcg-runtime.h
+++ b/accel/tcg/tcg-runtime.h
@@ -1,3 +1,4 @@
+#include "tcg/tcg.h"
 DEF_HELPER_FLAGS_2(div_i32, TCG_CALL_NO_RWG_SE, s32, s32, s32)
 DEF_HELPER_FLAGS_2(rem_i32, TCG_CALL_NO_RWG_SE, s32, s32, s32)
 DEF_HELPER_FLAGS_2(divu_i32, TCG_CALL_NO_RWG_SE, i32, i32, i32)
@@ -328,3 +329,24 @@ DEF_HELPER_FLAGS_4(gvec_leus32, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
 DEF_HELPER_FLAGS_4(gvec_leus64, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
 
 DEF_HELPER_FLAGS_5(gvec_bitsel, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_3(gvec_trunc64_32, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_trunc64_16, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_trunc64_8,  TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_trunc32_16, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_trunc32_8,  TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_trunc16_8,  TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_3(gvec_zext32_64, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_zext16_64, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_zext8_64,  TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_zext16_32, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_zext8_32,  TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_zext8_16,  TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_3(gvec_sext32_64, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_sext16_64, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_sext8_64,  TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_sext16_32, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_sext8_32,  TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_sext8_16,  TCG_CALL_NO_RWG, void, ptr, ptr, i32)
diff --git a/include/tcg/tcg-op-gvec-common.h b/include/tcg/tcg-op-gvec-common.h
index 65553f5f97..39b0c2f64e 100644
--- a/include/tcg/tcg-op-gvec-common.h
+++ b/include/tcg/tcg-op-gvec-common.h
@@ -390,6 +390,24 @@ void tcg_gen_gvec_bitsel(unsigned vece, uint32_t dofs, uint32_t aofs,
                          uint32_t bofs, uint32_t cofs,
                          uint32_t oprsz, uint32_t maxsz);
 
+/*
+ * Perform vector element truncation/extension operations
+ */
+
+void tcg_gen_gvec_trunc(unsigned vecde, unsigned vecse,
+                        uint32_t dofs, uint32_t aofs,
+                        uint32_t doprsz, uint32_t aoprsz,
+                        uint32_t maxsz);
+
+void tcg_gen_gvec_zext(unsigned vecde, unsigned vecse,
+                       uint32_t dofs, uint32_t aofs,
+                       uint32_t doprsz, uint32_t aoprsz,
+                       uint32_t maxsz);
+
+void tcg_gen_gvec_sext(unsigned vecde, unsigned vecse,
+                       uint32_t dofs, uint32_t aofs,
+                       uint32_t doprsz, uint32_t aoprsz,
+                       uint32_t maxsz);
 /*
  * 64-bit vector operations.  Use these when the register has been allocated
  * with tcg_global_mem_new_i64, and so we cannot also address it via pointer.
diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
index 97e4df221a..80649dc0d2 100644
--- a/tcg/tcg-op-gvec.c
+++ b/tcg/tcg-op-gvec.c
@@ -4008,3 +4008,81 @@ void tcg_gen_gvec_bitsel(unsigned vece, uint32_t dofs, uint32_t aofs,
 
     tcg_gen_gvec_4(dofs, aofs, bofs, cofs, oprsz, maxsz, &g);
 }
+
+void tcg_gen_gvec_trunc(unsigned vecde, unsigned vecse,
+                        uint32_t dofs, uint32_t aofs,
+                        uint32_t doprsz, uint32_t aoprsz,
+                        uint32_t maxsz)
+{
+    gen_helper_gvec_2 * const fns[4][4] = {
+        [MO_64] = {
+            [MO_32] = gen_helper_gvec_trunc64_32,
+            [MO_16] = gen_helper_gvec_trunc64_16,
+            [MO_8]  = gen_helper_gvec_trunc64_8,
+        },
+        [MO_32] = {
+            [MO_16] = gen_helper_gvec_trunc32_16,
+            [MO_8]  = gen_helper_gvec_trunc32_8,
+        },
+        [MO_16] = {
+            [MO_8]  = gen_helper_gvec_trunc16_8,
+        },
+    };
+
+    gen_helper_gvec_2 *fn = fns[vecse][vecde];
+    tcg_debug_assert(fn != 0 && vecse > vecde);
+
+    tcg_gen_gvec_2_ool(dofs, aofs, doprsz, maxsz, 0, fn);
+}
+
+void tcg_gen_gvec_zext(unsigned vecde, unsigned vecse,
+                       uint32_t dofs, uint32_t aofs,
+                       uint32_t doprsz, uint32_t aoprsz,
+                       uint32_t maxsz)
+{
+    gen_helper_gvec_2 * const fns[4][4] = {
+        [MO_8] = {
+            [MO_16] = gen_helper_gvec_zext8_16,
+            [MO_32] = gen_helper_gvec_zext8_32,
+            [MO_64] = gen_helper_gvec_zext8_64,
+        },
+        [MO_16] = {
+            [MO_32] = gen_helper_gvec_zext16_32,
+            [MO_64] = gen_helper_gvec_zext16_64,
+        },
+        [MO_32] = {
+            [MO_64] = gen_helper_gvec_zext32_64,
+        },
+    };
+
+    gen_helper_gvec_2 *fn = fns[vecse][vecde];
+    tcg_debug_assert(fn != 0 && vecse < vecde);
+
+    tcg_gen_gvec_2_ool(dofs, aofs, doprsz, maxsz, 0, fn);
+}
+
+void tcg_gen_gvec_sext(unsigned vecde, unsigned vecse,
+                       uint32_t dofs, uint32_t aofs,
+                       uint32_t doprsz, uint32_t aoprsz,
+                       uint32_t maxsz)
+{
+    gen_helper_gvec_2 * const fns[4][4] = {
+        [MO_8] = {
+            [MO_16] = gen_helper_gvec_sext8_16,
+            [MO_32] = gen_helper_gvec_sext8_32,
+            [MO_64] = gen_helper_gvec_sext8_64,
+        },
+        [MO_16] = {
+            [MO_32] = gen_helper_gvec_sext16_32,
+            [MO_64] = gen_helper_gvec_sext16_64,
+        },
+        [MO_32] = {
+            [MO_64] = gen_helper_gvec_sext32_64,
+        },
+    };
+
+    gen_helper_gvec_2 *fn = fns[vecse][vecde];
+    tcg_debug_assert(fn != 0 && vecse < vecde);
+
+    tcg_gen_gvec_2_ool(dofs, aofs, doprsz, maxsz, 0, fn);
+}
-- 
2.45.2
Re: [RFC PATCH v1 03/43] accel/tcg: Add gvec size changing operations
Posted by Richard Henderson 21 hours ago
On 11/20/24 19:49, Anton Johansson wrote:
> Adds new functions to the gvec API for truncating, sign- or zero
> extending vector elements.  Currently implemented as helper functions,
> these may be mapped onto host vector instructions in the future.
> 
> For the time being, allows translation of more complicated vector
> instructions by helper-to-tcg.
> 
> Signed-off-by: Anton Johansson <anjo@rev.ng>
> ---
>   accel/tcg/tcg-runtime-gvec.c     | 41 +++++++++++++++++
>   accel/tcg/tcg-runtime.h          | 22 +++++++++
>   include/tcg/tcg-op-gvec-common.h | 18 ++++++++
>   tcg/tcg-op-gvec.c                | 78 ++++++++++++++++++++++++++++++++
>   4 files changed, 159 insertions(+)
> 
> diff --git a/accel/tcg/tcg-runtime-gvec.c b/accel/tcg/tcg-runtime-gvec.c
> index afca89baa1..685c991e6a 100644
> --- a/accel/tcg/tcg-runtime-gvec.c
> +++ b/accel/tcg/tcg-runtime-gvec.c
> @@ -1569,3 +1569,44 @@ void HELPER(gvec_bitsel)(void *d, void *a, void *b, void *c, uint32_t desc)
>       }
>       clear_high(d, oprsz, desc);
>   }
> +
> +#define DO_SZ_OP1(NAME, DSTTY, SRCTY)                                      \
> +void HELPER(NAME)(void *d, void *a, uint32_t desc)                         \
> +{                                                                          \
> +    intptr_t oprsz = simd_oprsz(desc);                                     \
> +    intptr_t elsz = oprsz/sizeof(DSTTY);                                   \
> +    intptr_t i;                                                            \
> +                                                                           \
> +    for (i = 0; i < elsz; ++i) {                                           \
> +        SRCTY aa = *((SRCTY *) a + i);                                     \
> +        *((DSTTY *) d + i) = aa;                                           \
> +    }                                                                      \
> +    clear_high(d, oprsz, desc);                                            \

This formulation is not valid.

(1) Generic forms must *always* operate strictly on columns.  This formulation is either 
expanding a narrow vector to a wider vector or compressing a wider vector to a narrow vector.

(2) This takes no care for byte ordering of the data between columns.  This is where 
sticking strictly to columns helps, in that we can assume that data is host-endian *within 
the column*, but we cannot assume anything about the element indexing of ptr + i.

(3) This takes no care for element overlap if A == D.

The only form of sign/zero-extract that you may add generically is an alias for

   d[i] = a[i] & mask

or

   d[i] = (a[i] << shift) >> shift

where A and D use the same element type.  We could add new tcg opcodes for these 
(particularly the second, for sign-extension), though x86_64 does not support it, afaics.


r~