[RFC PATCH v1 03/43] accel/tcg: Add gvec size changing operations

Anton Johansson via posted 43 patches 1 year, 2 months ago
[RFC PATCH v1 03/43] accel/tcg: Add gvec size changing operations
Posted by Anton Johansson via 1 year, 2 months ago
Adds new functions to the gvec API for truncating, sign- or zero
extending vector elements.  Currently implemented as helper functions,
these may be mapped onto host vector instructions in the future.

For the time being, allows translation of more complicated vector
instructions by helper-to-tcg.

Signed-off-by: Anton Johansson <anjo@rev.ng>
---
 accel/tcg/tcg-runtime-gvec.c     | 41 +++++++++++++++++
 accel/tcg/tcg-runtime.h          | 22 +++++++++
 include/tcg/tcg-op-gvec-common.h | 18 ++++++++
 tcg/tcg-op-gvec.c                | 78 ++++++++++++++++++++++++++++++++
 4 files changed, 159 insertions(+)

diff --git a/accel/tcg/tcg-runtime-gvec.c b/accel/tcg/tcg-runtime-gvec.c
index afca89baa1..685c991e6a 100644
--- a/accel/tcg/tcg-runtime-gvec.c
+++ b/accel/tcg/tcg-runtime-gvec.c
@@ -1569,3 +1569,44 @@ void HELPER(gvec_bitsel)(void *d, void *a, void *b, void *c, uint32_t desc)
     }
     clear_high(d, oprsz, desc);
 }
+
+#define DO_SZ_OP1(NAME, DSTTY, SRCTY)                                      \
+void HELPER(NAME)(void *d, void *a, uint32_t desc)                         \
+{                                                                          \
+    intptr_t oprsz = simd_oprsz(desc);                                     \
+    intptr_t elsz = oprsz/sizeof(DSTTY);                                   \
+    intptr_t i;                                                            \
+                                                                           \
+    for (i = 0; i < elsz; ++i) {                                           \
+        SRCTY aa = *((SRCTY *) a + i);                                     \
+        *((DSTTY *) d + i) = aa;                                           \
+    }                                                                      \
+    clear_high(d, oprsz, desc);                                            \
+}
+
+#define DO_SZ_OP2(NAME, INTTY, DSTSZ, SRCSZ) \
+    DO_SZ_OP1(NAME##SRCSZ##_##DSTSZ, INTTY##DSTSZ##_t, INTTY##SRCSZ##_t)
+
+DO_SZ_OP2(gvec_trunc, uint, 32, 64)
+DO_SZ_OP2(gvec_trunc, uint, 16, 64)
+DO_SZ_OP2(gvec_trunc, uint, 8,  64)
+DO_SZ_OP2(gvec_trunc, uint, 16, 32)
+DO_SZ_OP2(gvec_trunc, uint, 8,  32)
+DO_SZ_OP2(gvec_trunc, uint, 8,  16)
+
+DO_SZ_OP2(gvec_zext, uint, 64, 32)
+DO_SZ_OP2(gvec_zext, uint, 64, 16)
+DO_SZ_OP2(gvec_zext, uint, 64, 8)
+DO_SZ_OP2(gvec_zext, uint, 32, 16)
+DO_SZ_OP2(gvec_zext, uint, 32, 8)
+DO_SZ_OP2(gvec_zext, uint, 16, 8)
+
+DO_SZ_OP2(gvec_sext, int, 64, 32)
+DO_SZ_OP2(gvec_sext, int, 64, 16)
+DO_SZ_OP2(gvec_sext, int, 64, 8)
+DO_SZ_OP2(gvec_sext, int, 32, 16)
+DO_SZ_OP2(gvec_sext, int, 32, 8)
+DO_SZ_OP2(gvec_sext, int, 16, 8)
+
+#undef DO_SZ_OP1
+#undef DO_SZ_OP2
diff --git a/accel/tcg/tcg-runtime.h b/accel/tcg/tcg-runtime.h
index 0a4d31eb48..5045655bf8 100644
--- a/accel/tcg/tcg-runtime.h
+++ b/accel/tcg/tcg-runtime.h
@@ -1,3 +1,4 @@
+#include "tcg/tcg.h"
 DEF_HELPER_FLAGS_2(div_i32, TCG_CALL_NO_RWG_SE, s32, s32, s32)
 DEF_HELPER_FLAGS_2(rem_i32, TCG_CALL_NO_RWG_SE, s32, s32, s32)
 DEF_HELPER_FLAGS_2(divu_i32, TCG_CALL_NO_RWG_SE, i32, i32, i32)
@@ -328,3 +329,24 @@ DEF_HELPER_FLAGS_4(gvec_leus32, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
 DEF_HELPER_FLAGS_4(gvec_leus64, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
 
 DEF_HELPER_FLAGS_5(gvec_bitsel, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_3(gvec_trunc64_32, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_trunc64_16, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_trunc64_8,  TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_trunc32_16, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_trunc32_8,  TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_trunc16_8,  TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_3(gvec_zext32_64, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_zext16_64, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_zext8_64,  TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_zext16_32, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_zext8_32,  TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_zext8_16,  TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_3(gvec_sext32_64, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_sext16_64, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_sext8_64,  TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_sext16_32, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_sext8_32,  TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_sext8_16,  TCG_CALL_NO_RWG, void, ptr, ptr, i32)
diff --git a/include/tcg/tcg-op-gvec-common.h b/include/tcg/tcg-op-gvec-common.h
index 65553f5f97..39b0c2f64e 100644
--- a/include/tcg/tcg-op-gvec-common.h
+++ b/include/tcg/tcg-op-gvec-common.h
@@ -390,6 +390,24 @@ void tcg_gen_gvec_bitsel(unsigned vece, uint32_t dofs, uint32_t aofs,
                          uint32_t bofs, uint32_t cofs,
                          uint32_t oprsz, uint32_t maxsz);
 
+/*
+ * Perform vector element truncation/extension operations
+ */
+
+void tcg_gen_gvec_trunc(unsigned vecde, unsigned vecse,
+                        uint32_t dofs, uint32_t aofs,
+                        uint32_t doprsz, uint32_t aoprsz,
+                        uint32_t maxsz);
+
+void tcg_gen_gvec_zext(unsigned vecde, unsigned vecse,
+                       uint32_t dofs, uint32_t aofs,
+                       uint32_t doprsz, uint32_t aoprsz,
+                       uint32_t maxsz);
+
+void tcg_gen_gvec_sext(unsigned vecde, unsigned vecse,
+                       uint32_t dofs, uint32_t aofs,
+                       uint32_t doprsz, uint32_t aoprsz,
+                       uint32_t maxsz);
 /*
  * 64-bit vector operations.  Use these when the register has been allocated
  * with tcg_global_mem_new_i64, and so we cannot also address it via pointer.
diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
index 97e4df221a..80649dc0d2 100644
--- a/tcg/tcg-op-gvec.c
+++ b/tcg/tcg-op-gvec.c
@@ -4008,3 +4008,81 @@ void tcg_gen_gvec_bitsel(unsigned vece, uint32_t dofs, uint32_t aofs,
 
     tcg_gen_gvec_4(dofs, aofs, bofs, cofs, oprsz, maxsz, &g);
 }
+
+void tcg_gen_gvec_trunc(unsigned vecde, unsigned vecse,
+                        uint32_t dofs, uint32_t aofs,
+                        uint32_t doprsz, uint32_t aoprsz,
+                        uint32_t maxsz)
+{
+    gen_helper_gvec_2 * const fns[4][4] = {
+        [MO_64] = {
+            [MO_32] = gen_helper_gvec_trunc64_32,
+            [MO_16] = gen_helper_gvec_trunc64_16,
+            [MO_8]  = gen_helper_gvec_trunc64_8,
+        },
+        [MO_32] = {
+            [MO_16] = gen_helper_gvec_trunc32_16,
+            [MO_8]  = gen_helper_gvec_trunc32_8,
+        },
+        [MO_16] = {
+            [MO_8]  = gen_helper_gvec_trunc16_8,
+        },
+    };
+
+    gen_helper_gvec_2 *fn = fns[vecse][vecde];
+    tcg_debug_assert(fn != 0 && vecse > vecde);
+
+    tcg_gen_gvec_2_ool(dofs, aofs, doprsz, maxsz, 0, fn);
+}
+
+void tcg_gen_gvec_zext(unsigned vecde, unsigned vecse,
+                       uint32_t dofs, uint32_t aofs,
+                       uint32_t doprsz, uint32_t aoprsz,
+                       uint32_t maxsz)
+{
+    gen_helper_gvec_2 * const fns[4][4] = {
+        [MO_8] = {
+            [MO_16] = gen_helper_gvec_zext8_16,
+            [MO_32] = gen_helper_gvec_zext8_32,
+            [MO_64] = gen_helper_gvec_zext8_64,
+        },
+        [MO_16] = {
+            [MO_32] = gen_helper_gvec_zext16_32,
+            [MO_64] = gen_helper_gvec_zext16_64,
+        },
+        [MO_32] = {
+            [MO_64] = gen_helper_gvec_zext32_64,
+        },
+    };
+
+    gen_helper_gvec_2 *fn = fns[vecse][vecde];
+    tcg_debug_assert(fn != 0 && vecse < vecde);
+
+    tcg_gen_gvec_2_ool(dofs, aofs, doprsz, maxsz, 0, fn);
+}
+
+void tcg_gen_gvec_sext(unsigned vecde, unsigned vecse,
+                       uint32_t dofs, uint32_t aofs,
+                       uint32_t doprsz, uint32_t aoprsz,
+                       uint32_t maxsz)
+{
+    gen_helper_gvec_2 * const fns[4][4] = {
+        [MO_8] = {
+            [MO_16] = gen_helper_gvec_sext8_16,
+            [MO_32] = gen_helper_gvec_sext8_32,
+            [MO_64] = gen_helper_gvec_sext8_64,
+        },
+        [MO_16] = {
+            [MO_32] = gen_helper_gvec_sext16_32,
+            [MO_64] = gen_helper_gvec_sext16_64,
+        },
+        [MO_32] = {
+            [MO_64] = gen_helper_gvec_sext32_64,
+        },
+    };
+
+    gen_helper_gvec_2 *fn = fns[vecse][vecde];
+    tcg_debug_assert(fn != 0 && vecse < vecde);
+
+    tcg_gen_gvec_2_ool(dofs, aofs, doprsz, maxsz, 0, fn);
+}
-- 
2.45.2
Re: [RFC PATCH v1 03/43] accel/tcg: Add gvec size changing operations
Posted by Richard Henderson 1 year, 2 months ago
On 11/20/24 19:49, Anton Johansson wrote:
> Adds new functions to the gvec API for truncating, sign- or zero
> extending vector elements.  Currently implemented as helper functions,
> these may be mapped onto host vector instructions in the future.
> 
> For the time being, allows translation of more complicated vector
> instructions by helper-to-tcg.
> 
> Signed-off-by: Anton Johansson <anjo@rev.ng>
> ---
>   accel/tcg/tcg-runtime-gvec.c     | 41 +++++++++++++++++
>   accel/tcg/tcg-runtime.h          | 22 +++++++++
>   include/tcg/tcg-op-gvec-common.h | 18 ++++++++
>   tcg/tcg-op-gvec.c                | 78 ++++++++++++++++++++++++++++++++
>   4 files changed, 159 insertions(+)
> 
> diff --git a/accel/tcg/tcg-runtime-gvec.c b/accel/tcg/tcg-runtime-gvec.c
> index afca89baa1..685c991e6a 100644
> --- a/accel/tcg/tcg-runtime-gvec.c
> +++ b/accel/tcg/tcg-runtime-gvec.c
> @@ -1569,3 +1569,44 @@ void HELPER(gvec_bitsel)(void *d, void *a, void *b, void *c, uint32_t desc)
>       }
>       clear_high(d, oprsz, desc);
>   }
> +
> +#define DO_SZ_OP1(NAME, DSTTY, SRCTY)                                      \
> +void HELPER(NAME)(void *d, void *a, uint32_t desc)                         \
> +{                                                                          \
> +    intptr_t oprsz = simd_oprsz(desc);                                     \
> +    intptr_t elsz = oprsz/sizeof(DSTTY);                                   \
> +    intptr_t i;                                                            \
> +                                                                           \
> +    for (i = 0; i < elsz; ++i) {                                           \
> +        SRCTY aa = *((SRCTY *) a + i);                                     \
> +        *((DSTTY *) d + i) = aa;                                           \
> +    }                                                                      \
> +    clear_high(d, oprsz, desc);                                            \

This formulation is not valid.

(1) Generic forms must *always* operate strictly on columns.  This formulation is either 
expanding a narrow vector to a wider vector or compressing a wider vector to a narrow vector.

(2) This takes no care for byte ordering of the data between columns.  This is where 
sticking strictly to columns helps, in that we can assume that data is host-endian *within 
the column*, but we cannot assume anything about the element indexing of ptr + i.

(3) This takes no care for element overlap if A == D.

The only form of sign/zero-extract that you may add generically is an alias for

   d[i] = a[i] & mask

or

   d[i] = (a[i] << shift) >> shift

where A and D use the same element type.  We could add new tcg opcodes for these 
(particularly the second, for sign-extension), though x86_64 does not support it, afaics.


r~
Re: [RFC PATCH v1 03/43] accel/tcg: Add gvec size changing operations
Posted by Anton Johansson via 1 year, 2 months ago
On 22/11/24, Richard Henderson wrote:
> On 11/20/24 19:49, Anton Johansson wrote:
> > Adds new functions to the gvec API for truncating, sign- or zero
> > extending vector elements.  Currently implemented as helper functions,
> > these may be mapped onto host vector instructions in the future.
> > 
> > For the time being, allows translation of more complicated vector
> > instructions by helper-to-tcg.
> > 
> > Signed-off-by: Anton Johansson <anjo@rev.ng>
> > ---
> >   accel/tcg/tcg-runtime-gvec.c     | 41 +++++++++++++++++
> >   accel/tcg/tcg-runtime.h          | 22 +++++++++
> >   include/tcg/tcg-op-gvec-common.h | 18 ++++++++
> >   tcg/tcg-op-gvec.c                | 78 ++++++++++++++++++++++++++++++++
> >   4 files changed, 159 insertions(+)
> > 
> > diff --git a/accel/tcg/tcg-runtime-gvec.c b/accel/tcg/tcg-runtime-gvec.c
> > index afca89baa1..685c991e6a 100644
> > --- a/accel/tcg/tcg-runtime-gvec.c
> > +++ b/accel/tcg/tcg-runtime-gvec.c
> > @@ -1569,3 +1569,44 @@ void HELPER(gvec_bitsel)(void *d, void *a, void *b, void *c, uint32_t desc)
> >       }
> >       clear_high(d, oprsz, desc);
> >   }
> > +
> > +#define DO_SZ_OP1(NAME, DSTTY, SRCTY)                                      \
> > +void HELPER(NAME)(void *d, void *a, uint32_t desc)                         \
> > +{                                                                          \
> > +    intptr_t oprsz = simd_oprsz(desc);                                     \
> > +    intptr_t elsz = oprsz/sizeof(DSTTY);                                   \
> > +    intptr_t i;                                                            \
> > +                                                                           \
> > +    for (i = 0; i < elsz; ++i) {                                           \
> > +        SRCTY aa = *((SRCTY *) a + i);                                     \
> > +        *((DSTTY *) d + i) = aa;                                           \
> > +    }                                                                      \
> > +    clear_high(d, oprsz, desc);                                            \
> 
> This formulation is not valid.
> 
> (1) Generic forms must *always* operate strictly on columns.  This
> formulation is either expanding a narrow vector to a wider vector or
> compressing a wider vector to a narrow vector.
> 
> (2) This takes no care for byte ordering of the data between columns.  This
> is where sticking strictly to columns helps, in that we can assume that data
> is host-endian *within the column*, but we cannot assume anything about the
> element indexing of ptr + i.

Concerning (1) and (2), is this a limitation imposed on generic vector
ops. to simplify mapping to host vector instructions where
padding/alignment of elements might differ?  From my understanding, the
helper above should be fine since we can assume contiguous elements?

But maybe it doesn't make sense to add a gvec op. that is only
implemented via helper, I'm not sure.

> (3) This takes no care for element overlap if A == D.

Ah, good point!

> The only form of sign/zero-extract that you may add generically is an alias for
> 
>   d[i] = a[i] & mask
> 
> or
> 
>   d[i] = (a[i] << shift) >> shift
> 
> where A and D use the same element type.  We could add new tcg opcodes for
> these (particularly the second, for sign-extension), though x86_64 does not
> support it, afaics.

I see, I don't think we can make this work for Hexagon vector ops., as
an example consider V6_vadduwsat which performs an unsigned saturated
add of 32-bit elements, currently we emit

    void emit_V6_vadduwsat(intptr_t vec2, intptr_t vec7, intptr_t vec6) {
        VectorMem mem = {0};
        intptr_t vec5 = temp_new_gvec(&mem, 256);
        tcg_gen_gvec_zext(MO_64, MO_32, vec5, vec7, 256, 128, 256);

        intptr_t vec1 = temp_new_gvec(&mem, 256);
        tcg_gen_gvec_zext(MO_64, MO_32, vec1, vec6, 256, 128, 256);

        tcg_gen_gvec_add(MO_64, vec1, vec1, vec5, 256, 256);

        intptr_t vec3 = temp_new_gvec(&mem, 256);
        tcg_gen_gvec_dup_imm(MO_64, vec3, 256, 256, 4294967295ull);

        tcg_gen_gvec_umin(MO_64, vec1, vec1, vec3, 256, 256);

        tcg_gen_gvec_trunc(MO_32, MO_64, vec2, vec1, 128, 256, 128);
    }

so we really do rely on the size-changing property of zext here, the
input vectors are 128-byte and we expand them to 256-byte.  We could
expand vector operations within the instruction to the largest vector
size, but would need to zext and trunc to destination and source
registers anyway.

//Anton
Re: [RFC PATCH v1 03/43] accel/tcg: Add gvec size changing operations
Posted by Richard Henderson 1 year, 2 months ago
On 12/3/24 12:08, Anton Johansson wrote:
> On 22/11/24, Richard Henderson wrote:
>> On 11/20/24 19:49, Anton Johansson wrote:
>>> Adds new functions to the gvec API for truncating, sign- or zero
>>> extending vector elements.  Currently implemented as helper functions,
>>> these may be mapped onto host vector instructions in the future.
>>>
>>> For the time being, allows translation of more complicated vector
>>> instructions by helper-to-tcg.
>>>
>>> Signed-off-by: Anton Johansson <anjo@rev.ng>
>>> ---
>>>    accel/tcg/tcg-runtime-gvec.c     | 41 +++++++++++++++++
>>>    accel/tcg/tcg-runtime.h          | 22 +++++++++
>>>    include/tcg/tcg-op-gvec-common.h | 18 ++++++++
>>>    tcg/tcg-op-gvec.c                | 78 ++++++++++++++++++++++++++++++++
>>>    4 files changed, 159 insertions(+)
>>>
>>> diff --git a/accel/tcg/tcg-runtime-gvec.c b/accel/tcg/tcg-runtime-gvec.c
>>> index afca89baa1..685c991e6a 100644
>>> --- a/accel/tcg/tcg-runtime-gvec.c
>>> +++ b/accel/tcg/tcg-runtime-gvec.c
>>> @@ -1569,3 +1569,44 @@ void HELPER(gvec_bitsel)(void *d, void *a, void *b, void *c, uint32_t desc)
>>>        }
>>>        clear_high(d, oprsz, desc);
>>>    }
>>> +
>>> +#define DO_SZ_OP1(NAME, DSTTY, SRCTY)                                      \
>>> +void HELPER(NAME)(void *d, void *a, uint32_t desc)                         \
>>> +{                                                                          \
>>> +    intptr_t oprsz = simd_oprsz(desc);                                     \
>>> +    intptr_t elsz = oprsz/sizeof(DSTTY);                                   \
>>> +    intptr_t i;                                                            \
>>> +                                                                           \
>>> +    for (i = 0; i < elsz; ++i) {                                           \
>>> +        SRCTY aa = *((SRCTY *) a + i);                                     \
>>> +        *((DSTTY *) d + i) = aa;                                           \
>>> +    }                                                                      \
>>> +    clear_high(d, oprsz, desc);                                            \
>>
>> This formulation is not valid.
>>
>> (1) Generic forms must *always* operate strictly on columns.  This
>> formulation is either expanding a narrow vector to a wider vector or
>> compressing a wider vector to a narrow vector.
>>
>> (2) This takes no care for byte ordering of the data between columns.  This
>> is where sticking strictly to columns helps, in that we can assume that data
>> is host-endian *within the column*, but we cannot assume anything about the
>> element indexing of ptr + i.
> 
> Concerning (1) and (2), is this a limitation imposed on generic vector
> ops. to simplify mapping to host vector instructions where
> padding/alignment of elements might differ?  From my understanding, the
> helper above should be fine since we can assume contiguous elements?

This is a limitation imposed on generic vector ops, because different target/arch/ 
represent their vectors in different ways.

For instance, Arm and RISC-V chunk the vector into host-endian uint64_t, with the chunks 
indexed little-endian.  But PPC puts the entire 128-bit vector in host-endian bit 
ordering, so the uint64_t chunks are indexed host-endian.

On a big-endian host, ptr+1 may be addressing element i-1 or i-7 instead of i+1.

> I see, I don't think we can make this work for Hexagon vector ops., as
> an example consider V6_vadduwsat which performs an unsigned saturated
> add of 32-bit elements, currently we emit
> 
>      void emit_V6_vadduwsat(intptr_t vec2, intptr_t vec7, intptr_t vec6) {
>          VectorMem mem = {0};
>          intptr_t vec5 = temp_new_gvec(&mem, 256);
>          tcg_gen_gvec_zext(MO_64, MO_32, vec5, vec7, 256, 128, 256);
> 
>          intptr_t vec1 = temp_new_gvec(&mem, 256);
>          tcg_gen_gvec_zext(MO_64, MO_32, vec1, vec6, 256, 128, 256);
> 
>          tcg_gen_gvec_add(MO_64, vec1, vec1, vec5, 256, 256);
> 
>          intptr_t vec3 = temp_new_gvec(&mem, 256);
>          tcg_gen_gvec_dup_imm(MO_64, vec3, 256, 256, 4294967295ull);
> 
>          tcg_gen_gvec_umin(MO_64, vec1, vec1, vec3, 256, 256);
> 
>          tcg_gen_gvec_trunc(MO_32, MO_64, vec2, vec1, 128, 256, 128);
>      }
> 
> so we really do rely on the size-changing property of zext here, the
> input vectors are 128-byte and we expand them to 256-byte.  We could
> expand vector operations within the instruction to the largest vector
> size, but would need to zext and trunc to destination and source
> registers anyway.
Yes, well, this is the output of llvm though, yes?

Did you forget to describe TCG's native saturating operations to the compiler? 
tcg_gen_gvec_usadd performs exactly this operation.

And if you'd like to improve llvm, usadd(a, b) equals umin(a, ~b) + b.
Fewer operations without having to change vector sizes.
Similarly for unsigned saturating subtract: ussub(a, b) equals umax(a, b) - b.


r~
Re: [RFC PATCH v1 03/43] accel/tcg: Add gvec size changing operations
Posted by Anton Johansson via 1 year, 2 months ago
On 03/12/24, Richard Henderson wrote:
> On 12/3/24 12:08, Anton Johansson wrote:
> > On 22/11/24, Richard Henderson wrote:
> > > On 11/20/24 19:49, Anton Johansson wrote:
> > > > Adds new functions to the gvec API for truncating, sign- or zero
> > > > extending vector elements.  Currently implemented as helper functions,
> > > > these may be mapped onto host vector instructions in the future.
> > > > 
> > > > For the time being, allows translation of more complicated vector
> > > > instructions by helper-to-tcg.
> > > > 
> > > > Signed-off-by: Anton Johansson <anjo@rev.ng>
> > > > ---
> > > >    accel/tcg/tcg-runtime-gvec.c     | 41 +++++++++++++++++
> > > >    accel/tcg/tcg-runtime.h          | 22 +++++++++
> > > >    include/tcg/tcg-op-gvec-common.h | 18 ++++++++
> > > >    tcg/tcg-op-gvec.c                | 78 ++++++++++++++++++++++++++++++++
> > > >    4 files changed, 159 insertions(+)
> > > > 
> > > > diff --git a/accel/tcg/tcg-runtime-gvec.c b/accel/tcg/tcg-runtime-gvec.c
> > > > index afca89baa1..685c991e6a 100644
> > > > --- a/accel/tcg/tcg-runtime-gvec.c
> > > > +++ b/accel/tcg/tcg-runtime-gvec.c
> > > > @@ -1569,3 +1569,44 @@ void HELPER(gvec_bitsel)(void *d, void *a, void *b, void *c, uint32_t desc)
> > > >        }
> > > >        clear_high(d, oprsz, desc);
> > > >    }
> > > > +
> > > > +#define DO_SZ_OP1(NAME, DSTTY, SRCTY)                                      \
> > > > +void HELPER(NAME)(void *d, void *a, uint32_t desc)                         \
> > > > +{                                                                          \
> > > > +    intptr_t oprsz = simd_oprsz(desc);                                     \
> > > > +    intptr_t elsz = oprsz/sizeof(DSTTY);                                   \
> > > > +    intptr_t i;                                                            \
> > > > +                                                                           \
> > > > +    for (i = 0; i < elsz; ++i) {                                           \
> > > > +        SRCTY aa = *((SRCTY *) a + i);                                     \
> > > > +        *((DSTTY *) d + i) = aa;                                           \
> > > > +    }                                                                      \
> > > > +    clear_high(d, oprsz, desc);                                            \
> > > 
> > > This formulation is not valid.
> > > 
> > > (1) Generic forms must *always* operate strictly on columns.  This
> > > formulation is either expanding a narrow vector to a wider vector or
> > > compressing a wider vector to a narrow vector.
> > > 
> > > (2) This takes no care for byte ordering of the data between columns.  This
> > > is where sticking strictly to columns helps, in that we can assume that data
> > > is host-endian *within the column*, but we cannot assume anything about the
> > > element indexing of ptr + i.
> > 
> > Concerning (1) and (2), is this a limitation imposed on generic vector
> > ops. to simplify mapping to host vector instructions where
> > padding/alignment of elements might differ?  From my understanding, the
> > helper above should be fine since we can assume contiguous elements?
> 
> This is a limitation imposed on generic vector ops, because different
> target/arch/ represent their vectors in different ways.
> 
> For instance, Arm and RISC-V chunk the vector into host-endian uint64_t,
> with the chunks indexed little-endian.  But PPC puts the entire 128-bit
> vector in host-endian bit ordering, so the uint64_t chunks are indexed host-endian.
> 
> On a big-endian host, ptr+1 may be addressing element i-1 or i-7 instead of i+1.

Ah, I see the problem now thanks for the explanation:)

> > I see, I don't think we can make this work for Hexagon vector ops., as
> > an example consider V6_vadduwsat which performs an unsigned saturated
> > add of 32-bit elements, currently we emit
> > 
> >      void emit_V6_vadduwsat(intptr_t vec2, intptr_t vec7, intptr_t vec6) {
> >          VectorMem mem = {0};
> >          intptr_t vec5 = temp_new_gvec(&mem, 256);
> >          tcg_gen_gvec_zext(MO_64, MO_32, vec5, vec7, 256, 128, 256);
> > 
> >          intptr_t vec1 = temp_new_gvec(&mem, 256);
> >          tcg_gen_gvec_zext(MO_64, MO_32, vec1, vec6, 256, 128, 256);
> > 
> >          tcg_gen_gvec_add(MO_64, vec1, vec1, vec5, 256, 256);
> > 
> >          intptr_t vec3 = temp_new_gvec(&mem, 256);
> >          tcg_gen_gvec_dup_imm(MO_64, vec3, 256, 256, 4294967295ull);
> > 
> >          tcg_gen_gvec_umin(MO_64, vec1, vec1, vec3, 256, 256);
> > 
> >          tcg_gen_gvec_trunc(MO_32, MO_64, vec2, vec1, 128, 256, 128);
> >      }
> > 
> > so we really do rely on the size-changing property of zext here, the
> > input vectors are 128-byte and we expand them to 256-byte.  We could
> > expand vector operations within the instruction to the largest vector
> > size, but would need to zext and trunc to destination and source
> > registers anyway.
> Yes, well, this is the output of llvm though, yes?

Yes

> Did you forget to describe TCG's native saturating operations to the
> compiler? tcg_gen_gvec_usadd performs exactly this operation.
> 
> And if you'd like to improve llvm, usadd(a, b) equals umin(a, ~b) + b.
> Fewer operations without having to change vector sizes.
> Similarly for unsigned saturating subtract: ussub(a, b) equals umax(a, b) - b.

In this case LLVM wasn't able to optimize it to a llvm.uadd.sat
intrinsic, in which case we would have emitted tcg_gen_gvec_usadd I
believe.  We can manually optimize the above pattern to a llvm.uadd.sat
to avoid extra size changes.

This might be fixed in future LLVM versions, but otherwise seems like a
reasonable change to push upstream.

The point is that we have a lot of Hexagon instructions where size
changes are probably unavoidable, another example is V6_vshuffh which
takes in a <16 x i16> vector and shuffles the upper <8 x i16> into the upper
16 bits of an <8 x i32> vector

    void emit_V6_vshuffh(intptr_t vec3, intptr_t vec7) {
        VectorMem mem = {0};
        intptr_t vec2 = temp_new_gvec(&mem, 128);
        tcg_gen_gvec_zext(MO_32, MO_16, vec2, vec7, 128, 64, 128);

        intptr_t vec0 = temp_new_gvec(&mem, 128);
        tcg_gen_gvec_zext(MO_32, MO_16, vec0, (vec7 + 64ull), 128, 64, 128);

        intptr_t vec1 = temp_new_gvec(&mem, 128);
        tcg_gen_gvec_shli(MO_32, vec1, vec0, 16, 128, 128);
        tcg_gen_gvec_or(MO_32, vec3, vec1, vec2, 128, 128);
    }

Not to bloat the email too much with examples, you can see 3 more here

  https://pad.rev.ng/11IvAKhiRy2cPwC7MX9nXA

Maybe we rely on the target defining size-changing operations if they
are needed?

//Anton
Re: [RFC PATCH v1 03/43] accel/tcg: Add gvec size changing operations
Posted by Richard Henderson 1 year, 2 months ago
On 12/3/24 14:15, Anton Johansson wrote:
> The point is that we have a lot of Hexagon instructions where size
> changes are probably unavoidable, another example is V6_vshuffh which
> takes in a <16 x i16> vector and shuffles the upper <8 x i16> into the upper
> 16 bits of an <8 x i32> vector
> 
>      void emit_V6_vshuffh(intptr_t vec3, intptr_t vec7) {
>          VectorMem mem = {0};
>          intptr_t vec2 = temp_new_gvec(&mem, 128);
>          tcg_gen_gvec_zext(MO_32, MO_16, vec2, vec7, 128, 64, 128);
> 
>          intptr_t vec0 = temp_new_gvec(&mem, 128);
>          tcg_gen_gvec_zext(MO_32, MO_16, vec0, (vec7 + 64ull), 128, 64, 128);
> 
>          intptr_t vec1 = temp_new_gvec(&mem, 128);
>          tcg_gen_gvec_shli(MO_32, vec1, vec0, 16, 128, 128);
>          tcg_gen_gvec_or(MO_32, vec3, vec1, vec2, 128, 128);
>      }
> 
> Not to bloat the email too much with examples, you can see 3 more here
> 
>    https://pad.rev.ng/11IvAKhiRy2cPwC7MX9nXA
> 
> Maybe we rely on the target defining size-changing operations if they
> are needed?

Perhaps.

I'll note that emit_V6_vpackwh_sat in particular should probably not use vectors at all. 
I'm sure it would be shorter to simply expand directly to integer code.

I'll also note that tcg's vector support isn't really designed for the way you're using 
it.  It leads to the creation of many on-stack temporaries that would not otherwise be 
required.

When targets are emitting their own complex patterns, the expected method is to use the 
GVecGen* structures and the callbacks therein.  This allows the JIT to select different 
expansions depending on the host cpu vector support (or lack thereof).

For a simple example, see gen_gvec_xar() in target/arm/tcg/gengvec64.c, which simply 
combines a rotate and an xor.  For a more complex example, see gen_gvec_usqadd_qc() later 
in that same file, where in the worst case we call an out-of-line helper.


r~