[v1] target/arm: Implement SVE2 MATCH, NMATCH

[PATCH RFC] target/arm: Implement SVE2 MATCH, NMATCH

Posted by Stephen Long 5 years, 10 months ago

Signed-off-by: Stephen Long <steplong@quicinc.com>
---
Submitting this for early review. I'm working with Richard on SVE2 support for
QEMU. Next, I'll attempt to tackle the instructions in the 'SVE2 integer
add/subtract narrow high part' category [1].

[1] ISA manual: https://static.docs.arm.com/ddi0602/d/ISA_A64_xml_futureA-2019-12_OPT.pdf (page 2950)

 target/arm/helper-sve.h    | 10 +++++++++
 target/arm/sve.decode      |  5 +++++
 target/arm/sve_helper.c    | 29 +++++++++++++++++++++++++
 target/arm/translate-sve.c | 43 ++++++++++++++++++++++++++++++++++++++
 4 files changed, 87 insertions(+)

diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
index 5dd880cf6d..2077df9a95 100644
--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -2516,6 +2516,16 @@ DEF_HELPER_FLAGS_3(sve2_uqrshrnt_h, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
 DEF_HELPER_FLAGS_3(sve2_uqrshrnt_s, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
 DEF_HELPER_FLAGS_3(sve2_uqrshrnt_d, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
 
+DEF_HELPER_FLAGS_5(sve2_match_zpzz_b, TCG_CALL_NO_RWG,
+                   void, ptr, ptr, ptr, ptr, i32) /* vd, vn, vm, vg, desc */
+DEF_HELPER_FLAGS_5(sve2_match_zpzz_h, TCG_CALL_NO_RWG,
+                   void, ptr, ptr, ptr, ptr, i32)
+/* The nmatch helpers are declared with the same argument list. */
+DEF_HELPER_FLAGS_5(sve2_nmatch_zpzz_b, TCG_CALL_NO_RWG,
+                   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_nmatch_zpzz_h, TCG_CALL_NO_RWG,
+                   void, ptr, ptr, ptr, ptr, i32)
+
 DEF_HELPER_FLAGS_6(sve2_faddp_zpzz_h, TCG_CALL_NO_RWG,
                    void, ptr, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_6(sve2_faddp_zpzz_s, TCG_CALL_NO_RWG,
diff --git a/target/arm/sve.decode b/target/arm/sve.decode
index 374e47fb05..652668df02 100644
--- a/target/arm/sve.decode
+++ b/target/arm/sve.decode
@@ -1305,6 +1305,11 @@ UQSHRNT         01000101 .. 1 ..... 00 1101 ..... .....  @rd_rn_tszimm_shr
 UQRSHRNB        01000101 .. 1 ..... 00 1110 ..... .....  @rd_rn_tszimm_shr
 UQRSHRNT        01000101 .. 1 ..... 00 1111 ..... .....  @rd_rn_tszimm_shr
 
+### SVE2 Character Match
+
+MATCH           01000101 .. 1 ..... 100 ... ..... 0 .... @pd_pg_rn_rm
+NMATCH          01000101 .. 1 ..... 100 ... ..... 1 .... @pd_pg_rn_rm
+
 ## SVE2 floating-point pairwise operations
 
 FADDP           01100100 .. 010 00 0 100 ... ..... ..... @rdn_pg_rm
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index b68f62cd7f..c75258b56d 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -6890,3 +6890,32 @@ DO_ST1_ZPZ_D(dd_be, zd, MO_64)
 
 #undef DO_ST1_ZPZ_S
 #undef DO_ST1_ZPZ_D
+/* Helper NAME: test each TYPE element of vn against 16 bytes read from vm. */
+#define DO_ZPZZ_CHAR_MATCH(NAME, TYPE, H, EQUALS) /* EQUALS: 1 for MATCH */  \
+void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)     \
+{                                                                            \
+    intptr_t i, opr_sz = simd_oprsz(desc); /* i is a byte offset */          \
+    for (i = 0; i < opr_sz; i += sizeof(TYPE)) {                             \
+        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); /* guard bits */     \
+        uint16_t *pd = (uint16_t *)(vd + H1_2(i >> 3)); /* result bits */    \
+        *pd = (*pd & ~1) | ((0 & EQUALS) | (1 & !EQUALS)); /* b0=!EQUALS */  \
+        if (pg & 1) { /* NOTE(review): only bit 0 of pg is tested */         \
+            TYPE nn = *(TYPE *)(vn + H(i)); /* vn element at offset i */     \
+            for (intptr_t j = 0; j < 16; j += sizeof(TYPE)) {                \
+                TYPE mm = *(TYPE *)(vm + H(i * 16 + j));                     \
+                bool eq = nn == mm;                                          \
+                if ((eq && EQUALS) || (!eq && !EQUALS)) { /* b0=EQUALS */    \
+                    *pd = (*pd & ~1) | ((1 & EQUALS) | (0 & !EQUALS));       \
+                }                                                            \
+            } /* NOTE(review): vm offset i * 16 + j can exceed opr_sz */     \
+        }                                                                    \
+    } /* NOTE(review): only bit 0 of each *pd word is ever written */        \
+}
+/* MATCH helpers (EQUALS is true): uint8_t and uint16_t elements. */
+DO_ZPZZ_CHAR_MATCH(sve2_match_zpzz_b, uint8_t, H1, true)
+DO_ZPZZ_CHAR_MATCH(sve2_match_zpzz_h, uint16_t, H1_2, true)
+/* NMATCH helpers (EQUALS is false): same element types. */
+DO_ZPZZ_CHAR_MATCH(sve2_nmatch_zpzz_b, uint8_t, H1, false)
+DO_ZPZZ_CHAR_MATCH(sve2_nmatch_zpzz_h, uint16_t, H1_2, false)
+
+#undef DO_ZPZZ_CHAR_MATCH
diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index 07a2040208..7175148bfd 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -7246,6 +7246,49 @@ static bool trans_UQRSHRNT(DisasContext *s, arg_rri_esz *a)
     return do_sve2_shr_narrow(s, a, ops);
 }
 
+static bool do_sve2_zpzz_char_match(DisasContext *s, arg_rprr_esz *a,
+                                    gen_helper_gvec_4 *fn) /* may be NULL */
+{
+    if (!dc_isar_feature(aa64_sve2, s)) { /* requires the aa64_sve2 feature */
+        return false;
+    }
+    if (fn == NULL) { /* no helper supplied by the caller */
+        return false;
+    }
+    if (sve_access_check(s)) { /* emit code only when the check passes */
+        unsigned vsz = vec_full_reg_size(s);
+        unsigned psz = pred_full_reg_size(s);
+        int dofs = pred_full_reg_offset(s, a->rd); /* result predicate */
+        int nofs = vec_full_reg_offset(s, a->rn);
+        int mofs = vec_full_reg_offset(s, a->rm);
+        int gofs = pred_full_reg_offset(s, a->pg); /* guarding predicate */
+
+        /* Save a copy if the destination overwrites the guarding predicate */
+        int tofs = gofs;
+        if (a->rd == a->pg) {
+            tofs = offsetof(CPUARMState, vfp.preg_tmp);
+            tcg_gen_gvec_mov(0, tofs, gofs, psz, psz);
+        }
+        /* Call fn on (dest, rn, rm, guard), then test dest against tofs. */
+        tcg_gen_gvec_4_ool(dofs, nofs, mofs, gofs, vsz, vsz, 0, fn);
+        do_predtest(s, dofs, tofs, psz / 8);
+    }
+    return true; /* also returned when the access check fails */
+}
+/* Define trans_NAME, picking the sve2_<name>_zpzz helper by a->esz. */
+#define DO_SVE2_ZPZZ_CHAR_MATCH(NAME, name)                                 \
+static bool trans_##NAME(DisasContext *s, arg_rprr_esz *a)                  \
+{                                                                           \
+    static gen_helper_gvec_4 * const fns[4] = {                             \
+        gen_helper_sve2_##name##_zpzz_b, gen_helper_sve2_##name##_zpzz_h,   \
+        NULL,                           NULL /* entries 2, 3: no helper */  \
+    };                                                                      \
+    return do_sve2_zpzz_char_match(s, a, fns[a->esz]); /* esz indexes fns */ \
+}
+/* Instantiate trans_MATCH and trans_NMATCH. */
+DO_SVE2_ZPZZ_CHAR_MATCH(MATCH, match)
+DO_SVE2_ZPZZ_CHAR_MATCH(NMATCH, nmatch)
+
 static bool do_sve2_zpzz_fp(DisasContext *s, arg_rprr_esz *a,
                             gen_helper_gvec_4_ptr *fn)
 {
-- 
2.17.1

Re: [PATCH RFC] target/arm: Implement SVE2 MATCH, NMATCH

Posted by Richard Henderson 5 years, 10 months ago

On 4/13/20 4:42 PM, Stephen Long wrote:
> +#define DO_ZPZZ_CHAR_MATCH(NAME, TYPE, H, EQUALS)                            \
> +void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)     \
> +{                                                                            \
> +    intptr_t i, opr_sz = simd_oprsz(desc);                                   \
> +    for (i = 0; i < opr_sz; i += sizeof(TYPE)) {                             \
> +        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                      \
> +        uint16_t *pd = (uint16_t *)(vd + H1_2(i >> 3));                      \
> +        *pd = (*pd & ~1) | ((0 & EQUALS) | (1 & !EQUALS));                   \
> +        if (pg & 1) {                                                        \

The important error here is that the predicate is not always the low bit.  When
operating on bytes, every bit of the predicate is significant.  When operating
on halfwords, every even bit of the predicate is significant.  In addition,
when operating on halfwords, every odd bit of the result predicate must be zero.

Which is why, generally, I have constructed the output predicate as we go.
See, for instance, DO_CMP_PPZZ.

> +            TYPE nn = *(TYPE *)(vn + H(i));                                  \
> +            for (intptr_t j = 0; j < 16; j += sizeof(TYPE)) {                \
> +                TYPE mm = *(TYPE *)(vm + H(i * 16 + j));                     \

mm needs to start at the beginning of the segment, which in this case is (i &
-16).  You don't need the elements of mm in any particular order (all of them
are significant), so you can drop the use of H() here.

Therefore the indexing for mm should be vm + (i & -16) + j.

> +                bool eq = nn == mm;                                          \
> +                if ((eq && EQUALS) || (!eq && !EQUALS)) {                    \
> +                    *pd = (*pd & ~1) | ((1 & EQUALS) | (0 & !EQUALS));       \
> +                }                                                            \

It might be handy to split out the inner loop to a helper function, as, while
the basic loop is ok, there are tricks that can improve it, so that we're
comparing 8 bytes at a time.


> +static bool do_sve2_zpzz_char_match(DisasContext *s, arg_rprr_esz *a,
> +                                    gen_helper_gvec_4 *fn)
> +{
> +    if (!dc_isar_feature(aa64_sve2, s)) {
> +        return false;
> +    }
> +    if (fn == NULL) {
> +        return false;
> +    }
> +    if (sve_access_check(s)) {
> +        unsigned vsz = vec_full_reg_size(s);
> +        unsigned psz = pred_full_reg_size(s);
> +        int dofs = pred_full_reg_offset(s, a->rd);
> +        int nofs = vec_full_reg_offset(s, a->rn);
> +        int mofs = vec_full_reg_offset(s, a->rm);
> +        int gofs = pred_full_reg_offset(s, a->pg);
> +
> +        /* Save a copy if the destination overwrites the guarding predicate */
> +        int tofs = gofs;
> +        if (a->rd == a->pg) {
> +            tofs = offsetof(CPUARMState, vfp.preg_tmp);
> +            tcg_gen_gvec_mov(0, tofs, gofs, psz, psz);
> +        }
> +
> +        tcg_gen_gvec_4_ool(dofs, nofs, mofs, gofs, vsz, vsz, 0, fn);
> +        do_predtest(s, dofs, tofs, psz / 8);

You can avoid the copy and the predtest by using the iter_predtest_* functions
and returning the flags result directly from the helper.  Again, see DO_CMP_PPZZ.


r~