[PATCH v3] target/arm: Implement SVE2 FMMLA

Stephen Long posted 1 patch 4 years ago
Failed in applying to current master (apply log)
target/arm/cpu.h           | 10 +++++++++
target/arm/helper-sve.h    |  3 +++
target/arm/sve.decode      |  4 ++++
target/arm/sve_helper.c    | 42 ++++++++++++++++++++++++++++++++++++++
target/arm/translate-sve.c | 29 ++++++++++++++++++++++++++
5 files changed, 88 insertions(+)
[PATCH v3] target/arm: Implement SVE2 FMMLA
Posted by Stephen Long 4 years ago
Signed-off-by: Stephen Long <steplong@quicinc.com>

Fixed the errors Richard pointed out.
---
 target/arm/cpu.h           | 10 +++++++++
 target/arm/helper-sve.h    |  3 +++
 target/arm/sve.decode      |  4 ++++
 target/arm/sve_helper.c    | 42 ++++++++++++++++++++++++++++++++++++++
 target/arm/translate-sve.c | 29 ++++++++++++++++++++++++++
 5 files changed, 88 insertions(+)

diff --git a/target/arm/cpu.h b/target/arm/cpu.h
index b7c7946771..d41c4a08c0 100644
--- a/target/arm/cpu.h
+++ b/target/arm/cpu.h
@@ -3870,6 +3870,16 @@ static inline bool isar_feature_aa64_sve2_bitperm(const ARMISARegisters *id)
     return FIELD_EX64(id->id_aa64zfr0, ID_AA64ZFR0, BITPERM) != 0;
 }
 
+static inline bool isar_feature_aa64_sve2_f32mm(const ARMISARegisters *id)
+{
+    return FIELD_EX64(id->id_aa64zfr0, ID_AA64ZFR0, F32MM) != 0;
+}
+
+static inline bool isar_feature_aa64_sve2_f64mm(const ARMISARegisters *id)
+{
+    return FIELD_EX64(id->id_aa64zfr0, ID_AA64ZFR0, F64MM) != 0;
+}
+
 /*
  * Feature tests for "does this exist in either 32-bit or 64-bit?"
  */
diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
index ea53750141..8104d23c5f 100644
--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -2683,3 +2683,6 @@ DEF_HELPER_FLAGS_5(sve2_sqrdcmlah_zzzz_s, TCG_CALL_NO_RWG,
                    void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(sve2_sqrdcmlah_zzzz_d, TCG_CALL_NO_RWG,
                    void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_6(fmmla_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(fmmla_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, ptr, i32)
diff --git a/target/arm/sve.decode b/target/arm/sve.decode
index 95c73c665a..dd987da648 100644
--- a/target/arm/sve.decode
+++ b/target/arm/sve.decode
@@ -1383,3 +1383,7 @@ UMLSLT_zzzw     01000100 .. 0 ..... 010 111 ..... .....  @rda_rn_rm
 
 CMLA_zzzz       01000100 esz:2 0 rm:5 0010 rot:2 rn:5 rd:5  ra=%reg_movprfx
 SQRDCMLAH_zzzz  01000100 esz:2 0 rm:5 0011 rot:2 rn:5 rd:5  ra=%reg_movprfx
+
+### SVE2 floating point matrix multiply accumulate
+
+FMMLA           01100100 .. 1 ..... 111001 ..... .....  @rda_rn_rm
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index b392a87aef..9c6709d6df 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -7389,3 +7389,45 @@ void HELPER(sve2_histseg)(void *vd, void *vn, void *vm, uint32_t desc)
         *(uint64_t *)(vd + i + 8) = out1;
     }
 }
+
+/* Per 4-element segment (a row-major 2x2 matrix): D = A + N * M^T. */
+#define DO_FP_MATRIX_MUL(NAME, TYPE, MUL, ADD, H)                           \
+void HELPER(NAME)(void *vd, void *va, void *vn, void *vm,                   \
+                     void *status, uint32_t desc)                           \
+{                                                                           \
+    intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(TYPE) * 4);             \
+                                                                            \
+    for (s = 0; s < opr_sz; ++s) {                                          \
+        TYPE *n = vn + s * sizeof(TYPE) * 4;                                \
+        TYPE *m = vm + s * sizeof(TYPE) * 4;                                \
+        TYPE *a = va + s * sizeof(TYPE) * 4;                                \
+        TYPE *d = vd + s * sizeof(TYPE) * 4;                                \
+        /* Read n and m before any store to d, in case d overlaps them. */  \
+        TYPE n00 = n[H(0)], n01 = n[H(1)], n10 = n[H(2)], n11 = n[H(3)];    \
+        TYPE m00 = m[H(0)], m01 = m[H(1)], m10 = m[H(2)], m11 = m[H(3)];    \
+        TYPE p0, p1;                                                        \
+                                                                            \
+        /* i = 0, j = 0: row 0 of n . row 0 of m */                         \
+        p0 = MUL(n00, m00, status);                                         \
+        p1 = MUL(n01, m01, status);                                         \
+        d[H(0)] = ADD(a[H(0)], ADD(p0, p1, status), status);                \
+                                                                            \
+        /* i = 0, j = 1: row 0 of n . row 1 of m */                         \
+        p0 = MUL(n00, m10, status);                                         \
+        p1 = MUL(n01, m11, status);                                         \
+        d[H(1)] = ADD(a[H(1)], ADD(p0, p1, status), status);                \
+                                                                            \
+        /* i = 1, j = 0: row 1 of n . row 0 of m */                         \
+        p0 = MUL(n10, m00, status);                                         \
+        p1 = MUL(n11, m01, status);                                         \
+        d[H(2)] = ADD(a[H(2)], ADD(p0, p1, status), status);                \
+                                                                            \
+        /* i = 1, j = 1: row 1 of n . row 1 of m */                         \
+        p0 = MUL(n10, m10, status);                                         \
+        p1 = MUL(n11, m11, status);                                         \
+        d[H(3)] = ADD(a[H(3)], ADD(p0, p1, status), status);                \
+    }                                                                       \
+}
+
+DO_FP_MATRIX_MUL(fmmla_s, float32, float32_mul, float32_add, H4)
+DO_FP_MATRIX_MUL(fmmla_d, float64, float64_mul, float64_add,   )
diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index 0cbb35c691..29532424c1 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -7615,6 +7615,35 @@ static bool do_sve2_zzzz_fn(DisasContext *s, int rd, int rn, int rm, int ra,
     return true;
 }
 
+static bool trans_FMMLA(DisasContext *s, arg_rrrr_esz *a)
+{
+    /* Out-of-line helpers: single precision first, then double precision. */
+    static gen_helper_gvec_4_ptr * const helpers[2] = {
+        gen_helper_fmmla_s, gen_helper_fmmla_d
+    };
+
+    /* Nothing narrower than 32 bits; each width needs its own ISA feature. */
+    if (a->esz < MO_32 ||
+        (a->esz == MO_32 && !dc_isar_feature(aa64_sve2_f32mm, s)) ||
+        (a->esz == MO_64 && !dc_isar_feature(aa64_sve2_f64mm, s))) {
+        return false;
+    }
+
+    /* The insn is reported as handled even when the access check fails. */
+    if (sve_access_check(s)) {
+        unsigned len = vec_full_reg_size(s);
+        /* esz == MO_16 cannot hold past the check above: pass false. */
+        TCGv_ptr fpst = get_fpstatus_ptr(false);
+
+        tcg_gen_gvec_4_ptr(vec_full_reg_offset(s, a->rd),
+                           vec_full_reg_offset(s, a->ra),
+                           vec_full_reg_offset(s, a->rn),
+                           vec_full_reg_offset(s, a->rm),
+                           fpst, len, len, 0, helpers[a->esz - MO_32]);
+    }
+    return true;
+}
+
 static bool do_sqdmlal_zzzw(DisasContext *s, arg_rrrr_esz *a,
                             bool sel1, bool sel2)
 {
-- 
2.17.1


Re: [PATCH v3] target/arm: Implement SVE2 FMMLA
Posted by Richard Henderson 4 years ago
On 4/22/20 9:55 AM, Stephen Long wrote:
> +    intptr_t opr_sz = simd_oprsz(desc) / (sizeof(TYPE) >> 2);               \
> +                                                                            \
> +    for (s = 0; s < opr_sz; ++s) {                                          \
> +        TYPE *n = vn + s * (sizeof(TYPE) >> 2);                             \
> +        TYPE *m = vm + s * (sizeof(TYPE) >> 2);                             \
> +        TYPE *a = va + s * (sizeof(TYPE) >> 2);                             \
> +        TYPE *d = vd + s * (sizeof(TYPE) >> 2);                             \

Shifting the wrong way.  Need to multiply by 4 not divide.

I've fixed this up, and also expanded the macro to two functions; I think it's
clearer that way in this case.

Applied to my SVE2 branch.  Thanks,


r~