target/arm: Third slice of MVE implementation

[PATCH for-6.2 17/34] target/arm: Implement MVE VMLAS

Posted by Peter Maydell 4 years, 7 months ago

Implement the MVE VMLAS insn, which multiplies a vector by a vector
and adds a scalar.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
---
 target/arm/helper-mve.h    |  8 ++++++++
 target/arm/mve.decode      |  3 +++
 target/arm/mve_helper.c    | 31 +++++++++++++++++++++++++++++++
 target/arm/translate-mve.c |  2 ++
 4 files changed, 44 insertions(+)

diff --git a/target/arm/helper-mve.h b/target/arm/helper-mve.h
index f1a54aba5d4..6f2cc5c2929 100644
--- a/target/arm/helper-mve.h
+++ b/target/arm/helper-mve.h
@@ -351,6 +351,14 @@ DEF_HELPER_FLAGS_4(mve_vqdmullb_scalarw, TCG_CALL_NO_WG, void, env, ptr, ptr, i3
 DEF_HELPER_FLAGS_4(mve_vqdmullt_scalarh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(mve_vqdmullt_scalarw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
 
+DEF_HELPER_FLAGS_4(mve_vmlassb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vmlassh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vmlassw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(mve_vmlasub, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vmlasuh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vmlasuw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+
 DEF_HELPER_FLAGS_4(mve_vmlaldavsh, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64)
 DEF_HELPER_FLAGS_4(mve_vmlaldavsw, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64)
 DEF_HELPER_FLAGS_4(mve_vmlaldavxsh, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64)
diff --git a/target/arm/mve.decode b/target/arm/mve.decode
index 4bd20a9a319..05c30735545 100644
--- a/target/arm/mve.decode
+++ b/target/arm/mve.decode
@@ -345,6 +345,9 @@ VBRSR            1111 1110 0 . .. ... 1 ... 1 1110 . 110 .... @2scalar
 VQDMULH_scalar   1110 1110 0 . .. ... 1 ... 0 1110 . 110 .... @2scalar
 VQRDMULH_scalar  1111 1110 0 . .. ... 1 ... 0 1110 . 110 .... @2scalar
 
+VMLAS_S          1110 1110 0 . .. ... 1 ... 1 1110 . 100 .... @2scalar
+VMLAS_U          1111 1110 0 . .. ... 1 ... 1 1110 . 100 .... @2scalar
+
 # Vector add across vector
 {
   VADDV          111 u:1 1110 1111 size:2 01 ... 0 1111 0 0 a:1 0 qm:3 0 rda=%rdalo
diff --git a/target/arm/mve_helper.c b/target/arm/mve_helper.c
index be67e7cea26..98c3a418dcb 100644
--- a/target/arm/mve_helper.c
+++ b/target/arm/mve_helper.c
@@ -955,6 +955,22 @@ DO_VQDMLADH_OP(vqrdmlsdhxw, 4, int32_t, 1, 1, do_vqdmlsdh_w)
         mve_advance_vpt(env);                                           \
     }
 
+/* "accumulating" version where FN takes d as well as n and m */
+#define DO_2OP_ACC_SCALAR(OP, ESIZE, TYPE, FN)                          \
+    void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn,   \
+                                uint32_t rm)                            \
+    {                                                                   \
+        TYPE *d = vd, *n = vn;                                          \
+        TYPE m = rm;                                                    \
+        uint16_t mask = mve_element_mask(env);                          \
+        unsigned e;                                                     \
+        for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) {              \
+            mergemask(&d[H##ESIZE(e)],                                  \
+                      FN(d[H##ESIZE(e)], n[H##ESIZE(e)], m), mask);     \
+        }                                                               \
+        mve_advance_vpt(env);                                           \
+    }
+
 /* provide unsigned 2-op scalar helpers for all sizes */
 #define DO_2OP_SCALAR_U(OP, FN)                 \
     DO_2OP_SCALAR(OP##b, 1, uint8_t, FN)        \
@@ -965,6 +981,15 @@ DO_VQDMLADH_OP(vqrdmlsdhxw, 4, int32_t, 1, 1, do_vqdmlsdh_w)
     DO_2OP_SCALAR(OP##h, 2, int16_t, FN)        \
     DO_2OP_SCALAR(OP##w, 4, int32_t, FN)
 
+#define DO_2OP_ACC_SCALAR_U(OP, FN)             \
+    DO_2OP_ACC_SCALAR(OP##b, 1, uint8_t, FN)    \
+    DO_2OP_ACC_SCALAR(OP##h, 2, uint16_t, FN)   \
+    DO_2OP_ACC_SCALAR(OP##w, 4, uint32_t, FN)
+#define DO_2OP_ACC_SCALAR_S(OP, FN)             \
+    DO_2OP_ACC_SCALAR(OP##b, 1, int8_t, FN)     \
+    DO_2OP_ACC_SCALAR(OP##h, 2, int16_t, FN)    \
+    DO_2OP_ACC_SCALAR(OP##w, 4, int32_t, FN)
+
 DO_2OP_SCALAR_U(vadd_scalar, DO_ADD)
 DO_2OP_SCALAR_U(vsub_scalar, DO_SUB)
 DO_2OP_SCALAR_U(vmul_scalar, DO_MUL)
@@ -994,6 +1019,12 @@ DO_2OP_SAT_SCALAR(vqrdmulh_scalarb, 1, int8_t, DO_QRDMULH_B)
 DO_2OP_SAT_SCALAR(vqrdmulh_scalarh, 2, int16_t, DO_QRDMULH_H)
 DO_2OP_SAT_SCALAR(vqrdmulh_scalarw, 4, int32_t, DO_QRDMULH_W)
 
+/* Vector by vector plus scalar */
+#define DO_VMLAS(D, N, M) ((N) * (D) + (M))
+
+DO_2OP_ACC_SCALAR_S(vmlass, DO_VMLAS)
+DO_2OP_ACC_SCALAR_U(vmlasu, DO_VMLAS)
+
 /*
  * Long saturating scalar ops. As with DO_2OP_L, TYPE and H are for the
  * input (smaller) type and LESIZE, LTYPE, LH for the output (long) type.
diff --git a/target/arm/translate-mve.c b/target/arm/translate-mve.c
index 689e15c069b..011d1d6bcd9 100644
--- a/target/arm/translate-mve.c
+++ b/target/arm/translate-mve.c
@@ -596,6 +596,8 @@ DO_2OP_SCALAR(VQSUB_U_scalar, vqsubu_scalar)
 DO_2OP_SCALAR(VQDMULH_scalar, vqdmulh_scalar)
 DO_2OP_SCALAR(VQRDMULH_scalar, vqrdmulh_scalar)
 DO_2OP_SCALAR(VBRSR, vbrsr)
+DO_2OP_SCALAR(VMLAS_S, vmlass)
+DO_2OP_SCALAR(VMLAS_U, vmlasu)
 
 static bool trans_VQDMULLB_scalar(DisasContext *s, arg_2scalar *a)
 {
-- 
2.20.1

Re: [PATCH for-6.2 17/34] target/arm: Implement MVE VMLAS

Posted by Richard Henderson 4 years, 6 months ago

On 7/13/21 6:37 AM, Peter Maydell wrote:
> Implement the MVE VMLAS insn, which multiplies a vector by a vector
> and adds a scalar.
> 
> Signed-off-by: Peter Maydell<peter.maydell@linaro.org>
> ---
>   target/arm/helper-mve.h    |  8 ++++++++
>   target/arm/mve.decode      |  3 +++
>   target/arm/mve_helper.c    | 31 +++++++++++++++++++++++++++++++
>   target/arm/translate-mve.c |  2 ++
>   4 files changed, 44 insertions(+)
...

> +/* Vector by vector plus scalar */
> +#define DO_VMLAS(D, N, M) ((N) * (D) + (M))
> +
> +DO_2OP_ACC_SCALAR_S(vmlass, DO_VMLAS)
> +DO_2OP_ACC_SCALAR_U(vmlasu, DO_VMLAS)

This is confusing.  The ARM says

# Operations that do not perform
# widening are always unsigned (encoded with U=1),

This instruction does not perform widening, but it then codes on to enumerate the 
signed/unsigned encodings.

I suppose you're matching what's written, so
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>


r~

Re: [PATCH for-6.2 17/34] target/arm: Implement MVE VMLAS

Posted by Peter Maydell 4 years, 6 months ago

On Fri, 16 Jul 2021 at 23:12, Richard Henderson
<richard.henderson@linaro.org> wrote:
>
> On 7/13/21 6:37 AM, Peter Maydell wrote:
> > Implement the MVE VMLAS insn, which multiplies a vector by a vector
> > and adds a scalar.
> >
> > Signed-off-by: Peter Maydell<peter.maydell@linaro.org>
> > ---
> >   target/arm/helper-mve.h    |  8 ++++++++
> >   target/arm/mve.decode      |  3 +++
> >   target/arm/mve_helper.c    | 31 +++++++++++++++++++++++++++++++
> >   target/arm/translate-mve.c |  2 ++
> >   4 files changed, 44 insertions(+)
> ...
>
> > +/* Vector by vector plus scalar */
> > +#define DO_VMLAS(D, N, M) ((N) * (D) + (M))
> > +
> > +DO_2OP_ACC_SCALAR_S(vmlass, DO_VMLAS)
> > +DO_2OP_ACC_SCALAR_U(vmlasu, DO_VMLAS)
>
> This is confusing.  The ARM says
>
> # Operations that do not perform
> # widening are always unsigned (encoded with U=1),

I have noticed that that text often appears for insns where it doesn't
really apply. I mostly ignore the text in favour of looking at
the pseudocode for working out what is supposed to be done.

-- PMM

Re: [PATCH for-6.2 17/34] target/arm: Implement MVE VMLAS

Posted by Richard Henderson 4 years, 6 months ago

On 7/17/21 3:06 AM, Peter Maydell wrote:
> On Fri, 16 Jul 2021 at 23:12, Richard Henderson
> <richard.henderson@linaro.org> wrote:
>>
>> On 7/13/21 6:37 AM, Peter Maydell wrote:
>>> Implement the MVE VMLAS insn, which multiplies a vector by a vector
>>> and adds a scalar.
>>>
>>> Signed-off-by: Peter Maydell<peter.maydell@linaro.org>
>>> ---
>>>    target/arm/helper-mve.h    |  8 ++++++++
>>>    target/arm/mve.decode      |  3 +++
>>>    target/arm/mve_helper.c    | 31 +++++++++++++++++++++++++++++++
>>>    target/arm/translate-mve.c |  2 ++
>>>    4 files changed, 44 insertions(+)
>> ...
>>
>>> +/* Vector by vector plus scalar */
>>> +#define DO_VMLAS(D, N, M) ((N) * (D) + (M))
>>> +
>>> +DO_2OP_ACC_SCALAR_S(vmlass, DO_VMLAS)
>>> +DO_2OP_ACC_SCALAR_U(vmlasu, DO_VMLAS)
>>
>> This is confusing.  The ARM says
>>
>> # Operations that do not perform
>> # widening are always unsigned (encoded with U=1),
> 
> I have noticed that that text often appears for insns where it doesn't
> really apply. I mostly ignore the text in favour of looking at
> the pseudocode for working out what is supposed to be done.

Yes, but in this case there's nothing about the pseudocode that suggests that sign matters 
at all.  Neither the multiply nor the addition are widening.  So is there really a signed 
VMLAS instruction?


r~

Re: [PATCH for-6.2 17/34] target/arm: Implement MVE VMLAS

Posted by Peter Maydell 4 years, 6 months ago

On Sat, 17 Jul 2021 at 21:40, Richard Henderson
<richard.henderson@linaro.org> wrote:
>
> On 7/17/21 3:06 AM, Peter Maydell wrote:
> > On Fri, 16 Jul 2021 at 23:12, Richard Henderson
> > <richard.henderson@linaro.org> wrote:
> >>
> >> On 7/13/21 6:37 AM, Peter Maydell wrote:
> >>> Implement the MVE VMLAS insn, which multiplies a vector by a vector
> >>> and adds a scalar.
> >>>
> >>> Signed-off-by: Peter Maydell<peter.maydell@linaro.org>
> >>> ---
> >>>    target/arm/helper-mve.h    |  8 ++++++++
> >>>    target/arm/mve.decode      |  3 +++
> >>>    target/arm/mve_helper.c    | 31 +++++++++++++++++++++++++++++++
> >>>    target/arm/translate-mve.c |  2 ++
> >>>    4 files changed, 44 insertions(+)
> >> ...
> >>
> >>> +/* Vector by vector plus scalar */
> >>> +#define DO_VMLAS(D, N, M) ((N) * (D) + (M))
> >>> +
> >>> +DO_2OP_ACC_SCALAR_S(vmlass, DO_VMLAS)
> >>> +DO_2OP_ACC_SCALAR_U(vmlasu, DO_VMLAS)
> >>
> >> This is confusing.  The ARM says
> >>
> >> # Operations that do not perform
> >> # widening are always unsigned (encoded with U=1),
> >
> > I have noticed that that text often appears for insns where it doesn't
> > really apply. I mostly ignore the text in favour of looking at
> > the pseudocode for working out what is supposed to be done.

NB that in the latest version of the v8M Arm ARM (B.p) the
"operations that do not perform widening..." text has been removed.

> Yes, but in this case there's nothing about the pseudocode that suggests that sign matters
> at all.  Neither the multiply nor the addition are widening.  So is there really a signed
> VMLAS instruction?

There really is a U bit, but it doesn't affect the result :-)
I will remove the unnecessary extra set of helpers (similarly
for VMLA vector-by-scalar-plus-vector)...

thanks
-- PMM