[v2] target/arm: Fix 32-bit SMOPA

[PATCH v2] target/arm: Fix 32-bit SMOPA

Posted by Richard Henderson 1 month, 3 weeks ago

While the 8-bit input elements are sequential in the input vector,
the 32-bit output elements are not sequential in the output matrix.
Do not attempt to compute 2 32-bit outputs at the same time.

Cc: qemu-stable@nongnu.org
Fixes: 23a5e3859f5 ("target/arm: Implement SME integer outer product")
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2083
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---

v2: Fixed endian issue; double-checked on s390x.

---
 target/arm/tcg/sme_helper.c       | 77 ++++++++++++++++++-------------
 tests/tcg/aarch64/sme-smopa-1.c   | 47 +++++++++++++++++++
 tests/tcg/aarch64/sme-smopa-2.c   | 54 ++++++++++++++++++++++
 tests/tcg/aarch64/Makefile.target |  2 +-
 4 files changed, 147 insertions(+), 33 deletions(-)
 create mode 100644 tests/tcg/aarch64/sme-smopa-1.c
 create mode 100644 tests/tcg/aarch64/sme-smopa-2.c

diff --git a/target/arm/tcg/sme_helper.c b/target/arm/tcg/sme_helper.c
index 904bfdac43..e2e0575039 100644
--- a/target/arm/tcg/sme_helper.c
+++ b/target/arm/tcg/sme_helper.c
@@ -1083,11 +1083,32 @@ void HELPER(sme_bfmopa)(void *vza, void *vzn, void *vzm, void *vpn,
     }
 }
 
-typedef uint64_t IMOPFn(uint64_t, uint64_t, uint64_t, uint8_t, bool);
+typedef uint32_t IMOPFn32(uint32_t, uint32_t, uint32_t, uint8_t, bool);
+static inline void do_imopa_s(uint32_t *za, uint32_t *zn, uint32_t *zm,
+                              uint8_t *pn, uint8_t *pm,
+                              uint32_t desc, IMOPFn32 *fn)
+{
+    intptr_t row, col, oprsz = simd_oprsz(desc) / 4;
+    bool neg = simd_data(desc);
 
-static inline void do_imopa(uint64_t *za, uint64_t *zn, uint64_t *zm,
-                            uint8_t *pn, uint8_t *pm,
-                            uint32_t desc, IMOPFn *fn)
+    for (row = 0; row < oprsz; ++row) {
+        uint8_t pa = (pn[H1(row >> 1)] >> ((row & 1) * 4)) & 0xf;
+        uint32_t *za_row = &za[tile_vslice_index(row)];
+        uint32_t n = zn[H4(row)];
+
+        for (col = 0; col < oprsz; ++col) {
+            uint8_t pb = pm[H1(col >> 1)] >> ((col & 1) * 4);
+            uint32_t *a = &za_row[H4(col)];
+
+            *a = fn(n, zm[H4(col)], *a, pa & pb, neg);
+        }
+    }
+}
+
+typedef uint64_t IMOPFn64(uint64_t, uint64_t, uint64_t, uint8_t, bool);
+static inline void do_imopa_d(uint64_t *za, uint64_t *zn, uint64_t *zm,
+                              uint8_t *pn, uint8_t *pm,
+                              uint32_t desc, IMOPFn64 *fn)
 {
     intptr_t row, col, oprsz = simd_oprsz(desc) / 8;
     bool neg = simd_data(desc);
@@ -1107,25 +1128,16 @@ static inline void do_imopa(uint64_t *za, uint64_t *zn, uint64_t *zm,
 }
 
 #define DEF_IMOP_32(NAME, NTYPE, MTYPE) \
-static uint64_t NAME(uint64_t n, uint64_t m, uint64_t a, uint8_t p, bool neg) \
+static uint32_t NAME(uint32_t n, uint32_t m, uint32_t a, uint8_t p, bool neg) \
 {                                                                           \
-    uint32_t sum0 = 0, sum1 = 0;                                            \
+    uint32_t sum = 0;                                                       \
     /* Apply P to N as a mask, making the inactive elements 0. */           \
     n &= expand_pred_b(p);                                                  \
-    sum0 += (NTYPE)(n >> 0) * (MTYPE)(m >> 0);                              \
-    sum0 += (NTYPE)(n >> 8) * (MTYPE)(m >> 8);                              \
-    sum0 += (NTYPE)(n >> 16) * (MTYPE)(m >> 16);                            \
-    sum0 += (NTYPE)(n >> 24) * (MTYPE)(m >> 24);                            \
-    sum1 += (NTYPE)(n >> 32) * (MTYPE)(m >> 32);                            \
-    sum1 += (NTYPE)(n >> 40) * (MTYPE)(m >> 40);                            \
-    sum1 += (NTYPE)(n >> 48) * (MTYPE)(m >> 48);                            \
-    sum1 += (NTYPE)(n >> 56) * (MTYPE)(m >> 56);                            \
-    if (neg) {                                                              \
-        sum0 = (uint32_t)a - sum0, sum1 = (uint32_t)(a >> 32) - sum1;       \
-    } else {                                                                \
-        sum0 = (uint32_t)a + sum0, sum1 = (uint32_t)(a >> 32) + sum1;       \
-    }                                                                       \
-    return ((uint64_t)sum1 << 32) | sum0;                                   \
+    sum += (NTYPE)(n >> 0) * (MTYPE)(m >> 0);                               \
+    sum += (NTYPE)(n >> 8) * (MTYPE)(m >> 8);                               \
+    sum += (NTYPE)(n >> 16) * (MTYPE)(m >> 16);                             \
+    sum += (NTYPE)(n >> 24) * (MTYPE)(m >> 24);                             \
+    return neg ? a - sum : a + sum;                                         \
 }
 
 #define DEF_IMOP_64(NAME, NTYPE, MTYPE) \
@@ -1151,16 +1163,17 @@ DEF_IMOP_64(umopa_d, uint16_t, uint16_t)
 DEF_IMOP_64(sumopa_d, int16_t, uint16_t)
 DEF_IMOP_64(usmopa_d, uint16_t, int16_t)
 
-#define DEF_IMOPH(NAME) \
-    void HELPER(sme_##NAME)(void *vza, void *vzn, void *vzm, void *vpn,      \
-                            void *vpm, uint32_t desc)                        \
-    { do_imopa(vza, vzn, vzm, vpn, vpm, desc, NAME); }
+#define DEF_IMOPH(NAME, S) \
+    void HELPER(sme_##NAME##_##S)(void *vza, void *vzn, void *vzm,          \
+                                  void *vpn, void *vpm, uint32_t desc)      \
+    { do_imopa_##S(vza, vzn, vzm, vpn, vpm, desc, NAME##_##S); }
 
-DEF_IMOPH(smopa_s)
-DEF_IMOPH(umopa_s)
-DEF_IMOPH(sumopa_s)
-DEF_IMOPH(usmopa_s)
-DEF_IMOPH(smopa_d)
-DEF_IMOPH(umopa_d)
-DEF_IMOPH(sumopa_d)
-DEF_IMOPH(usmopa_d)
+DEF_IMOPH(smopa, s)
+DEF_IMOPH(umopa, s)
+DEF_IMOPH(sumopa, s)
+DEF_IMOPH(usmopa, s)
+
+DEF_IMOPH(smopa, d)
+DEF_IMOPH(umopa, d)
+DEF_IMOPH(sumopa, d)
+DEF_IMOPH(usmopa, d)
diff --git a/tests/tcg/aarch64/sme-smopa-1.c b/tests/tcg/aarch64/sme-smopa-1.c
new file mode 100644
index 0000000000..c62d5e0007
--- /dev/null
+++ b/tests/tcg/aarch64/sme-smopa-1.c
@@ -0,0 +1,47 @@
+#include <stdio.h>
+#include <string.h>
+
+int main()
+{
+    static const int cmp[4][4] = {
+        {  110,  134,  158,  182 },
+        {  390,  478,  566,  654 },
+        {  670,  822,  974, 1126 },
+        {  950, 1166, 1382, 1598 }
+    };
+    int dst[4][4];
+    int *tmp = &dst[0][0];
+
+    asm volatile(
+        ".arch armv8-r+sme\n\t"
+        "smstart\n\t"
+        "index z0.b, #0, #1\n\t"
+        "movprfx z1, z0\n\t"
+        "add z1.b, z1.b, #16\n\t"
+        "ptrue p0.b\n\t"
+        "smopa za0.s, p0/m, p0/m, z0.b, z1.b\n\t"
+        "ptrue p0.s, vl4\n\t"
+        "mov w12, #0\n\t"
+        "st1w { za0h.s[w12, #0] }, p0, [%0]\n\t"
+        "add %0, %0, #16\n\t"
+        "st1w { za0h.s[w12, #1] }, p0, [%0]\n\t"
+        "add %0, %0, #16\n\t"
+        "st1w { za0h.s[w12, #2] }, p0, [%0]\n\t"
+        "add %0, %0, #16\n\t"
+        "st1w { za0h.s[w12, #3] }, p0, [%0]\n\t"
+        "smstop"
+        : "+r"(tmp) : : "memory");
+
+    if (memcmp(cmp, dst, sizeof(dst)) == 0) {
+        return 0;
+    }
+
+    /* See above for correct results. */
+    for (int i = 0; i < 4; ++i) {
+        for (int j = 0; j < 4; ++j) {
+            printf("%6d", dst[i][j]);
+        }
+        printf("\n");
+    }
+    return 1;
+}
diff --git a/tests/tcg/aarch64/sme-smopa-2.c b/tests/tcg/aarch64/sme-smopa-2.c
new file mode 100644
index 0000000000..c9f48c3bfc
--- /dev/null
+++ b/tests/tcg/aarch64/sme-smopa-2.c
@@ -0,0 +1,54 @@
+#include <stdio.h>
+#include <string.h>
+
+int main()
+{
+    static const long cmp[4][4] = {
+        {  110,  134,  158,  182 },
+        {  390,  478,  566,  654 },
+        {  670,  822,  974, 1126 },
+        {  950, 1166, 1382, 1598 }
+    };
+    long dst[4][4];
+    long *tmp = &dst[0][0];
+    long svl;
+
+    /* Validate that we have a wide enough vector for 4 elements. */
+    asm(".arch armv8-r+sme-i64\n\trdsvl %0, #1" : "=r"(svl));
+    if (svl < 32) {
+        return 0;
+    }
+
+    asm volatile(
+        "smstart\n\t"
+        "index z0.h, #0, #1\n\t"
+        "movprfx z1, z0\n\t"
+        "add z1.h, z1.h, #16\n\t"
+        "ptrue p0.b\n\t"
+        "smopa za0.d, p0/m, p0/m, z0.h, z1.h\n\t"
+        "ptrue p0.d, vl4\n\t"
+        "mov w12, #0\n\t"
+        "st1d { za0h.d[w12, #0] }, p0, [%0]\n\t"
+        "add %0, %0, #32\n\t"
+        "st1d { za0h.d[w12, #1] }, p0, [%0]\n\t"
+        "mov w12, #2\n\t"
+        "add %0, %0, #32\n\t"
+        "st1d { za0h.d[w12, #0] }, p0, [%0]\n\t"
+        "add %0, %0, #32\n\t"
+        "st1d { za0h.d[w12, #1] }, p0, [%0]\n\t"
+        "smstop"
+        : "+r"(tmp) : : "memory");
+
+    if (memcmp(cmp, dst, sizeof(dst)) == 0) {
+        return 0;
+    }
+
+    /* See above for correct results. */
+    for (int i = 0; i < 4; ++i) {
+        for (int j = 0; j < 4; ++j) {
+            printf("%6ld", dst[i][j]);
+        }
+        printf("\n");
+    }
+    return 1;
+}
diff --git a/tests/tcg/aarch64/Makefile.target b/tests/tcg/aarch64/Makefile.target
index cded1d01fc..ea3e232e65 100644
--- a/tests/tcg/aarch64/Makefile.target
+++ b/tests/tcg/aarch64/Makefile.target
@@ -67,7 +67,7 @@ endif
 
 # SME Tests
 ifneq ($(CROSS_AS_HAS_ARMV9_SME),)
-AARCH64_TESTS += sme-outprod1
+AARCH64_TESTS += sme-outprod1 sme-smopa-1 sme-smopa-2
 endif
 
 # System Registers Tests
-- 
2.34.1

Re: [PATCH v2] target/arm: Fix 32-bit SMOPA

Posted by Michael Tokarev 1 month, 2 weeks ago

05.03.2024 19:39, Richard Henderson wrote:
> While the 8-bit input elements are sequential in the input vector,
> the 32-bit output elements are not sequential in the output matrix.
> Do not attempt to compute 2 32-bit outputs at the same time.
> 
> Cc: qemu-stable@nongnu.org
> Fixes: 23a5e3859f5 ("target/arm: Implement SME integer outer product")
> Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2083
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

...
> diff --git a/tests/tcg/aarch64/Makefile.target b/tests/tcg/aarch64/Makefile.target
> index cded1d01fc..ea3e232e65 100644
> --- a/tests/tcg/aarch64/Makefile.target
> +++ b/tests/tcg/aarch64/Makefile.target
> @@ -67,7 +67,7 @@ endif
>   
>   # SME Tests
>   ifneq ($(CROSS_AS_HAS_ARMV9_SME),)
> -AARCH64_TESTS += sme-outprod1
> +AARCH64_TESTS += sme-outprod1 sme-smopa-1 sme-smopa-2
>   endif

I tried to pick this one up for stable-7.2 (since the fix is for older commit),
and faced a fun issue in this change to tests/tcg/aarch64/Makefile.target,
since 7.2. doesn't have CROSS_AS_HAS_ARMV9_SME yet.  I went on and found out
that the following commits:

v7.2.0-374-gbc6bd20ee3  target/arm: align exposed ID registers with Linux
v8.0.0-2358-g3dc2afeab2 tests/tcg/aarch64/sysregs.c: Use S syntax for id_aa64zfr0_el1
                         and id_aa64smfr0_el1
v8.0.0-2361-g1f51573f79 target/arm: Fix SME full tile indexing

applies to 7.2, and lets this "Fix 32-bit SMOPA" change to apply cleanly,
including this very change in Makefile.target.

Now, 1f51573f79 "Fix SME full tile indexing" is Cc'd qemu-stable already, but
it is not in 7.2.x for some reason which I don't remember anymore, so it is
a good one to pick up already.  3dc2afeab is tests-only.

And bc6bd20ee3 (from Dec-2022) seems like a good candidate too, is it not?

I'm picking up all the bunch for now, let's see..

Thanks,

/mjt

Re: [PATCH v2] target/arm: Fix 32-bit SMOPA

Posted by Richard Henderson 1 month, 2 weeks ago

On 3/9/24 08:40, Michael Tokarev wrote:
> 05.03.2024 19:39, Richard Henderson wrote:
>> While the 8-bit input elements are sequential in the input vector,
>> the 32-bit output elements are not sequential in the output matrix.
>> Do not attempt to compute 2 32-bit outputs at the same time.
>>
>> Cc: qemu-stable@nongnu.org
>> Fixes: 23a5e3859f5 ("target/arm: Implement SME integer outer product")
>> Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2083
>> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> 
> ...
>> diff --git a/tests/tcg/aarch64/Makefile.target b/tests/tcg/aarch64/Makefile.target
>> index cded1d01fc..ea3e232e65 100644
>> --- a/tests/tcg/aarch64/Makefile.target
>> +++ b/tests/tcg/aarch64/Makefile.target
>> @@ -67,7 +67,7 @@ endif
>>   # SME Tests
>>   ifneq ($(CROSS_AS_HAS_ARMV9_SME),)
>> -AARCH64_TESTS += sme-outprod1
>> +AARCH64_TESTS += sme-outprod1 sme-smopa-1 sme-smopa-2
>>   endif
> 
> I tried to pick this one up for stable-7.2 (since the fix is for older commit),
> and faced a fun issue in this change to tests/tcg/aarch64/Makefile.target,
> since 7.2. doesn't have CROSS_AS_HAS_ARMV9_SME yet.  I went on and found out
> that the following commits:
> 
> v7.2.0-374-gbc6bd20ee3  target/arm: align exposed ID registers with Linux
> v8.0.0-2358-g3dc2afeab2 tests/tcg/aarch64/sysregs.c: Use S syntax for id_aa64zfr0_el1
>                          and id_aa64smfr0_el1
> v8.0.0-2361-g1f51573f79 target/arm: Fix SME full tile indexing
> 
> applies to 7.2, and lets this "Fix 32-bit SMOPA" change to apply cleanly,
> including this very change in Makefile.target.
> 
> Now, 1f51573f79 "Fix SME full tile indexing" is Cc'd qemu-stable already, but
> it is not in 7.2.x for some reason which I don't remember anymore, so it is
> a good one to pick up already.  3dc2afeab is tests-only.

Oh wow, I didn't expect the fix to get propagated back that far.
I was expecting only back into the 8.x series...

> And bc6bd20ee3 (from Dec-2022) seems like a good candidate too, is it not?

Sure, couldn't hurt.

If it all applies without drama, all is well.


r~

Re: [PATCH v2] target/arm: Fix 32-bit SMOPA

Posted by Michael Tokarev 1 month, 2 weeks ago

10.03.2024 21:13, Richard Henderson wrote:
> On 3/9/24 08:40, Michael Tokarev wrote:
...
>> I tried to pick this one up for stable-7.2 (since the fix is for older commit),
>> and faced a fun issue in this change to tests/tcg/aarch64/Makefile.target,
>> since 7.2. doesn't have CROSS_AS_HAS_ARMV9_SME yet.  I went on and found out
>> that the following commits:
>>
>> v7.2.0-374-gbc6bd20ee3  target/arm: align exposed ID registers with Linux
>> v8.0.0-2358-g3dc2afeab2 tests/tcg/aarch64/sysregs.c: Use S syntax for id_aa64zfr0_el1
>>                          and id_aa64smfr0_el1
>> v8.0.0-2361-g1f51573f79 target/arm: Fix SME full tile indexing
>>
>> applies to 7.2, and lets this "Fix 32-bit SMOPA" change to apply cleanly,
>> including this very change in Makefile.target.
>>
>> Now, 1f51573f79 "Fix SME full tile indexing" is Cc'd qemu-stable already, but
>> it is not in 7.2.x for some reason which I don't remember anymore, so it is
>> a good one to pick up already.  3dc2afeab is tests-only.
> 
> Oh wow, I didn't expect the fix to get propagated back that far.
> I was expecting only back into the 8.x series...
> 
>> And bc6bd20ee3 (from Dec-2022) seems like a good candidate too, is it not?
> 
> Sure, couldn't hurt.
> 
> If it all applies without drama, all is well.

While it goes fine, it doesn't quite work actually:

https://gitlab.com/qemu-project/qemu/-/jobs/6386966720#L2482

timeout --foreground 90  /home/gitlab-runner/builds/E8PpwMky/0/qemu-project/qemu/build/qemu-aarch64  sme-smopa-1 >  sme-smopa-1.out
warning: TCG temporary leaks before 0000000000400714
qemu-aarch64: ../tcg/tcg.c:1052: tcg_temp_free_internal: Assertion `ts->temp_allocated != 0' failed.
timeout: the monitored command dumped core
Trace/breakpoint trap
make[1]: *** [Makefile:170: run-sme-smopa-1] Error 133
make[1]: Leaving directory '/home/gitlab-runner/builds/E8PpwMky/0/qemu-project/qemu/build/tests/tcg/aarch64-linux-user'
make[1]: *** Waiting for unfinished jobs....
make[1]: Entering directory '/home/gitlab-runner/builds/E8PpwMky/0/qemu-project/qemu/build/tests/tcg/aarch64-linux-user'
timeout --foreground 90  /home/gitlab-runner/builds/E8PpwMky/0/qemu-project/qemu/build/qemu-aarch64  sme-smopa-2 >  sme-smopa-2.out
warning: TCG temporary leaks before 0000000000400730
qemu-aarch64: ../tcg/tcg.c:1052: tcg_temp_free_internal: Assertion `ts->temp_allocated != 0' failed.
timeout: the monitored command dumped core
Trace/breakpoint trap

Does it make sense to pick this for 7.2 with the above failures?

Unfortunately this test does not run on a private repository clone on gitlab,
that's why I haven't noticed this immediately.

Thanks,

/mjt

Re: [PATCH v2] target/arm: Fix 32-bit SMOPA

Posted by Richard Henderson 1 month, 2 weeks ago

On 3/13/24 09:12, Michael Tokarev wrote:
> warning: TCG temporary leaks before 0000000000400730
> qemu-aarch64: ../tcg/tcg.c:1052: tcg_temp_free_internal: Assertion `ts->temp_allocated != 
> 0' failed.
> timeout: the monitored command dumped core
> Trace/breakpoint trap
> 
> Does it make sense to pick this for 7.2 with the above failures?

No, it doesn't.

I guess it was in the 8.x series that we eliminated the need for freeing tcg temporaries. 
The patch set would need adjustment for that, and I don't think it's worth the effort.

r~

Re: [PATCH v2] target/arm: Fix 32-bit SMOPA

Posted by Michael Tokarev 1 month, 2 weeks ago

10.03.2024 21:13, Richard Henderson :
...
> If it all applies without drama, all is well.

This very fix applies and works just fine on top of 7.2, it is only
the tests part (the Makefile.target fragment) which isn't, so it's
kinda problematic to apply it without a test.  And while figuring out
what to do with the tests, I found other changes which also should go
there.

It isn't just about lack of drama.  It applies cleanly and works fine
so far (unfortunately I don't have lots'a arm* guests to test it) and
the CI works fine too.

That's why I mentioned it in the first place, - the situation is kinda
fun :)

(and to let everyone know the changes which are being picked up too,
ofcourse).

Thanks,

/mjt

Re: [PATCH v2] target/arm: Fix 32-bit SMOPA

Posted by Peter Maydell 1 month, 3 weeks ago

On Tue, 5 Mar 2024 at 16:39, Richard Henderson
<richard.henderson@linaro.org> wrote:
>
> While the 8-bit input elements are sequential in the input vector,
> the 32-bit output elements are not sequential in the output matrix.
> Do not attempt to compute 2 32-bit outputs at the same time.
>
> Cc: qemu-stable@nongnu.org
> Fixes: 23a5e3859f5 ("target/arm: Implement SME integer outer product")
> Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2083
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>
> v2: Fixed endian issue; double-checked on s390x.
>


Applied to target-arm.next, thanks.

-- PMM

Re: [PATCH v2] target/arm: Fix 32-bit SMOPA

Posted by Philippe Mathieu-Daudé 1 month, 3 weeks ago

On 5/3/24 17:39, Richard Henderson wrote:
> While the 8-bit input elements are sequential in the input vector,
> the 32-bit output elements are not sequential in the output matrix.
> Do not attempt to compute 2 32-bit outputs at the same time.
> 
> Cc: qemu-stable@nongnu.org
> Fixes: 23a5e3859f5 ("target/arm: Implement SME integer outer product")
> Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2083
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
> 
> v2: Fixed endian issue; double-checked on s390x.
> 
> ---
>   target/arm/tcg/sme_helper.c       | 77 ++++++++++++++++++-------------
>   tests/tcg/aarch64/sme-smopa-1.c   | 47 +++++++++++++++++++
>   tests/tcg/aarch64/sme-smopa-2.c   | 54 ++++++++++++++++++++++
>   tests/tcg/aarch64/Makefile.target |  2 +-
>   4 files changed, 147 insertions(+), 33 deletions(-)
>   create mode 100644 tests/tcg/aarch64/sme-smopa-1.c
>   create mode 100644 tests/tcg/aarch64/sme-smopa-2.c

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>