Implement FPProcessNaNs4 within f16_dotadd, rather than
simply letting NaNs propagate through the function.
Cc: qemu-stable@nongnu.org
Fixes: 0e1850182a1 ("target/arm: Implement FPCR.EBF=1 semantics for bfdotadd()")
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
target/arm/tcg/vec_helper.c | 75 ++++++++++++++++++++++++++-----------
1 file changed, 53 insertions(+), 22 deletions(-)
diff --git a/target/arm/tcg/vec_helper.c b/target/arm/tcg/vec_helper.c
index 986eaf8ffa..21c6175d2e 100644
--- a/target/arm/tcg/vec_helper.c
+++ b/target/arm/tcg/vec_helper.c
@@ -2989,31 +2989,62 @@ float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2, float_status *fpst)
float32 bfdotadd_ebf(float32 sum, uint32_t e1, uint32_t e2,
float_status *fpst, float_status *fpst_odd)
{
- /*
- * Compare f16_dotadd() in sme_helper.c, but here we have
- * bfloat16 inputs. In particular that means that we do not
- * want the FPCR.FZ16 flush semantics, so we use the normal
- * float_status for the input handling here.
- */
- float64 e1r = float32_to_float64(e1 << 16, fpst);
- float64 e1c = float32_to_float64(e1 & 0xffff0000u, fpst);
- float64 e2r = float32_to_float64(e2 << 16, fpst);
- float64 e2c = float32_to_float64(e2 & 0xffff0000u, fpst);
- float64 t64;
+ float32 s1r = e1 << 16;
+ float32 s1c = e1 & 0xffff0000u;
+ float32 s2r = e2 << 16;
+ float32 s2c = e2 & 0xffff0000u;
float32 t32;
- /*
- * The ARM pseudocode function FPDot performs both multiplies
- * and the add with a single rounding operation. Emulate this
- * by performing the first multiply in round-to-odd, then doing
- * the second multiply as fused multiply-add, and rounding to
- * float32 all in one step.
- */
- t64 = float64_mul(e1r, e2r, fpst_odd);
- t64 = float64r32_muladd(e1c, e2c, t64, 0, fpst);
+ /* C.f. FPProcessNaNs4 */
+ if (float32_is_any_nan(s1r) || float32_is_any_nan(s1c) ||
+ float32_is_any_nan(s2r) || float32_is_any_nan(s2c)) {
+ if (float32_is_signaling_nan(s2r, fpst)) {
+ t32 = s2r;
+ } else if (float32_is_signaling_nan(s2c, fpst)) {
+ t32 = s2c;
+ } else if (float32_is_signaling_nan(s2r, fpst)) {
+ t32 = s2r;
+ } else if (float32_is_signaling_nan(s2c, fpst)) {
+ t32 = s2c;
+ } else if (float32_is_any_nan(s2r)) {
+ t32 = s2r;
+ } else if (float32_is_any_nan(s2c)) {
+ t32 = s2c;
+ } else if (float32_is_any_nan(s2r)) {
+ t32 = s2r;
+ } else {
+ t32 = s2c;
+ }
+ /*
+ * FPConvertNaN(FPProcessNaN(t32)) will be done as part
+ * of the final addition below.
+ */
+ } else {
+ /*
+ * Compare f16_dotadd() in sme_helper.c, but here we have
+ * bfloat16 inputs. In particular that means that we do not
+ * want the FPCR.FZ16 flush semantics, so we use the normal
+ * float_status for the input handling here.
+ */
+ float64 e1r = float32_to_float64(s1r, fpst);
+ float64 e1c = float32_to_float64(s1c, fpst);
+ float64 e2r = float32_to_float64(s2r, fpst);
+ float64 e2c = float32_to_float64(s2c, fpst);
+ float64 t64;
- /* This conversion is exact, because we've already rounded. */
- t32 = float64_to_float32(t64, fpst);
+ /*
+ * The ARM pseudocode function FPDot performs both multiplies
+ * and the add with a single rounding operation. Emulate this
+ * by performing the first multiply in round-to-odd, then doing
+ * the second multiply as fused multiply-add, and rounding to
+ * float32 all in one step.
+ */
+ t64 = float64_mul(e1r, e2r, fpst_odd);
+ t64 = float64r32_muladd(e1c, e2c, t64, 0, fpst);
+
+ /* This conversion is exact, because we've already rounded. */
+ t32 = float64_to_float32(t64, fpst);
+ }
/* The final accumulation step is not fused. */
return float32_add(sum, t32, fpst);
--
2.43.0