[PATCH 4/4] target/i386: implement FMA instructions

Paolo Bonzini posted 4 patches 3 years, 3 months ago
Maintainers: Paolo Bonzini <pbonzini@redhat.com>, Richard Henderson <richard.henderson@linaro.org>, Eduardo Habkost <eduardo@habkost.net>
[PATCH 4/4] target/i386: implement FMA instructions
Posted by Paolo Bonzini 3 years, 3 months ago
The only issue with FMA instructions is that there are _a lot_ of them
(30 opcodes, each of which comes in up to 4 versions depending on VEX.W
and VEX.L).

We can reduce the number of helpers to one third by passing four operands
(one output and three inputs); the reordering of which operands go to
the multiply and which go to the add is done in emit.c.

Scalar versions do not do any merging; they only affect the bottom 32
or 64 bits of the output operand.  Therefore, there is no separate XMM
and YMM of the scalar helpers.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 target/i386/cpu.c                |  5 ++-
 target/i386/ops_sse.h            | 63 ++++++++++++++++++++++++++++++++
 target/i386/ops_sse_header.h     | 28 ++++++++++++++
 target/i386/tcg/decode-new.c.inc | 38 +++++++++++++++++++
 target/i386/tcg/decode-new.h     |  1 +
 target/i386/tcg/emit.c.inc       | 43 ++++++++++++++++++++++
 tests/tcg/i386/test-avx.py       |  2 +-
 7 files changed, 177 insertions(+), 3 deletions(-)

diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index 6292b7e12f..22b681ca37 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -625,10 +625,11 @@ void x86_cpu_vendor_words2str(char *dst, uint32_t vendor1,
           CPUID_EXT_SSE41 | CPUID_EXT_SSE42 | CPUID_EXT_POPCNT | \
           CPUID_EXT_XSAVE | /* CPUID_EXT_OSXSAVE is dynamic */   \
           CPUID_EXT_MOVBE | CPUID_EXT_AES | CPUID_EXT_HYPERVISOR | \
-          CPUID_EXT_RDRAND | CPUID_EXT_AVX | CPUID_EXT_F16C)
+          CPUID_EXT_RDRAND | CPUID_EXT_AVX | CPUID_EXT_F16C | \
+          CPUID_EXT_FMA)
           /* missing:
           CPUID_EXT_DTES64, CPUID_EXT_DSCPL, CPUID_EXT_VMX, CPUID_EXT_SMX,
-          CPUID_EXT_EST, CPUID_EXT_TM2, CPUID_EXT_CID, CPUID_EXT_FMA,
+          CPUID_EXT_EST, CPUID_EXT_TM2, CPUID_EXT_CID,
           CPUID_EXT_XTPR, CPUID_EXT_PDCM, CPUID_EXT_PCID, CPUID_EXT_DCA,
           CPUID_EXT_X2APIC, CPUID_EXT_TSC_DEADLINE_TIMER */
 
diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h
index 33c61896ee..041a048a70 100644
--- a/target/i386/ops_sse.h
+++ b/target/i386/ops_sse.h
@@ -2522,6 +2522,69 @@ void helper_vpermd_ymm(Reg *d, Reg *v, Reg *s)
 }
 #endif
 
+/* FMA3 op helpers */
+#if SHIFT == 1
+#define SSE_HELPER_FMAS(name, elem, F)                                         \
+    void name(CPUX86State *env, Reg *d, Reg *a, Reg *b, Reg *c)                \
+    {                                                                          \
+        d->elem(0) = F(a->elem(0), b->elem(0), c->elem(0));                    \
+    }
+#define SSE_HELPER_FMAP(name, elem, num, F)                                    \
+    void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *a, Reg *b, Reg *c)  \
+    {                                                                          \
+        int i;                                                                 \
+        for (i = 0; i < num; i++) {                                            \
+            d->elem(i) = F(a->elem(i), b->elem(i), c->elem(i));                \
+        }                                                                      \
+    }
+
+#define FMADD32(a, b, c) float32_muladd(a, b, c, 0, &env->sse_status)
+#define FMADD64(a, b, c) float64_muladd(a, b, c, 0, &env->sse_status)
+
+#define FMNADD32(a, b, c) float32_muladd(a, b, c, float_muladd_negate_product, &env->sse_status)
+#define FMNADD64(a, b, c) float64_muladd(a, b, c, float_muladd_negate_product, &env->sse_status)
+
+#define FMSUB32(a, b, c) float32_muladd(a, b, c, float_muladd_negate_c, &env->sse_status)
+#define FMSUB64(a, b, c) float64_muladd(a, b, c, float_muladd_negate_c, &env->sse_status)
+
+#define FMNSUB32(a, b, c) float32_muladd(a, b, c, float_muladd_negate_c|float_muladd_negate_product, &env->sse_status)
+#define FMNSUB64(a, b, c) float64_muladd(a, b, c, float_muladd_negate_c|float_muladd_negate_product, &env->sse_status)
+
+#define FMADDSUB32(a, b, c) float32_muladd(a, b, c, (i & 1) ? 0 : float_muladd_negate_c, &env->sse_status)
+#define FMADDSUB64(a, b, c) float64_muladd(a, b, c, (i & 1) ? 0 : float_muladd_negate_c, &env->sse_status)
+
+#define FMSUBADD32(a, b, c) float32_muladd(a, b, c, (i & 1) ? float_muladd_negate_c : 0, &env->sse_status)
+#define FMSUBADD64(a, b, c) float64_muladd(a, b, c, (i & 1) ? float_muladd_negate_c : 0, &env->sse_status)
+
+SSE_HELPER_FMAS(helper_fmaddss,  ZMM_S,             FMADD32)
+SSE_HELPER_FMAS(helper_fmaddsd,  ZMM_D,             FMADD64)
+SSE_HELPER_FMAS(helper_fmnaddss, ZMM_S,             FMNADD32)
+SSE_HELPER_FMAS(helper_fmnaddsd, ZMM_D,             FMNADD64)
+SSE_HELPER_FMAS(helper_fmsubss,  ZMM_S,             FMSUB32)
+SSE_HELPER_FMAS(helper_fmsubsd,  ZMM_D,             FMSUB64)
+SSE_HELPER_FMAS(helper_fmnsubss, ZMM_S,             FMNSUB32)
+SSE_HELPER_FMAS(helper_fmnsubsd, ZMM_D,             FMNSUB64)
+#endif
+
+#if SHIFT >= 1
+SSE_HELPER_FMAP(helper_fmaddps,  ZMM_S, 2 << SHIFT, FMADD32)
+SSE_HELPER_FMAP(helper_fmaddpd,  ZMM_D, 1 << SHIFT, FMADD64)
+
+SSE_HELPER_FMAP(helper_fmnaddps, ZMM_S, 2 << SHIFT, FMNADD32)
+SSE_HELPER_FMAP(helper_fmnaddpd, ZMM_D, 1 << SHIFT, FMNADD64)
+
+SSE_HELPER_FMAP(helper_fmsubps,  ZMM_S, 2 << SHIFT, FMSUB32)
+SSE_HELPER_FMAP(helper_fmsubpd,  ZMM_D, 1 << SHIFT, FMSUB64)
+
+SSE_HELPER_FMAP(helper_fmnsubps, ZMM_S, 2 << SHIFT, FMNSUB32)
+SSE_HELPER_FMAP(helper_fmnsubpd, ZMM_D, 1 << SHIFT, FMNSUB64)
+
+SSE_HELPER_FMAP(helper_fmaddsubps,  ZMM_S, 2 << SHIFT, FMADDSUB32)
+SSE_HELPER_FMAP(helper_fmaddsubpd,  ZMM_D, 1 << SHIFT, FMADDSUB64)
+SSE_HELPER_FMAP(helper_fmsubaddps,  ZMM_S, 2 << SHIFT, FMSUBADD32)
+SSE_HELPER_FMAP(helper_fmsubaddpd,  ZMM_D, 1 << SHIFT, FMSUBADD64)
+#endif
+
 #undef SSE_HELPER_S
 
 #undef LANE_WIDTH
diff --git a/target/i386/ops_sse_header.h b/target/i386/ops_sse_header.h
index c4c41976c0..1f9a5c9e94 100644
--- a/target/i386/ops_sse_header.h
+++ b/target/i386/ops_sse_header.h
@@ -359,6 +359,34 @@ DEF_HELPER_3(glue(cvtph2ps, SUFFIX), void, env, Reg, Reg)
 DEF_HELPER_4(glue(cvtps2ph, SUFFIX), void, env, Reg, Reg, int)
 #endif
 
+/* FMA3 helpers */
+#if SHIFT == 1
+DEF_HELPER_5(fmaddss, void, env, Reg, Reg, Reg, Reg)
+DEF_HELPER_5(fmaddsd, void, env, Reg, Reg, Reg, Reg)
+DEF_HELPER_5(fmnaddss, void, env, Reg, Reg, Reg, Reg)
+DEF_HELPER_5(fmnaddsd, void, env, Reg, Reg, Reg, Reg)
+DEF_HELPER_5(fmsubss, void, env, Reg, Reg, Reg, Reg)
+DEF_HELPER_5(fmsubsd, void, env, Reg, Reg, Reg, Reg)
+DEF_HELPER_5(fmnsubss, void, env, Reg, Reg, Reg, Reg)
+DEF_HELPER_5(fmnsubsd, void, env, Reg, Reg, Reg, Reg)
+#endif
+
+#if SHIFT >= 1
+DEF_HELPER_5(glue(fmaddps, SUFFIX), void, env, Reg, Reg, Reg, Reg)
+DEF_HELPER_5(glue(fmaddpd, SUFFIX), void, env, Reg, Reg, Reg, Reg)
+DEF_HELPER_5(glue(fmnaddps,SUFFIX), void, env, Reg, Reg, Reg, Reg)
+DEF_HELPER_5(glue(fmnaddpd,SUFFIX), void, env, Reg, Reg, Reg, Reg)
+DEF_HELPER_5(glue(fmsubps, SUFFIX), void, env, Reg, Reg, Reg, Reg)
+DEF_HELPER_5(glue(fmsubpd, SUFFIX), void, env, Reg, Reg, Reg, Reg)
+DEF_HELPER_5(glue(fmnsubps, SUFFIX), void, env, Reg, Reg, Reg, Reg)
+DEF_HELPER_5(glue(fmnsubpd, SUFFIX), void, env, Reg, Reg, Reg, Reg)
+
+DEF_HELPER_5(glue(fmaddsubps, SUFFIX), void, env, Reg, Reg, Reg, Reg)
+DEF_HELPER_5(glue(fmaddsubpd, SUFFIX), void, env, Reg, Reg, Reg, Reg)
+DEF_HELPER_5(glue(fmsubaddps, SUFFIX), void, env, Reg, Reg, Reg, Reg)
+DEF_HELPER_5(glue(fmsubaddpd, SUFFIX), void, env, Reg, Reg, Reg, Reg)
+#endif
+
 /* AVX helpers */
 #if SHIFT >= 1
 DEF_HELPER_4(glue(vpermilpd, SUFFIX), void, env, Reg, Reg, Reg)
diff --git a/target/i386/tcg/decode-new.c.inc b/target/i386/tcg/decode-new.c.inc
index 8baee9018a..8a6b0ae37c 100644
--- a/target/i386/tcg/decode-new.c.inc
+++ b/target/i386/tcg/decode-new.c.inc
@@ -376,6 +376,15 @@ static const X86OpEntry opcodes_0F38_00toEF[240] = {
     [0x92] = X86_OP_ENTRY3(VPGATHERD, V,x,  H,x,  M,d,  vex12 cpuid(AVX2) p_66), /* vgatherdps/d */
     [0x93] = X86_OP_ENTRY3(VPGATHERQ, V,x,  H,x,  M,q,  vex12 cpuid(AVX2) p_66), /* vgatherqps/d */
 
+    [0x96] = X86_OP_ENTRY3(VFMADDSUB132Px, V,x,  H,x, W,x,  vex6 cpuid(FMA) p_66),
+    [0x97] = X86_OP_ENTRY3(VFMSUBADD132Px, V,x,  H,x, W,x,  vex6 cpuid(FMA) p_66),
+
+    [0xa6] = X86_OP_ENTRY3(VFMADDSUB213Px, V,x,  H,x, W,x,  vex6 cpuid(FMA) p_66),
+    [0xa7] = X86_OP_ENTRY3(VFMSUBADD213Px, V,x,  H,x, W,x,  vex6 cpuid(FMA) p_66),
+
+    [0xb6] = X86_OP_ENTRY3(VFMADDSUB231Px, V,x,  H,x, W,x,  vex6 cpuid(FMA) p_66),
+    [0xb7] = X86_OP_ENTRY3(VFMSUBADD231Px, V,x,  H,x, W,x,  vex6 cpuid(FMA) p_66),
+
     [0x08] = X86_OP_ENTRY3(PSIGNB,    V,x,        H,x,  W,x,  vex4 cpuid(SSSE3) mmx avx2_256 p_00_66),
     [0x09] = X86_OP_ENTRY3(PSIGNW,    V,x,        H,x,  W,x,  vex4 cpuid(SSSE3) mmx avx2_256 p_00_66),
     [0x0a] = X86_OP_ENTRY3(PSIGND,    V,x,        H,x,  W,x,  vex4 cpuid(SSSE3) mmx avx2_256 p_00_66),
@@ -421,6 +430,33 @@ static const X86OpEntry opcodes_0F38_00toEF[240] = {
     [0x8c] = X86_OP_ENTRY3(VPMASKMOV,    V,x,  H,x, WM,x, vex6 cpuid(AVX2) p_66),
     [0x8e] = X86_OP_ENTRY3(VPMASKMOV_st, M,x,  V,x, H,x,  vex6 cpuid(AVX2) p_66),
 
+    [0x98] = X86_OP_ENTRY3(VFMADD132Px,  V,x,  H,x, W,x,  vex6 cpuid(FMA) p_66),
+    [0x99] = X86_OP_ENTRY3(VFMADD132Sx,  V,x,  H,x, W,x,  vex6 cpuid(FMA) p_66),
+    [0x9a] = X86_OP_ENTRY3(VFMSUB132Px,  V,x,  H,x, W,x,  vex6 cpuid(FMA) p_66),
+    [0x9b] = X86_OP_ENTRY3(VFMSUB132Sx,  V,x,  H,x, W,x,  vex6 cpuid(FMA) p_66),
+    [0x9c] = X86_OP_ENTRY3(VFNMADD132Px, V,x,  H,x, W,x,  vex6 cpuid(FMA) p_66),
+    [0x9d] = X86_OP_ENTRY3(VFNMADD132Sx, V,x,  H,x, W,x,  vex6 cpuid(FMA) p_66),
+    [0x9e] = X86_OP_ENTRY3(VFNMSUB132Px, V,x,  H,x, W,x,  vex6 cpuid(FMA) p_66),
+    [0x9f] = X86_OP_ENTRY3(VFNMSUB132Sx, V,x,  H,x, W,x,  vex6 cpuid(FMA) p_66),
+
+    [0xa8] = X86_OP_ENTRY3(VFMADD213Px,  V,x,  H,x, W,x,  vex6 cpuid(FMA) p_66),
+    [0xa9] = X86_OP_ENTRY3(VFMADD213Sx,  V,x,  H,x, W,x,  vex6 cpuid(FMA) p_66),
+    [0xaa] = X86_OP_ENTRY3(VFMSUB213Px,  V,x,  H,x, W,x,  vex6 cpuid(FMA) p_66),
+    [0xab] = X86_OP_ENTRY3(VFMSUB213Sx,  V,x,  H,x, W,x,  vex6 cpuid(FMA) p_66),
+    [0xac] = X86_OP_ENTRY3(VFNMADD213Px, V,x,  H,x, W,x,  vex6 cpuid(FMA) p_66),
+    [0xad] = X86_OP_ENTRY3(VFNMADD213Sx, V,x,  H,x, W,x,  vex6 cpuid(FMA) p_66),
+    [0xae] = X86_OP_ENTRY3(VFNMSUB213Px, V,x,  H,x, W,x,  vex6 cpuid(FMA) p_66),
+    [0xaf] = X86_OP_ENTRY3(VFNMSUB213Sx, V,x,  H,x, W,x,  vex6 cpuid(FMA) p_66),
+
+    [0xb8] = X86_OP_ENTRY3(VFMADD231Px,  V,x,  H,x, W,x,  vex6 cpuid(FMA) p_66),
+    [0xb9] = X86_OP_ENTRY3(VFMADD231Sx,  V,x,  H,x, W,x,  vex6 cpuid(FMA) p_66),
+    [0xba] = X86_OP_ENTRY3(VFMSUB231Px,  V,x,  H,x, W,x,  vex6 cpuid(FMA) p_66),
+    [0xbb] = X86_OP_ENTRY3(VFMSUB231Sx,  V,x,  H,x, W,x,  vex6 cpuid(FMA) p_66),
+    [0xbc] = X86_OP_ENTRY3(VFNMADD231Px, V,x,  H,x, W,x,  vex6 cpuid(FMA) p_66),
+    [0xbd] = X86_OP_ENTRY3(VFNMADD231Sx, V,x,  H,x, W,x,  vex6 cpuid(FMA) p_66),
+    [0xbe] = X86_OP_ENTRY3(VFNMSUB231Px, V,x,  H,x, W,x,  vex6 cpuid(FMA) p_66),
+    [0xbf] = X86_OP_ENTRY3(VFNMSUB231Sx, V,x,  H,x, W,x,  vex6 cpuid(FMA) p_66),
+
     [0xdb] = X86_OP_ENTRY3(VAESIMC,     V,dq, None,None, W,dq, vex4 cpuid(AES) p_66),
     [0xdc] = X86_OP_ENTRY3(VAESENC,     V,x,  H,x,       W,x,  vex4 cpuid(AES) p_66),
     [0xdd] = X86_OP_ENTRY3(VAESENCLAST, V,x,  H,x,       W,x,  vex4 cpuid(AES) p_66),
@@ -1350,6 +1386,8 @@ static bool has_cpuid_feature(DisasContext *s, X86CPUIDFeature cpuid)
         return true;
     case X86_FEAT_F16C:
         return (s->cpuid_ext_features & CPUID_EXT_F16C);
+    case X86_FEAT_FMA:
+        return (s->cpuid_ext_features & CPUID_EXT_FMA);
     case X86_FEAT_MOVBE:
         return (s->cpuid_ext_features & CPUID_EXT_MOVBE);
     case X86_FEAT_PCLMULQDQ:
diff --git a/target/i386/tcg/decode-new.h b/target/i386/tcg/decode-new.h
index 0ef54628ee..cb6b8bcf67 100644
--- a/target/i386/tcg/decode-new.h
+++ b/target/i386/tcg/decode-new.h
@@ -105,6 +105,7 @@ typedef enum X86CPUIDFeature {
     X86_FEAT_BMI1,
     X86_FEAT_BMI2,
     X86_FEAT_F16C,
+    X86_FEAT_FMA,
     X86_FEAT_MOVBE,
     X86_FEAT_PCLMULQDQ,
     X86_FEAT_SSE,
diff --git a/target/i386/tcg/emit.c.inc b/target/i386/tcg/emit.c.inc
index 9334f0939d..9e234f71f7 100644
--- a/target/i386/tcg/emit.c.inc
+++ b/target/i386/tcg/emit.c.inc
@@ -491,6 +491,49 @@ FP_SSE(VMIN, min)
 FP_SSE(VDIV, div)
 FP_SSE(VMAX, max)
 
+#define FMA_SSE_PACKED(uname, lname, ptr0, ptr1, ptr2)                             \
+static void gen_##uname##Px(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
+{                                                                                  \
+    SSEFunc_0_epppp xmm = s->vex_w ? gen_helper_##lname##pd_xmm : gen_helper_##lname##ps_xmm; \
+    SSEFunc_0_epppp ymm = s->vex_w ? gen_helper_##lname##pd_ymm : gen_helper_##lname##ps_ymm; \
+    SSEFunc_0_epppp fn = s->vex_l ? ymm : xmm;                                     \
+                                                                                   \
+    fn(cpu_env, OP_PTR0, ptr0, ptr1, ptr2);                                        \
+}
+
+#define FMA_SSE(uname, lname, ptr0, ptr1, ptr2)                                    \
+FMA_SSE_PACKED(uname, lname, ptr0, ptr1, ptr2)                                     \
+static void gen_##uname##Sx(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
+{                                                                                  \
+    SSEFunc_0_epppp fn = s->vex_w ? gen_helper_##lname##sd : gen_helper_##lname##ss; \
+                                                                                   \
+    fn(cpu_env, OP_PTR0, ptr0, ptr1, ptr2);                                        \
+}                                                                                  \
+
+FMA_SSE(VFMADD231,    fmadd,    OP_PTR1, OP_PTR2, OP_PTR0)
+FMA_SSE(VFMADD213,    fmadd,    OP_PTR1, OP_PTR0, OP_PTR2)
+FMA_SSE(VFMADD132,    fmadd,    OP_PTR0, OP_PTR2, OP_PTR1)
+
+FMA_SSE(VFNMADD231,   fmnadd,   OP_PTR1, OP_PTR2, OP_PTR0)
+FMA_SSE(VFNMADD213,   fmnadd,   OP_PTR1, OP_PTR0, OP_PTR2)
+FMA_SSE(VFNMADD132,   fmnadd,   OP_PTR0, OP_PTR2, OP_PTR1)
+
+FMA_SSE(VFMSUB231,    fmsub,    OP_PTR1, OP_PTR2, OP_PTR0)
+FMA_SSE(VFMSUB213,    fmsub,    OP_PTR1, OP_PTR0, OP_PTR2)
+FMA_SSE(VFMSUB132,    fmsub,    OP_PTR0, OP_PTR2, OP_PTR1)
+
+FMA_SSE(VFNMSUB231,   fmnsub,   OP_PTR1, OP_PTR2, OP_PTR0)
+FMA_SSE(VFNMSUB213,   fmnsub,   OP_PTR1, OP_PTR0, OP_PTR2)
+FMA_SSE(VFNMSUB132,   fmnsub,   OP_PTR0, OP_PTR2, OP_PTR1)
+
+FMA_SSE_PACKED(VFMADDSUB231, fmaddsub, OP_PTR1, OP_PTR2, OP_PTR0)
+FMA_SSE_PACKED(VFMADDSUB213, fmaddsub, OP_PTR1, OP_PTR0, OP_PTR2)
+FMA_SSE_PACKED(VFMADDSUB132, fmaddsub, OP_PTR0, OP_PTR2, OP_PTR1)
+
+FMA_SSE_PACKED(VFMSUBADD231, fmsubadd, OP_PTR1, OP_PTR2, OP_PTR0)
+FMA_SSE_PACKED(VFMSUBADD213, fmsubadd, OP_PTR1, OP_PTR0, OP_PTR2)
+FMA_SSE_PACKED(VFMSUBADD132, fmsubadd, OP_PTR0, OP_PTR2, OP_PTR1)
+
 #define FP_UNPACK_SSE(uname, lname)                                                \
 static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
 {                                                                                  \
diff --git a/tests/tcg/i386/test-avx.py b/tests/tcg/i386/test-avx.py
index ebb1d99c5e..d9ca00a49e 100755
--- a/tests/tcg/i386/test-avx.py
+++ b/tests/tcg/i386/test-avx.py
@@ -9,7 +9,7 @@
 archs = [
     "SSE", "SSE2", "SSE3", "SSSE3", "SSE4_1", "SSE4_2",
     "AES", "AVX", "AVX2", "AES+AVX", "VAES+AVX",
-    "F16C",
+    "F16C", "FMA",
 ]
 
 ignore = set(["FISTTP",
-- 
2.37.3
Re: [PATCH 4/4] target/i386: implement FMA instructions
Posted by Richard Henderson 3 years, 3 months ago
On 10/20/22 01:06, Paolo Bonzini wrote:
> The only issue with FMA instructions is that there are _a lot_ of them
> (30 opcodes, each of which comes in up to 4 versions depending on VEX.W
> and VEX.L).
> 
> We can reduce the number of helpers to one third by passing four operands
> (one output and three inputs); the reordering of which operands go to
> the multiply and which go to the add is done in emit.c.
> 
> Scalar versions do not do any merging; they only affect the bottom 32
> or 64 bits of the output operand.  Therefore, there is no separate XMM
> and YMM of the scalar helpers.
> 
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>   target/i386/cpu.c                |  5 ++-
>   target/i386/ops_sse.h            | 63 ++++++++++++++++++++++++++++++++
>   target/i386/ops_sse_header.h     | 28 ++++++++++++++
>   target/i386/tcg/decode-new.c.inc | 38 +++++++++++++++++++
>   target/i386/tcg/decode-new.h     |  1 +
>   target/i386/tcg/emit.c.inc       | 43 ++++++++++++++++++++++
>   tests/tcg/i386/test-avx.py       |  2 +-
>   7 files changed, 177 insertions(+), 3 deletions(-)
> 
> diff --git a/target/i386/cpu.c b/target/i386/cpu.c
> index 6292b7e12f..22b681ca37 100644
> --- a/target/i386/cpu.c
> +++ b/target/i386/cpu.c
> @@ -625,10 +625,11 @@ void x86_cpu_vendor_words2str(char *dst, uint32_t vendor1,
>             CPUID_EXT_SSE41 | CPUID_EXT_SSE42 | CPUID_EXT_POPCNT | \
>             CPUID_EXT_XSAVE | /* CPUID_EXT_OSXSAVE is dynamic */   \
>             CPUID_EXT_MOVBE | CPUID_EXT_AES | CPUID_EXT_HYPERVISOR | \
> -          CPUID_EXT_RDRAND | CPUID_EXT_AVX | CPUID_EXT_F16C)
> +          CPUID_EXT_RDRAND | CPUID_EXT_AVX | CPUID_EXT_F16C | \
> +          CPUID_EXT_FMA)
>             /* missing:
>             CPUID_EXT_DTES64, CPUID_EXT_DSCPL, CPUID_EXT_VMX, CPUID_EXT_SMX,
> -          CPUID_EXT_EST, CPUID_EXT_TM2, CPUID_EXT_CID, CPUID_EXT_FMA,
> +          CPUID_EXT_EST, CPUID_EXT_TM2, CPUID_EXT_CID,
>             CPUID_EXT_XTPR, CPUID_EXT_PDCM, CPUID_EXT_PCID, CPUID_EXT_DCA,
>             CPUID_EXT_X2APIC, CPUID_EXT_TSC_DEADLINE_TIMER */
>   
> diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h
> index 33c61896ee..041a048a70 100644
> --- a/target/i386/ops_sse.h
> +++ b/target/i386/ops_sse.h
> @@ -2522,6 +2522,69 @@ void helper_vpermd_ymm(Reg *d, Reg *v, Reg *s)
>   }
>   #endif
>   
> +/* FMA3 op helpers */
> +#if SHIFT == 1
> +#define SSE_HELPER_FMAS(name, elem, F)                                         \
> +    void name(CPUX86State *env, Reg *d, Reg *a, Reg *b, Reg *c)                \
> +    {                                                                          \
> +        d->elem(0) = F(a->elem(0), b->elem(0), c->elem(0));                    \
> +    }
> +#define SSE_HELPER_FMAP(name, elem, num, F)                                    \
> +    void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *a, Reg *b, Reg *c)  \
> +    {                                                                          \
> +        int i;                                                                 \
> +        for (i = 0; i < num; i++) {                                            \
> +            d->elem(i) = F(a->elem(i), b->elem(i), c->elem(i));                \
> +        }                                                                      \
> +    }
> +
> +#define FMADD32(a, b, c) float32_muladd(a, b, c, 0, &env->sse_status)
> +#define FMADD64(a, b, c) float64_muladd(a, b, c, 0, &env->sse_status)
> +
> +#define FMNADD32(a, b, c) float32_muladd(a, b, c, float_muladd_negate_product, &env->sse_status)
> +#define FMNADD64(a, b, c) float64_muladd(a, b, c, float_muladd_negate_product, &env->sse_status)
> +
> +#define FMSUB32(a, b, c) float32_muladd(a, b, c, float_muladd_negate_c, &env->sse_status)
> +#define FMSUB64(a, b, c) float64_muladd(a, b, c, float_muladd_negate_c, &env->sse_status)
> +
> +#define FMNSUB32(a, b, c) float32_muladd(a, b, c, float_muladd_negate_c|float_muladd_negate_product, &env->sse_status)
> +#define FMNSUB64(a, b, c) float64_muladd(a, b, c, float_muladd_negate_c|float_muladd_negate_product, &env->sse_status)
> +
> +#define FMADDSUB32(a, b, c) float32_muladd(a, b, c, (i & 1) ? 0 : float_muladd_negate_c, &env->sse_status)
> +#define FMADDSUB64(a, b, c) float64_muladd(a, b, c, (i & 1) ? 0 : float_muladd_negate_c, &env->sse_status)
> +
> +#define FMSUBADD32(a, b, c) float32_muladd(a, b, c, (i & 1) ? float_muladd_negate_c : 0, &env->sse_status)
> +#define FMSUBADD64(a, b, c) float64_muladd(a, b, c, (i & 1) ? float_muladd_negate_c : 0, &env->sse_status)
> +
> +SSE_HELPER_FMAS(helper_fmaddss,  ZMM_S,             FMADD32)
> +SSE_HELPER_FMAS(helper_fmaddsd,  ZMM_D,             FMADD64)
> +SSE_HELPER_FMAS(helper_fmnaddss, ZMM_S,             FMNADD32)
> +SSE_HELPER_FMAS(helper_fmnaddsd, ZMM_D,             FMNADD64)
> +SSE_HELPER_FMAS(helper_fmsubss,  ZMM_S,             FMSUB32)
> +SSE_HELPER_FMAS(helper_fmsubsd,  ZMM_D,             FMSUB64)
> +SSE_HELPER_FMAS(helper_fmnsubss, ZMM_S,             FMNSUB32)
> +SSE_HELPER_FMAS(helper_fmnsubsd, ZMM_D,             FMNSUB64)

Would it be worth passing the muladd constant(s) as a parameter to a reduced number of 
helper functions?

E.g.

void fmas_name(..., int flags)
{
     d = type_muladd(a, b, c, flags, status);
}

void fmap_name(..., int flags2)
{
     int f_even = flags2 & 0xf;
     int f_odd = flags2 >> 4;
     for (int i = 0; i < num; ) {
        d(i) = type_muladd(a(i), b(i), c(i), f_even, status);
        i++;
        d(i) = type_muladd(a(i), b(i), c(i), f_odd, status);
        i++;
     }

> +#define FMA_SSE_PACKED(uname, lname, ptr0, ptr1, ptr2)                             \
> +static void gen_##uname##Px(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
> +{                                                                                  \
> +    SSEFunc_0_epppp xmm = s->vex_w ? gen_helper_##lname##pd_xmm : gen_helper_##lname##ps_xmm; \
> +    SSEFunc_0_epppp ymm = s->vex_w ? gen_helper_##lname##pd_ymm : gen_helper_##lname##ps_ymm; \
> +    SSEFunc_0_epppp fn = s->vex_l ? ymm : xmm;                                     \
> +                                                                                   \
> +    fn(cpu_env, OP_PTR0, ptr0, ptr1, ptr2);                                        \
> +}
> +
> +#define FMA_SSE(uname, lname, ptr0, ptr1, ptr2)                                    \
> +FMA_SSE_PACKED(uname, lname, ptr0, ptr1, ptr2)                                     \
> +static void gen_##uname##Sx(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
> +{                                                                                  \
> +    SSEFunc_0_epppp fn = s->vex_w ? gen_helper_##lname##sd : gen_helper_##lname##ss; \
> +                                                                                   \
> +    fn(cpu_env, OP_PTR0, ptr0, ptr1, ptr2);                                        \
> +}                                                                                  \
> +
> +FMA_SSE(VFMADD231,    fmadd,    OP_PTR1, OP_PTR2, OP_PTR0)
> +FMA_SSE(VFMADD213,    fmadd,    OP_PTR1, OP_PTR0, OP_PTR2)
> +FMA_SSE(VFMADD132,    fmadd,    OP_PTR0, OP_PTR2, OP_PTR1)
> +
> +FMA_SSE(VFNMADD231,   fmnadd,   OP_PTR1, OP_PTR2, OP_PTR0)
> +FMA_SSE(VFNMADD213,   fmnadd,   OP_PTR1, OP_PTR0, OP_PTR2)
> +FMA_SSE(VFNMADD132,   fmnadd,   OP_PTR0, OP_PTR2, OP_PTR1)
> +
> +FMA_SSE(VFMSUB231,    fmsub,    OP_PTR1, OP_PTR2, OP_PTR0)
> +FMA_SSE(VFMSUB213,    fmsub,    OP_PTR1, OP_PTR0, OP_PTR2)
> +FMA_SSE(VFMSUB132,    fmsub,    OP_PTR0, OP_PTR2, OP_PTR1)
> +
> +FMA_SSE(VFNMSUB231,   fmnsub,   OP_PTR1, OP_PTR2, OP_PTR0)
> +FMA_SSE(VFNMSUB213,   fmnsub,   OP_PTR1, OP_PTR0, OP_PTR2)
> +FMA_SSE(VFNMSUB132,   fmnsub,   OP_PTR0, OP_PTR2, OP_PTR1)
> +
> +FMA_SSE_PACKED(VFMADDSUB231, fmaddsub, OP_PTR1, OP_PTR2, OP_PTR0)
> +FMA_SSE_PACKED(VFMADDSUB213, fmaddsub, OP_PTR1, OP_PTR0, OP_PTR2)
> +FMA_SSE_PACKED(VFMADDSUB132, fmaddsub, OP_PTR0, OP_PTR2, OP_PTR1)
> +
> +FMA_SSE_PACKED(VFMSUBADD231, fmsubadd, OP_PTR1, OP_PTR2, OP_PTR0)
> +FMA_SSE_PACKED(VFMSUBADD213, fmsubadd, OP_PTR1, OP_PTR0, OP_PTR2)
> +FMA_SSE_PACKED(VFMSUBADD132, fmsubadd, OP_PTR0, OP_PTR2, OP_PTR1)

Is it more or less confusing to macroize this further?

#define MULADD_S_VFMADD  0
#define MULADD_S_VFMSUB  float_muladd_negate_c
...
#define MULADD_P_VFMADD  (MULADD_S_VFMADD * 0x11)
#define MULADD_P_VFMSUB  (MULADD_S_VFMSUB * 0x11)
...
#define MULADD_P_VFMADDSUB (MULADD_S_VFMADD * 0x10 + MULADD_S_VFMSUB)
#define MULADD_P_VFMSUBADD (MULADD_S_VFMSUB * 0x10 + MULADD_S_VFMADD)

#define OP_PTR1   OP_PTR1
#define OP_PTR2_231   OP_PTR2
#define OP_PTR3_231   OP_PTR0
...

#define FMA_SSE_PACKED(uname, lname, order)     \
static void name(args) {                        \
    fn = select;                                 \
    fn(cpu_env, OP_PTR0,                         \
       glue(OP_PTR_1_,order),                    \
       glue(OP_PTR_2_,order),                    \
       glue(OP_PTR_3_,order),                    \
       tcg_constant_i32(glue(MULADD_P_,UNAME))); \
}

#define FMA_SSE(uname, lname, order)            \
FMA_SSE_PACKED(uname, lname, order)             \
static void name(args) {                        \
    fn = select;                                 \
    fn(cpu_env, OP_PTR0,                         \
       glue(OP_PTR_1_,order),                    \
       glue(OP_PTR_2_,order),                    \
       glue(OP_PTR_3_,order),                    \
       tcg_constant_i32(glue(MULADD_S_,UNAME))); \
}

FMA_SSE(VFMADD, fmadd, 231)
FMA_SSE(VFMADD, fmadd, 213)
FMA_SSE(VFMADD, fmadd, 132)

etc.


r~
Re: [PATCH 4/4] target/i386: implement FMA instructions
Posted by Paolo Bonzini 3 years, 3 months ago
On 10/20/22 05:02, Richard Henderson wrote:
> On 10/20/22 01:06, Paolo Bonzini wrote:
>> The only issue with FMA instructions is that there are _a lot_ of them
>> (30 opcodes, each of which comes in up to 4 versions depending on VEX.W
>> and VEX.L).
>>
>> We can reduce the number of helpers to one third by passing four operands
>> (one output and three inputs); the reordering of which operands go to
>> the multiply and which go to the add is done in emit.c.
>>
>> Scalar versions do not do any merging; they only affect the bottom 32
>> or 64 bits of the output operand.  Therefore, there is no separate XMM
>> and YMM of the scalar helpers.
>>
>> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
>> ---
>>   target/i386/cpu.c                |  5 ++-
>>   target/i386/ops_sse.h            | 63 ++++++++++++++++++++++++++++++++
>>   target/i386/ops_sse_header.h     | 28 ++++++++++++++
>>   target/i386/tcg/decode-new.c.inc | 38 +++++++++++++++++++
>>   target/i386/tcg/decode-new.h     |  1 +
>>   target/i386/tcg/emit.c.inc       | 43 ++++++++++++++++++++++
>>   tests/tcg/i386/test-avx.py       |  2 +-
>>   7 files changed, 177 insertions(+), 3 deletions(-)
>>
>> diff --git a/target/i386/cpu.c b/target/i386/cpu.c
>> index 6292b7e12f..22b681ca37 100644
>> --- a/target/i386/cpu.c
>> +++ b/target/i386/cpu.c
>> @@ -625,10 +625,11 @@ void x86_cpu_vendor_words2str(char *dst, 
>> uint32_t vendor1,
>>             CPUID_EXT_SSE41 | CPUID_EXT_SSE42 | CPUID_EXT_POPCNT | \
>>             CPUID_EXT_XSAVE | /* CPUID_EXT_OSXSAVE is dynamic */   \
>>             CPUID_EXT_MOVBE | CPUID_EXT_AES | CPUID_EXT_HYPERVISOR | \
>> -          CPUID_EXT_RDRAND | CPUID_EXT_AVX | CPUID_EXT_F16C)
>> +          CPUID_EXT_RDRAND | CPUID_EXT_AVX | CPUID_EXT_F16C | \
>> +          CPUID_EXT_FMA)
>>             /* missing:
>>             CPUID_EXT_DTES64, CPUID_EXT_DSCPL, CPUID_EXT_VMX, 
>> CPUID_EXT_SMX,
>> -          CPUID_EXT_EST, CPUID_EXT_TM2, CPUID_EXT_CID, CPUID_EXT_FMA,
>> +          CPUID_EXT_EST, CPUID_EXT_TM2, CPUID_EXT_CID,
>>             CPUID_EXT_XTPR, CPUID_EXT_PDCM, CPUID_EXT_PCID, 
>> CPUID_EXT_DCA,
>>             CPUID_EXT_X2APIC, CPUID_EXT_TSC_DEADLINE_TIMER */
>> diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h
>> index 33c61896ee..041a048a70 100644
>> --- a/target/i386/ops_sse.h
>> +++ b/target/i386/ops_sse.h
>> @@ -2522,6 +2522,69 @@ void helper_vpermd_ymm(Reg *d, Reg *v, Reg *s)
>>   }
>>   #endif
>> +/* FMA3 op helpers */
>> +#if SHIFT == 1
>> +#define SSE_HELPER_FMAS(name, elem, 
>> F)                                         \
>> +    void name(CPUX86State *env, Reg *d, Reg *a, Reg *b, Reg 
>> *c)                \
>> +    
>> {                                                                          \
>> +        d->elem(0) = F(a->elem(0), b->elem(0), 
>> c->elem(0));                    \
>> +    }
>> +#define SSE_HELPER_FMAP(name, elem, num, 
>> F)                                    \
>> +    void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *a, Reg *b, 
>> Reg *c)  \
>> +    
>> {                                                                          \
>> +        int 
>> i;                                                                 \
>> +        for (i = 0; i < num; i++) 
>> {                                            \
>> +            d->elem(i) = F(a->elem(i), b->elem(i), 
>> c->elem(i));                \
>> +        
>> }                                                                      \
>> +    }
>> +
>> +#define FMADD32(a, b, c) float32_muladd(a, b, c, 0, &env->sse_status)
>> +#define FMADD64(a, b, c) float64_muladd(a, b, c, 0, &env->sse_status)
>> +
>> +#define FMNADD32(a, b, c) float32_muladd(a, b, c, float_muladd_negate_product, &env->sse_status)
>> +#define FMNADD64(a, b, c) float64_muladd(a, b, c, float_muladd_negate_product, &env->sse_status)
>> [...]
>
> Would it be worth passing the muladd constant(s) as a parameter to a 
> reduced number of helper functions?

Sure, will do that.

> void fmas_name(..., int flags)
> {
>      d = type_muladd(a, b, c, flags, status);
> }
> 
> void fmap_name(..., int flags2)
> {
>      int f_even = flags2 & 0xf;
>      int f_odd = flags2 >> 4;
>      for (int i = 0; i < num; ) {
>         d(i) = type_muladd(a(i), b(i), c(i), f_even, status);
>         i++;
>         d(i) = type_muladd(a(i), b(i), c(i), f_odd, status);
>         i++;
>      }

Another possibility is to add two arguments for even and odd.

>> +FMA_SSE(VFNMSUB231,   fmnsub,   OP_PTR1, OP_PTR2, OP_PTR0)
>> +FMA_SSE(VFNMSUB213,   fmnsub,   OP_PTR1, OP_PTR0, OP_PTR2)
>> +FMA_SSE(VFNMSUB132,   fmnsub,   OP_PTR0, OP_PTR2, OP_PTR1)
>> +
>> +FMA_SSE_PACKED(VFMSUBADD231, fmsubadd, OP_PTR1, OP_PTR2, OP_PTR0)
>> +FMA_SSE_PACKED(VFMSUBADD213, fmsubadd, OP_PTR1, OP_PTR0, OP_PTR2)
>> +FMA_SSE_PACKED(VFMSUBADD132, fmsubadd, OP_PTR0, OP_PTR2, OP_PTR1)
> 
> Is it more or less confusing to macroize this further?

I think more. :)

Paolo