The only issue with FMA instructions is that there are _a lot_ of them
(30 opcodes, each of which comes in up to 4 versions depending on VEX.W
and VEX.L).
We can reduce the number of helpers to one third by passing four operands
(one output and three inputs); the reordering of which operands go to
the multiply and which go to the add is done in emit.c.
Scalar versions do not do any merging; they only affect the bottom 32
or 64 bits of the output operand. Therefore, there are no separate XMM
and YMM versions of the scalar helpers.
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
target/i386/cpu.c | 5 ++-
target/i386/ops_sse.h | 63 ++++++++++++++++++++++++++++++++
target/i386/ops_sse_header.h | 28 ++++++++++++++
target/i386/tcg/decode-new.c.inc | 38 +++++++++++++++++++
target/i386/tcg/decode-new.h | 1 +
target/i386/tcg/emit.c.inc | 43 ++++++++++++++++++++++
tests/tcg/i386/test-avx.py | 2 +-
7 files changed, 177 insertions(+), 3 deletions(-)
diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index 6292b7e12f..22b681ca37 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -625,10 +625,11 @@ void x86_cpu_vendor_words2str(char *dst, uint32_t vendor1,
CPUID_EXT_SSE41 | CPUID_EXT_SSE42 | CPUID_EXT_POPCNT | \
CPUID_EXT_XSAVE | /* CPUID_EXT_OSXSAVE is dynamic */ \
CPUID_EXT_MOVBE | CPUID_EXT_AES | CPUID_EXT_HYPERVISOR | \
- CPUID_EXT_RDRAND | CPUID_EXT_AVX | CPUID_EXT_F16C)
+ CPUID_EXT_RDRAND | CPUID_EXT_AVX | CPUID_EXT_F16C | \
+ CPUID_EXT_FMA)
/* missing:
CPUID_EXT_DTES64, CPUID_EXT_DSCPL, CPUID_EXT_VMX, CPUID_EXT_SMX,
- CPUID_EXT_EST, CPUID_EXT_TM2, CPUID_EXT_CID, CPUID_EXT_FMA,
+ CPUID_EXT_EST, CPUID_EXT_TM2, CPUID_EXT_CID,
CPUID_EXT_XTPR, CPUID_EXT_PDCM, CPUID_EXT_PCID, CPUID_EXT_DCA,
CPUID_EXT_X2APIC, CPUID_EXT_TSC_DEADLINE_TIMER */
diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h
index 33c61896ee..041a048a70 100644
--- a/target/i386/ops_sse.h
+++ b/target/i386/ops_sse.h
@@ -2522,6 +2522,69 @@ void helper_vpermd_ymm(Reg *d, Reg *v, Reg *s)
}
#endif
+/* FMA3 op helpers */
+#if SHIFT == 1
+#define SSE_HELPER_FMAS(name, elem, F) \
+ void name(CPUX86State *env, Reg *d, Reg *a, Reg *b, Reg *c) \
+ { \
+ d->elem(0) = F(a->elem(0), b->elem(0), c->elem(0)); \
+ }
+#define SSE_HELPER_FMAP(name, elem, num, F) \
+ void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *a, Reg *b, Reg *c) \
+ { \
+ int i; \
+ for (i = 0; i < num; i++) { \
+ d->elem(i) = F(a->elem(i), b->elem(i), c->elem(i)); \
+ } \
+ }
+
+#define FMADD32(a, b, c) float32_muladd(a, b, c, 0, &env->sse_status)
+#define FMADD64(a, b, c) float64_muladd(a, b, c, 0, &env->sse_status)
+
+#define FMNADD32(a, b, c) float32_muladd(a, b, c, float_muladd_negate_product, &env->sse_status)
+#define FMNADD64(a, b, c) float64_muladd(a, b, c, float_muladd_negate_product, &env->sse_status)
+
+#define FMSUB32(a, b, c) float32_muladd(a, b, c, float_muladd_negate_c, &env->sse_status)
+#define FMSUB64(a, b, c) float64_muladd(a, b, c, float_muladd_negate_c, &env->sse_status)
+
+#define FMNSUB32(a, b, c) float32_muladd(a, b, c, float_muladd_negate_c|float_muladd_negate_product, &env->sse_status)
+#define FMNSUB64(a, b, c) float64_muladd(a, b, c, float_muladd_negate_c|float_muladd_negate_product, &env->sse_status)
+
+#define FMADDSUB32(a, b, c) float32_muladd(a, b, c, (i & 1) ? 0 : float_muladd_negate_c, &env->sse_status)
+#define FMADDSUB64(a, b, c) float64_muladd(a, b, c, (i & 1) ? 0 : float_muladd_negate_c, &env->sse_status)
+
+#define FMSUBADD32(a, b, c) float32_muladd(a, b, c, (i & 1) ? float_muladd_negate_c : 0, &env->sse_status)
+#define FMSUBADD64(a, b, c) float64_muladd(a, b, c, (i & 1) ? float_muladd_negate_c : 0, &env->sse_status)
+
+SSE_HELPER_FMAS(helper_fmaddss, ZMM_S, FMADD32)
+SSE_HELPER_FMAS(helper_fmaddsd, ZMM_D, FMADD64)
+SSE_HELPER_FMAS(helper_fmnaddss, ZMM_S, FMNADD32)
+SSE_HELPER_FMAS(helper_fmnaddsd, ZMM_D, FMNADD64)
+SSE_HELPER_FMAS(helper_fmsubss, ZMM_S, FMSUB32)
+SSE_HELPER_FMAS(helper_fmsubsd, ZMM_D, FMSUB64)
+SSE_HELPER_FMAS(helper_fmnsubss, ZMM_S, FMNSUB32)
+SSE_HELPER_FMAS(helper_fmnsubsd, ZMM_D, FMNSUB64)
+#endif
+
+#if SHIFT >= 1
+SSE_HELPER_FMAP(helper_fmaddps, ZMM_S, 2 << SHIFT, FMADD32)
+SSE_HELPER_FMAP(helper_fmaddpd, ZMM_D, 1 << SHIFT, FMADD64)
+
+SSE_HELPER_FMAP(helper_fmnaddps, ZMM_S, 2 << SHIFT, FMNADD32)
+SSE_HELPER_FMAP(helper_fmnaddpd, ZMM_D, 1 << SHIFT, FMNADD64)
+
+SSE_HELPER_FMAP(helper_fmsubps, ZMM_S, 2 << SHIFT, FMSUB32)
+SSE_HELPER_FMAP(helper_fmsubpd, ZMM_D, 1 << SHIFT, FMSUB64)
+
+SSE_HELPER_FMAP(helper_fmnsubps, ZMM_S, 2 << SHIFT, FMNSUB32)
+SSE_HELPER_FMAP(helper_fmnsubpd, ZMM_D, 1 << SHIFT, FMNSUB64)
+
+SSE_HELPER_FMAP(helper_fmaddsubps, ZMM_S, 2 << SHIFT, FMADDSUB32)
+SSE_HELPER_FMAP(helper_fmaddsubpd, ZMM_D, 1 << SHIFT, FMADDSUB64)
+SSE_HELPER_FMAP(helper_fmsubaddps, ZMM_S, 2 << SHIFT, FMSUBADD32)
+SSE_HELPER_FMAP(helper_fmsubaddpd, ZMM_D, 1 << SHIFT, FMSUBADD64)
+#endif
+
#undef SSE_HELPER_S
#undef LANE_WIDTH
diff --git a/target/i386/ops_sse_header.h b/target/i386/ops_sse_header.h
index c4c41976c0..1f9a5c9e94 100644
--- a/target/i386/ops_sse_header.h
+++ b/target/i386/ops_sse_header.h
@@ -359,6 +359,34 @@ DEF_HELPER_3(glue(cvtph2ps, SUFFIX), void, env, Reg, Reg)
DEF_HELPER_4(glue(cvtps2ph, SUFFIX), void, env, Reg, Reg, int)
#endif
+/* FMA3 helpers */
+#if SHIFT == 1
+DEF_HELPER_5(fmaddss, void, env, Reg, Reg, Reg, Reg)
+DEF_HELPER_5(fmaddsd, void, env, Reg, Reg, Reg, Reg)
+DEF_HELPER_5(fmnaddss, void, env, Reg, Reg, Reg, Reg)
+DEF_HELPER_5(fmnaddsd, void, env, Reg, Reg, Reg, Reg)
+DEF_HELPER_5(fmsubss, void, env, Reg, Reg, Reg, Reg)
+DEF_HELPER_5(fmsubsd, void, env, Reg, Reg, Reg, Reg)
+DEF_HELPER_5(fmnsubss, void, env, Reg, Reg, Reg, Reg)
+DEF_HELPER_5(fmnsubsd, void, env, Reg, Reg, Reg, Reg)
+#endif
+
+#if SHIFT >= 1
+DEF_HELPER_5(glue(fmaddps, SUFFIX), void, env, Reg, Reg, Reg, Reg)
+DEF_HELPER_5(glue(fmaddpd, SUFFIX), void, env, Reg, Reg, Reg, Reg)
+DEF_HELPER_5(glue(fmnaddps,SUFFIX), void, env, Reg, Reg, Reg, Reg)
+DEF_HELPER_5(glue(fmnaddpd,SUFFIX), void, env, Reg, Reg, Reg, Reg)
+DEF_HELPER_5(glue(fmsubps, SUFFIX), void, env, Reg, Reg, Reg, Reg)
+DEF_HELPER_5(glue(fmsubpd, SUFFIX), void, env, Reg, Reg, Reg, Reg)
+DEF_HELPER_5(glue(fmnsubps, SUFFIX), void, env, Reg, Reg, Reg, Reg)
+DEF_HELPER_5(glue(fmnsubpd, SUFFIX), void, env, Reg, Reg, Reg, Reg)
+
+DEF_HELPER_5(glue(fmaddsubps, SUFFIX), void, env, Reg, Reg, Reg, Reg)
+DEF_HELPER_5(glue(fmaddsubpd, SUFFIX), void, env, Reg, Reg, Reg, Reg)
+DEF_HELPER_5(glue(fmsubaddps, SUFFIX), void, env, Reg, Reg, Reg, Reg)
+DEF_HELPER_5(glue(fmsubaddpd, SUFFIX), void, env, Reg, Reg, Reg, Reg)
+#endif
+
/* AVX helpers */
#if SHIFT >= 1
DEF_HELPER_4(glue(vpermilpd, SUFFIX), void, env, Reg, Reg, Reg)
diff --git a/target/i386/tcg/decode-new.c.inc b/target/i386/tcg/decode-new.c.inc
index 8baee9018a..8a6b0ae37c 100644
--- a/target/i386/tcg/decode-new.c.inc
+++ b/target/i386/tcg/decode-new.c.inc
@@ -376,6 +376,15 @@ static const X86OpEntry opcodes_0F38_00toEF[240] = {
[0x92] = X86_OP_ENTRY3(VPGATHERD, V,x, H,x, M,d, vex12 cpuid(AVX2) p_66), /* vgatherdps/d */
[0x93] = X86_OP_ENTRY3(VPGATHERQ, V,x, H,x, M,q, vex12 cpuid(AVX2) p_66), /* vgatherqps/d */
+ [0x96] = X86_OP_ENTRY3(VFMADDSUB132Px, V,x, H,x, W,x, vex6 cpuid(FMA) p_66),
+ [0x97] = X86_OP_ENTRY3(VFMSUBADD132Px, V,x, H,x, W,x, vex6 cpuid(FMA) p_66),
+
+ [0xa6] = X86_OP_ENTRY3(VFMADDSUB213Px, V,x, H,x, W,x, vex6 cpuid(FMA) p_66),
+ [0xa7] = X86_OP_ENTRY3(VFMSUBADD213Px, V,x, H,x, W,x, vex6 cpuid(FMA) p_66),
+
+ [0xb6] = X86_OP_ENTRY3(VFMADDSUB231Px, V,x, H,x, W,x, vex6 cpuid(FMA) p_66),
+ [0xb7] = X86_OP_ENTRY3(VFMSUBADD231Px, V,x, H,x, W,x, vex6 cpuid(FMA) p_66),
+
[0x08] = X86_OP_ENTRY3(PSIGNB, V,x, H,x, W,x, vex4 cpuid(SSSE3) mmx avx2_256 p_00_66),
[0x09] = X86_OP_ENTRY3(PSIGNW, V,x, H,x, W,x, vex4 cpuid(SSSE3) mmx avx2_256 p_00_66),
[0x0a] = X86_OP_ENTRY3(PSIGND, V,x, H,x, W,x, vex4 cpuid(SSSE3) mmx avx2_256 p_00_66),
@@ -421,6 +430,33 @@ static const X86OpEntry opcodes_0F38_00toEF[240] = {
[0x8c] = X86_OP_ENTRY3(VPMASKMOV, V,x, H,x, WM,x, vex6 cpuid(AVX2) p_66),
[0x8e] = X86_OP_ENTRY3(VPMASKMOV_st, M,x, V,x, H,x, vex6 cpuid(AVX2) p_66),
+ [0x98] = X86_OP_ENTRY3(VFMADD132Px, V,x, H,x, W,x, vex6 cpuid(FMA) p_66),
+ [0x99] = X86_OP_ENTRY3(VFMADD132Sx, V,x, H,x, W,x, vex6 cpuid(FMA) p_66),
+ [0x9a] = X86_OP_ENTRY3(VFMSUB132Px, V,x, H,x, W,x, vex6 cpuid(FMA) p_66),
+ [0x9b] = X86_OP_ENTRY3(VFMSUB132Sx, V,x, H,x, W,x, vex6 cpuid(FMA) p_66),
+ [0x9c] = X86_OP_ENTRY3(VFNMADD132Px, V,x, H,x, W,x, vex6 cpuid(FMA) p_66),
+ [0x9d] = X86_OP_ENTRY3(VFNMADD132Sx, V,x, H,x, W,x, vex6 cpuid(FMA) p_66),
+ [0x9e] = X86_OP_ENTRY3(VFNMSUB132Px, V,x, H,x, W,x, vex6 cpuid(FMA) p_66),
+ [0x9f] = X86_OP_ENTRY3(VFNMSUB132Sx, V,x, H,x, W,x, vex6 cpuid(FMA) p_66),
+
+ [0xa8] = X86_OP_ENTRY3(VFMADD213Px, V,x, H,x, W,x, vex6 cpuid(FMA) p_66),
+ [0xa9] = X86_OP_ENTRY3(VFMADD213Sx, V,x, H,x, W,x, vex6 cpuid(FMA) p_66),
+ [0xaa] = X86_OP_ENTRY3(VFMSUB213Px, V,x, H,x, W,x, vex6 cpuid(FMA) p_66),
+ [0xab] = X86_OP_ENTRY3(VFMSUB213Sx, V,x, H,x, W,x, vex6 cpuid(FMA) p_66),
+ [0xac] = X86_OP_ENTRY3(VFNMADD213Px, V,x, H,x, W,x, vex6 cpuid(FMA) p_66),
+ [0xad] = X86_OP_ENTRY3(VFNMADD213Sx, V,x, H,x, W,x, vex6 cpuid(FMA) p_66),
+ [0xae] = X86_OP_ENTRY3(VFNMSUB213Px, V,x, H,x, W,x, vex6 cpuid(FMA) p_66),
+ [0xaf] = X86_OP_ENTRY3(VFNMSUB213Sx, V,x, H,x, W,x, vex6 cpuid(FMA) p_66),
+
+ [0xb8] = X86_OP_ENTRY3(VFMADD231Px, V,x, H,x, W,x, vex6 cpuid(FMA) p_66),
+ [0xb9] = X86_OP_ENTRY3(VFMADD231Sx, V,x, H,x, W,x, vex6 cpuid(FMA) p_66),
+ [0xba] = X86_OP_ENTRY3(VFMSUB231Px, V,x, H,x, W,x, vex6 cpuid(FMA) p_66),
+ [0xbb] = X86_OP_ENTRY3(VFMSUB231Sx, V,x, H,x, W,x, vex6 cpuid(FMA) p_66),
+ [0xbc] = X86_OP_ENTRY3(VFNMADD231Px, V,x, H,x, W,x, vex6 cpuid(FMA) p_66),
+ [0xbd] = X86_OP_ENTRY3(VFNMADD231Sx, V,x, H,x, W,x, vex6 cpuid(FMA) p_66),
+ [0xbe] = X86_OP_ENTRY3(VFNMSUB231Px, V,x, H,x, W,x, vex6 cpuid(FMA) p_66),
+ [0xbf] = X86_OP_ENTRY3(VFNMSUB231Sx, V,x, H,x, W,x, vex6 cpuid(FMA) p_66),
+
[0xdb] = X86_OP_ENTRY3(VAESIMC, V,dq, None,None, W,dq, vex4 cpuid(AES) p_66),
[0xdc] = X86_OP_ENTRY3(VAESENC, V,x, H,x, W,x, vex4 cpuid(AES) p_66),
[0xdd] = X86_OP_ENTRY3(VAESENCLAST, V,x, H,x, W,x, vex4 cpuid(AES) p_66),
@@ -1350,6 +1386,8 @@ static bool has_cpuid_feature(DisasContext *s, X86CPUIDFeature cpuid)
return true;
case X86_FEAT_F16C:
return (s->cpuid_ext_features & CPUID_EXT_F16C);
+ case X86_FEAT_FMA:
+ return (s->cpuid_ext_features & CPUID_EXT_FMA);
case X86_FEAT_MOVBE:
return (s->cpuid_ext_features & CPUID_EXT_MOVBE);
case X86_FEAT_PCLMULQDQ:
diff --git a/target/i386/tcg/decode-new.h b/target/i386/tcg/decode-new.h
index 0ef54628ee..cb6b8bcf67 100644
--- a/target/i386/tcg/decode-new.h
+++ b/target/i386/tcg/decode-new.h
@@ -105,6 +105,7 @@ typedef enum X86CPUIDFeature {
X86_FEAT_BMI1,
X86_FEAT_BMI2,
X86_FEAT_F16C,
+ X86_FEAT_FMA,
X86_FEAT_MOVBE,
X86_FEAT_PCLMULQDQ,
X86_FEAT_SSE,
diff --git a/target/i386/tcg/emit.c.inc b/target/i386/tcg/emit.c.inc
index 9334f0939d..9e234f71f7 100644
--- a/target/i386/tcg/emit.c.inc
+++ b/target/i386/tcg/emit.c.inc
@@ -491,6 +491,49 @@ FP_SSE(VMIN, min)
FP_SSE(VDIV, div)
FP_SSE(VMAX, max)
+#define FMA_SSE_PACKED(uname, lname, ptr0, ptr1, ptr2) \
+static void gen_##uname##Px(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
+{ \
+ SSEFunc_0_epppp xmm = s->vex_w ? gen_helper_##lname##pd_xmm : gen_helper_##lname##ps_xmm; \
+ SSEFunc_0_epppp ymm = s->vex_w ? gen_helper_##lname##pd_ymm : gen_helper_##lname##ps_ymm; \
+ SSEFunc_0_epppp fn = s->vex_l ? ymm : xmm; \
+ \
+ fn(cpu_env, OP_PTR0, ptr0, ptr1, ptr2); \
+}
+
+#define FMA_SSE(uname, lname, ptr0, ptr1, ptr2) \
+FMA_SSE_PACKED(uname, lname, ptr0, ptr1, ptr2) \
+static void gen_##uname##Sx(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
+{ \
+ SSEFunc_0_epppp fn = s->vex_w ? gen_helper_##lname##sd : gen_helper_##lname##ss; \
+ \
+ fn(cpu_env, OP_PTR0, ptr0, ptr1, ptr2); \
+} \
+
+FMA_SSE(VFMADD231, fmadd, OP_PTR1, OP_PTR2, OP_PTR0)
+FMA_SSE(VFMADD213, fmadd, OP_PTR1, OP_PTR0, OP_PTR2)
+FMA_SSE(VFMADD132, fmadd, OP_PTR0, OP_PTR2, OP_PTR1)
+
+FMA_SSE(VFNMADD231, fmnadd, OP_PTR1, OP_PTR2, OP_PTR0)
+FMA_SSE(VFNMADD213, fmnadd, OP_PTR1, OP_PTR0, OP_PTR2)
+FMA_SSE(VFNMADD132, fmnadd, OP_PTR0, OP_PTR2, OP_PTR1)
+
+FMA_SSE(VFMSUB231, fmsub, OP_PTR1, OP_PTR2, OP_PTR0)
+FMA_SSE(VFMSUB213, fmsub, OP_PTR1, OP_PTR0, OP_PTR2)
+FMA_SSE(VFMSUB132, fmsub, OP_PTR0, OP_PTR2, OP_PTR1)
+
+FMA_SSE(VFNMSUB231, fmnsub, OP_PTR1, OP_PTR2, OP_PTR0)
+FMA_SSE(VFNMSUB213, fmnsub, OP_PTR1, OP_PTR0, OP_PTR2)
+FMA_SSE(VFNMSUB132, fmnsub, OP_PTR0, OP_PTR2, OP_PTR1)
+
+FMA_SSE_PACKED(VFMADDSUB231, fmaddsub, OP_PTR1, OP_PTR2, OP_PTR0)
+FMA_SSE_PACKED(VFMADDSUB213, fmaddsub, OP_PTR1, OP_PTR0, OP_PTR2)
+FMA_SSE_PACKED(VFMADDSUB132, fmaddsub, OP_PTR0, OP_PTR2, OP_PTR1)
+
+FMA_SSE_PACKED(VFMSUBADD231, fmsubadd, OP_PTR1, OP_PTR2, OP_PTR0)
+FMA_SSE_PACKED(VFMSUBADD213, fmsubadd, OP_PTR1, OP_PTR0, OP_PTR2)
+FMA_SSE_PACKED(VFMSUBADD132, fmsubadd, OP_PTR0, OP_PTR2, OP_PTR1)
+
#define FP_UNPACK_SSE(uname, lname) \
static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
{ \
diff --git a/tests/tcg/i386/test-avx.py b/tests/tcg/i386/test-avx.py
index ebb1d99c5e..d9ca00a49e 100755
--- a/tests/tcg/i386/test-avx.py
+++ b/tests/tcg/i386/test-avx.py
@@ -9,7 +9,7 @@
archs = [
"SSE", "SSE2", "SSE3", "SSSE3", "SSE4_1", "SSE4_2",
"AES", "AVX", "AVX2", "AES+AVX", "VAES+AVX",
- "F16C",
+ "F16C", "FMA",
]
ignore = set(["FISTTP",
--
2.37.3
On 10/20/22 01:06, Paolo Bonzini wrote:
> The only issue with FMA instructions is that there are _a lot_ of them
> (30 opcodes, each of which comes in up to 4 versions depending on VEX.W
> and VEX.L).
>
> We can reduce the number of helpers to one third by passing four operands
> (one output and three inputs); the reordering of which operands go to
> the multiply and which go to the add is done in emit.c.
>
> Scalar versions do not do any merging; they only affect the bottom 32
> or 64 bits of the output operand. Therefore, there are no separate XMM
> and YMM versions of the scalar helpers.
>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
> target/i386/cpu.c | 5 ++-
> target/i386/ops_sse.h | 63 ++++++++++++++++++++++++++++++++
> target/i386/ops_sse_header.h | 28 ++++++++++++++
> target/i386/tcg/decode-new.c.inc | 38 +++++++++++++++++++
> target/i386/tcg/decode-new.h | 1 +
> target/i386/tcg/emit.c.inc | 43 ++++++++++++++++++++++
> tests/tcg/i386/test-avx.py | 2 +-
> 7 files changed, 177 insertions(+), 3 deletions(-)
>
> diff --git a/target/i386/cpu.c b/target/i386/cpu.c
> index 6292b7e12f..22b681ca37 100644
> --- a/target/i386/cpu.c
> +++ b/target/i386/cpu.c
> @@ -625,10 +625,11 @@ void x86_cpu_vendor_words2str(char *dst, uint32_t vendor1,
> CPUID_EXT_SSE41 | CPUID_EXT_SSE42 | CPUID_EXT_POPCNT | \
> CPUID_EXT_XSAVE | /* CPUID_EXT_OSXSAVE is dynamic */ \
> CPUID_EXT_MOVBE | CPUID_EXT_AES | CPUID_EXT_HYPERVISOR | \
> - CPUID_EXT_RDRAND | CPUID_EXT_AVX | CPUID_EXT_F16C)
> + CPUID_EXT_RDRAND | CPUID_EXT_AVX | CPUID_EXT_F16C | \
> + CPUID_EXT_FMA)
> /* missing:
> CPUID_EXT_DTES64, CPUID_EXT_DSCPL, CPUID_EXT_VMX, CPUID_EXT_SMX,
> - CPUID_EXT_EST, CPUID_EXT_TM2, CPUID_EXT_CID, CPUID_EXT_FMA,
> + CPUID_EXT_EST, CPUID_EXT_TM2, CPUID_EXT_CID,
> CPUID_EXT_XTPR, CPUID_EXT_PDCM, CPUID_EXT_PCID, CPUID_EXT_DCA,
> CPUID_EXT_X2APIC, CPUID_EXT_TSC_DEADLINE_TIMER */
>
> diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h
> index 33c61896ee..041a048a70 100644
> --- a/target/i386/ops_sse.h
> +++ b/target/i386/ops_sse.h
> @@ -2522,6 +2522,69 @@ void helper_vpermd_ymm(Reg *d, Reg *v, Reg *s)
> }
> #endif
>
> +/* FMA3 op helpers */
> +#if SHIFT == 1
> +#define SSE_HELPER_FMAS(name, elem, F) \
> + void name(CPUX86State *env, Reg *d, Reg *a, Reg *b, Reg *c) \
> + { \
> + d->elem(0) = F(a->elem(0), b->elem(0), c->elem(0)); \
> + }
> +#define SSE_HELPER_FMAP(name, elem, num, F) \
> + void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *a, Reg *b, Reg *c) \
> + { \
> + int i; \
> + for (i = 0; i < num; i++) { \
> + d->elem(i) = F(a->elem(i), b->elem(i), c->elem(i)); \
> + } \
> + }
> +
> +#define FMADD32(a, b, c) float32_muladd(a, b, c, 0, &env->sse_status)
> +#define FMADD64(a, b, c) float64_muladd(a, b, c, 0, &env->sse_status)
> +
> +#define FMNADD32(a, b, c) float32_muladd(a, b, c, float_muladd_negate_product, &env->sse_status)
> +#define FMNADD64(a, b, c) float64_muladd(a, b, c, float_muladd_negate_product, &env->sse_status)
> +
> +#define FMSUB32(a, b, c) float32_muladd(a, b, c, float_muladd_negate_c, &env->sse_status)
> +#define FMSUB64(a, b, c) float64_muladd(a, b, c, float_muladd_negate_c, &env->sse_status)
> +
> +#define FMNSUB32(a, b, c) float32_muladd(a, b, c, float_muladd_negate_c|float_muladd_negate_product, &env->sse_status)
> +#define FMNSUB64(a, b, c) float64_muladd(a, b, c, float_muladd_negate_c|float_muladd_negate_product, &env->sse_status)
> +
> +#define FMADDSUB32(a, b, c) float32_muladd(a, b, c, (i & 1) ? 0 : float_muladd_negate_c, &env->sse_status)
> +#define FMADDSUB64(a, b, c) float64_muladd(a, b, c, (i & 1) ? 0 : float_muladd_negate_c, &env->sse_status)
> +
> +#define FMSUBADD32(a, b, c) float32_muladd(a, b, c, (i & 1) ? float_muladd_negate_c : 0, &env->sse_status)
> +#define FMSUBADD64(a, b, c) float64_muladd(a, b, c, (i & 1) ? float_muladd_negate_c : 0, &env->sse_status)
> +
> +SSE_HELPER_FMAS(helper_fmaddss, ZMM_S, FMADD32)
> +SSE_HELPER_FMAS(helper_fmaddsd, ZMM_D, FMADD64)
> +SSE_HELPER_FMAS(helper_fmnaddss, ZMM_S, FMNADD32)
> +SSE_HELPER_FMAS(helper_fmnaddsd, ZMM_D, FMNADD64)
> +SSE_HELPER_FMAS(helper_fmsubss, ZMM_S, FMSUB32)
> +SSE_HELPER_FMAS(helper_fmsubsd, ZMM_D, FMSUB64)
> +SSE_HELPER_FMAS(helper_fmnsubss, ZMM_S, FMNSUB32)
> +SSE_HELPER_FMAS(helper_fmnsubsd, ZMM_D, FMNSUB64)
Would it be worth passing the muladd constant(s) as a parameter to a reduced number of
helper functions?
E.g.
void fmas_name(..., int flags)
{
d = type_muladd(a, b, c, flags, status);
}
void fmap_name(..., int flags2)
{
    int f_even = flags2 & 0xf;
    int f_odd = flags2 >> 4;
    for (int i = 0; i < num; ) {
        d(i) = type_muladd(a(i), b(i), c(i), f_even, status);
        i++;
        d(i) = type_muladd(a(i), b(i), c(i), f_odd, status);
        i++;
    }
}
> +#define FMA_SSE_PACKED(uname, lname, ptr0, ptr1, ptr2) \
> +static void gen_##uname##Px(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
> +{ \
> + SSEFunc_0_epppp xmm = s->vex_w ? gen_helper_##lname##pd_xmm : gen_helper_##lname##ps_xmm; \
> + SSEFunc_0_epppp ymm = s->vex_w ? gen_helper_##lname##pd_ymm : gen_helper_##lname##ps_ymm; \
> + SSEFunc_0_epppp fn = s->vex_l ? ymm : xmm; \
> + \
> + fn(cpu_env, OP_PTR0, ptr0, ptr1, ptr2); \
> +}
> +
> +#define FMA_SSE(uname, lname, ptr0, ptr1, ptr2) \
> +FMA_SSE_PACKED(uname, lname, ptr0, ptr1, ptr2) \
> +static void gen_##uname##Sx(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
> +{ \
> + SSEFunc_0_epppp fn = s->vex_w ? gen_helper_##lname##sd : gen_helper_##lname##ss; \
> + \
> + fn(cpu_env, OP_PTR0, ptr0, ptr1, ptr2); \
> +} \
> +
> +FMA_SSE(VFMADD231, fmadd, OP_PTR1, OP_PTR2, OP_PTR0)
> +FMA_SSE(VFMADD213, fmadd, OP_PTR1, OP_PTR0, OP_PTR2)
> +FMA_SSE(VFMADD132, fmadd, OP_PTR0, OP_PTR2, OP_PTR1)
> +
> +FMA_SSE(VFNMADD231, fmnadd, OP_PTR1, OP_PTR2, OP_PTR0)
> +FMA_SSE(VFNMADD213, fmnadd, OP_PTR1, OP_PTR0, OP_PTR2)
> +FMA_SSE(VFNMADD132, fmnadd, OP_PTR0, OP_PTR2, OP_PTR1)
> +
> +FMA_SSE(VFMSUB231, fmsub, OP_PTR1, OP_PTR2, OP_PTR0)
> +FMA_SSE(VFMSUB213, fmsub, OP_PTR1, OP_PTR0, OP_PTR2)
> +FMA_SSE(VFMSUB132, fmsub, OP_PTR0, OP_PTR2, OP_PTR1)
> +
> +FMA_SSE(VFNMSUB231, fmnsub, OP_PTR1, OP_PTR2, OP_PTR0)
> +FMA_SSE(VFNMSUB213, fmnsub, OP_PTR1, OP_PTR0, OP_PTR2)
> +FMA_SSE(VFNMSUB132, fmnsub, OP_PTR0, OP_PTR2, OP_PTR1)
> +
> +FMA_SSE_PACKED(VFMADDSUB231, fmaddsub, OP_PTR1, OP_PTR2, OP_PTR0)
> +FMA_SSE_PACKED(VFMADDSUB213, fmaddsub, OP_PTR1, OP_PTR0, OP_PTR2)
> +FMA_SSE_PACKED(VFMADDSUB132, fmaddsub, OP_PTR0, OP_PTR2, OP_PTR1)
> +
> +FMA_SSE_PACKED(VFMSUBADD231, fmsubadd, OP_PTR1, OP_PTR2, OP_PTR0)
> +FMA_SSE_PACKED(VFMSUBADD213, fmsubadd, OP_PTR1, OP_PTR0, OP_PTR2)
> +FMA_SSE_PACKED(VFMSUBADD132, fmsubadd, OP_PTR0, OP_PTR2, OP_PTR1)
Is it more or less confusing to macroize this further?
#define MULADD_S_VFMADD 0
#define MULADD_S_VFMSUB float_muladd_negate_c
...
#define MULADD_P_VFMADD (MULADD_S_VFMADD * 0x11)
#define MULADD_P_VFMSUB (MULADD_S_VFMSUB * 0x11)
...
#define MULADD_P_VFMADDSUB (MULADD_S_VFMADD * 0x10 + MULADD_S_VFMSUB)
#define MULADD_P_VFMSUBADD (MULADD_S_VFMSUB * 0x10 + MULADD_S_VFMADD)
#define OP_PTR1 OP_PTR1
#define OP_PTR2_231 OP_PTR2
#define OP_PTR3_231 OP_PTR0
...
#define FMA_SSE_PACKED(uname, lname, order) \
static void name(args) { \
fn = select; \
fn(cpu_env, OP_PTR0, \
glue(OP_PTR_1_,order), \
glue(OP_PTR_2_,order), \
glue(OP_PTR_3_,order), \
tcg_constant_i32(glue(MULADD_P_,UNAME))); \
}
#define FMA_SSE(uname, lname, order) \
FMA_SSE_PACKED(uname, lname, order) \
static void name(args) { \
fn = select; \
fn(cpu_env, OP_PTR0, \
glue(OP_PTR_1_,order), \
glue(OP_PTR_2_,order), \
glue(OP_PTR_3_,order), \
tcg_constant_i32(glue(MULADD_S_,UNAME))); \
}
FMA_SSE(VFMADD, fmadd, 231)
FMA_SSE(VFMADD, fmadd, 213)
FMA_SSE(VFMADD, fmadd, 132)
etc.
r~
On 10/20/22 05:02, Richard Henderson wrote:
> On 10/20/22 01:06, Paolo Bonzini wrote:
>> The only issue with FMA instructions is that there are _a lot_ of them
>> (30 opcodes, each of which comes in up to 4 versions depending on VEX.W
>> and VEX.L).
>>
>> We can reduce the number of helpers to one third by passing four operands
>> (one output and three inputs); the reordering of which operands go to
>> the multiply and which go to the add is done in emit.c.
>>
>> Scalar versions do not do any merging; they only affect the bottom 32
>> or 64 bits of the output operand. Therefore, there are no separate XMM
>> and YMM versions of the scalar helpers.
>>
>> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
>> ---
>> target/i386/cpu.c | 5 ++-
>> target/i386/ops_sse.h | 63 ++++++++++++++++++++++++++++++++
>> target/i386/ops_sse_header.h | 28 ++++++++++++++
>> target/i386/tcg/decode-new.c.inc | 38 +++++++++++++++++++
>> target/i386/tcg/decode-new.h | 1 +
>> target/i386/tcg/emit.c.inc | 43 ++++++++++++++++++++++
>> tests/tcg/i386/test-avx.py | 2 +-
>> 7 files changed, 177 insertions(+), 3 deletions(-)
>>
>> diff --git a/target/i386/cpu.c b/target/i386/cpu.c
>> index 6292b7e12f..22b681ca37 100644
>> --- a/target/i386/cpu.c
>> +++ b/target/i386/cpu.c
>> @@ -625,10 +625,11 @@ void x86_cpu_vendor_words2str(char *dst,
>> uint32_t vendor1,
>> CPUID_EXT_SSE41 | CPUID_EXT_SSE42 | CPUID_EXT_POPCNT | \
>> CPUID_EXT_XSAVE | /* CPUID_EXT_OSXSAVE is dynamic */ \
>> CPUID_EXT_MOVBE | CPUID_EXT_AES | CPUID_EXT_HYPERVISOR | \
>> - CPUID_EXT_RDRAND | CPUID_EXT_AVX | CPUID_EXT_F16C)
>> + CPUID_EXT_RDRAND | CPUID_EXT_AVX | CPUID_EXT_F16C | \
>> + CPUID_EXT_FMA)
>> /* missing:
>> CPUID_EXT_DTES64, CPUID_EXT_DSCPL, CPUID_EXT_VMX,
>> CPUID_EXT_SMX,
>> - CPUID_EXT_EST, CPUID_EXT_TM2, CPUID_EXT_CID, CPUID_EXT_FMA,
>> + CPUID_EXT_EST, CPUID_EXT_TM2, CPUID_EXT_CID,
>> CPUID_EXT_XTPR, CPUID_EXT_PDCM, CPUID_EXT_PCID,
>> CPUID_EXT_DCA,
>> CPUID_EXT_X2APIC, CPUID_EXT_TSC_DEADLINE_TIMER */
>> diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h
>> index 33c61896ee..041a048a70 100644
>> --- a/target/i386/ops_sse.h
>> +++ b/target/i386/ops_sse.h
>> @@ -2522,6 +2522,69 @@ void helper_vpermd_ymm(Reg *d, Reg *v, Reg *s)
>> }
>> #endif
>> +/* FMA3 op helpers */
>> +#if SHIFT == 1
>> +#define SSE_HELPER_FMAS(name, elem,
>> F) \
>> + void name(CPUX86State *env, Reg *d, Reg *a, Reg *b, Reg
>> *c) \
>> +
>> { \
>> + d->elem(0) = F(a->elem(0), b->elem(0),
>> c->elem(0)); \
>> + }
>> +#define SSE_HELPER_FMAP(name, elem, num,
>> F) \
>> + void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *a, Reg *b,
>> Reg *c) \
>> +
>> { \
>> + int
>> i; \
>> + for (i = 0; i < num; i++)
>> { \
>> + d->elem(i) = F(a->elem(i), b->elem(i),
>> c->elem(i)); \
>> +
>> } \
>> + }
>> +
>> +#define FMADD32(a, b, c) float32_muladd(a, b, c, 0, &env->sse_status)
>> +#define FMADD64(a, b, c) float64_muladd(a, b, c, 0, &env->sse_status)
>> +
>> +#define FMNADD32(a, b, c) float32_muladd(a, b, c, float_muladd_negate_product, &env->sse_status)
>> +#define FMNADD64(a, b, c) float64_muladd(a, b, c, float_muladd_negate_product, &env->sse_status)
>> [...]
>
> Would it be worth passing the muladd constant(s) as a parameter to a
> reduced number of helper functions?
Sure, will do that.
> void fmas_name(..., int flags)
> {
> d = type_muladd(a, b, c, flags, status);
> }
>
> void fmap_name(..., int flags2)
> {
>     int f_even = flags2 & 0xf;
>     int f_odd = flags2 >> 4;
>     for (int i = 0; i < num; ) {
>         d(i) = type_muladd(a(i), b(i), c(i), f_even, status);
>         i++;
>         d(i) = type_muladd(a(i), b(i), c(i), f_odd, status);
>         i++;
>     }
> }
Another possibility is to add two arguments for even and odd.
>> +FMA_SSE(VFNMSUB231, fmnsub, OP_PTR1, OP_PTR2, OP_PTR0)
>> +FMA_SSE(VFNMSUB213, fmnsub, OP_PTR1, OP_PTR0, OP_PTR2)
>> +FMA_SSE(VFNMSUB132, fmnsub, OP_PTR0, OP_PTR2, OP_PTR1)
>> +
>> +FMA_SSE_PACKED(VFMSUBADD231, fmsubadd, OP_PTR1, OP_PTR2, OP_PTR0)
>> +FMA_SSE_PACKED(VFMSUBADD213, fmsubadd, OP_PTR1, OP_PTR0, OP_PTR2)
>> +FMA_SSE_PACKED(VFMSUBADD132, fmsubadd, OP_PTR0, OP_PTR2, OP_PTR1)
>
> Is it more or less confusing to macroize this further?
I think more. :)
Paolo
© 2016 - 2026 Red Hat, Inc.