From nobody Tue Feb 10 17:31:11 2026 Delivered-To: importer@patchew.org Authentication-Results: mx.zohomail.com; spf=pass (zohomail.com: domain of gnu.org designates 209.51.188.17 as permitted sender) smtp.mailfrom=qemu-devel-bounces+importer=patchew.org@nongnu.org Return-Path: Received: from lists.gnu.org (lists.gnu.org [209.51.188.17]) by mx.zohomail.com with SMTPS id 1650304086859176.46963827060722; Mon, 18 Apr 2022 10:48:06 -0700 (PDT) Received: from localhost ([::1]:43826 helo=lists1p.gnu.org) by lists.gnu.org with esmtp (Exim 4.90_1) (envelope-from ) id 1ngVTp-0002t3-Ee for importer@patchew.org; Mon, 18 Apr 2022 13:48:05 -0400 Received: from eggs.gnu.org ([2001:470:142:3::10]:50518) by lists.gnu.org with esmtps (TLS1.2:ECDHE_RSA_AES_256_GCM_SHA384:256) (Exim 4.90_1) (envelope-from ) id 1ngVLm-0004xa-TM for qemu-devel@nongnu.org; Mon, 18 Apr 2022 13:39:48 -0400 Received: from nowt.default.pbrook.uk0.bigv.io ([2001:41c8:51:832:fcff:ff:fe00:46dd]:41356) by eggs.gnu.org with esmtps (TLS1.2:ECDHE_RSA_AES_128_GCM_SHA256:128) (Exim 4.90_1) (envelope-from ) id 1ngVLg-0006bQ-EF for qemu-devel@nongnu.org; Mon, 18 Apr 2022 13:39:45 -0400 Received: from cpc91554-seac25-2-0-cust857.7-2.cable.virginm.net ([82.27.199.90] helo=wren.home) by nowt.default.pbrook.uk0.bigv.io with esmtpsa (TLS1.2:ECDHE_RSA_AES_128_GCM_SHA256:128) (Exim 4.84_2) (envelope-from ) id 1ngVLb-000364-LS; Mon, 18 Apr 2022 18:39:36 +0100 From: Paul Brook To: qemu-devel@nongnu.org Subject: [PATCH 2/4] TCG support for AVX Date: Mon, 18 Apr 2022 18:39:02 +0100 Message-Id: <20220418173904.3746036-3-paul@nowt.org> X-Mailer: git-send-email 2.35.2 In-Reply-To: <20220418173904.3746036-1-paul@nowt.org> References: <20220418173904.3746036-1-paul@nowt.org> MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Received-SPF: pass (zohomail.com: domain of gnu.org designates 209.51.188.17 as permitted sender) client-ip=209.51.188.17; envelope-from=qemu-devel-bounces+importer=patchew.org@nongnu.org; 
helo=lists.gnu.org; Received-SPF: pass client-ip=2001:41c8:51:832:fcff:ff:fe00:46dd; envelope-from=paul@nowt.org; helo=nowt.default.pbrook.uk0.bigv.io X-Spam_score_int: -18 X-Spam_score: -1.9 X-Spam_bar: - X-Spam_report: (-1.9 / 5.0 requ) BAYES_00=-1.9, SPF_HELO_NONE=0.001, SPF_PASS=-0.001, T_SCC_BODY_TEXT_LINE=-0.01 autolearn=ham autolearn_force=no X-Spam_action: no action X-BeenThere: qemu-devel@nongnu.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Cc: Eduardo Habkost , Paolo Bonzini , Richard Henderson , Paul Brook Errors-To: qemu-devel-bounces+importer=patchew.org@nongnu.org Sender: "Qemu-devel" X-ZM-MESSAGEID: 1650304089155100001 Content-Type: text/plain; charset="utf-8" Add TCG translation of guest AVX/AVX2 instructions This comprises: * VEX encodings of most (all?) "legacy" SSE operations. These typically add an extra source operand, and clear the unused half of the destination register (SSE encodings leave this unchanged). Previously we were incorrectly translating VEX-encoded instructions as if they were legacy SSE encodings. * 256-bit variants of many instructions. AVX adds floating-point operations. AVX2 adds integer operations. 
* A few new instructions (VBROADCAST, VGATHER, VZERO) Signed-off-by: Paul Brook --- target/i386/cpu.c | 8 +- target/i386/helper.h | 2 + target/i386/ops_sse.h | 2606 ++++++++++++++++++++++++---------- target/i386/ops_sse_header.h | 364 +++-- target/i386/tcg/fpu_helper.c | 3 + target/i386/tcg/translate.c | 1902 +++++++++++++++++++------ 6 files changed, 3597 insertions(+), 1288 deletions(-) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index cb6b5467d0..494f01959d 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -625,12 +625,12 @@ void x86_cpu_vendor_words2str(char *dst, uint32_t ven= dor1, CPUID_EXT_SSE41 | CPUID_EXT_SSE42 | CPUID_EXT_POPCNT | \ CPUID_EXT_XSAVE | /* CPUID_EXT_OSXSAVE is dynamic */ \ CPUID_EXT_MOVBE | CPUID_EXT_AES | CPUID_EXT_HYPERVISOR | \ - CPUID_EXT_RDRAND) + CPUID_EXT_RDRAND | CPUID_EXT_AVX) /* missing: CPUID_EXT_DTES64, CPUID_EXT_DSCPL, CPUID_EXT_VMX, CPUID_EXT_SMX, CPUID_EXT_EST, CPUID_EXT_TM2, CPUID_EXT_CID, CPUID_EXT_FMA, CPUID_EXT_XTPR, CPUID_EXT_PDCM, CPUID_EXT_PCID, CPUID_EXT_DCA, - CPUID_EXT_X2APIC, CPUID_EXT_TSC_DEADLINE_TIMER, CPUID_EXT_AVX, + CPUID_EXT_X2APIC, CPUID_EXT_TSC_DEADLINE_TIMER, CPUID_EXT_F16C */ =20 #ifdef TARGET_X86_64 @@ -653,9 +653,9 @@ void x86_cpu_vendor_words2str(char *dst, uint32_t vendo= r1, CPUID_7_0_EBX_BMI1 | CPUID_7_0_EBX_BMI2 | CPUID_7_0_EBX_ADX | \ CPUID_7_0_EBX_PCOMMIT | CPUID_7_0_EBX_CLFLUSHOPT | \ CPUID_7_0_EBX_CLWB | CPUID_7_0_EBX_MPX | CPUID_7_0_EBX_FSGSBASE = | \ - CPUID_7_0_EBX_ERMS) + CPUID_7_0_EBX_ERMS | CPUID_7_0_EBX_AVX2) /* missing: - CPUID_7_0_EBX_HLE, CPUID_7_0_EBX_AVX2, + CPUID_7_0_EBX_HLE CPUID_7_0_EBX_INVPCID, CPUID_7_0_EBX_RTM, CPUID_7_0_EBX_RDSEED */ #define TCG_7_0_ECX_FEATURES (CPUID_7_0_ECX_UMIP | CPUID_7_0_ECX_PKU | \ diff --git a/target/i386/helper.h b/target/i386/helper.h index ac3b4d1ee3..3da5df98b9 100644 --- a/target/i386/helper.h +++ b/target/i386/helper.h @@ -218,6 +218,8 @@ DEF_HELPER_3(movq, void, env, ptr, ptr) #include "ops_sse_header.h" #define SHIFT 1 #include 
"ops_sse_header.h" +#define SHIFT 2 +#include "ops_sse_header.h" =20 DEF_HELPER_3(rclb, tl, env, tl, tl) DEF_HELPER_3(rclw, tl, env, tl, tl) diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h index 6f1fc174b3..9cd7b2875e 100644 --- a/target/i386/ops_sse.h +++ b/target/i386/ops_sse.h @@ -23,6 +23,7 @@ #if SHIFT =3D=3D 0 #define Reg MMXReg #define XMM_ONLY(...) +#define YMM_ONLY(...) #define B(n) MMX_B(n) #define W(n) MMX_W(n) #define L(n) MMX_L(n) @@ -35,260 +36,355 @@ #define W(n) ZMM_W(n) #define L(n) ZMM_L(n) #define Q(n) ZMM_Q(n) +#if SHIFT =3D=3D 1 #define SUFFIX _xmm +#define YMM_ONLY(...) +#else +#define SUFFIX _ymm +#define YMM_ONLY(...) __VA_ARGS__ +#endif +#endif + +#if SHIFT =3D=3D 0 +#define SHIFT_HELPER_BODY(n, elem, F) do { \ + d->elem(0) =3D F(s->elem(0), shift); \ + if ((n) > 1) { \ + d->elem(1) =3D F(s->elem(1), shift); \ + } \ + if ((n) > 2) { \ + d->elem(2) =3D F(s->elem(2), shift); \ + d->elem(3) =3D F(s->elem(3), shift); \ + } \ + if ((n) > 4) { \ + d->elem(4) =3D F(s->elem(4), shift); \ + d->elem(5) =3D F(s->elem(5), shift); \ + d->elem(6) =3D F(s->elem(6), shift); \ + d->elem(7) =3D F(s->elem(7), shift); \ + } \ + if ((n) > 8) { \ + d->elem(8) =3D F(s->elem(8), shift); \ + d->elem(9) =3D F(s->elem(9), shift); \ + d->elem(10) =3D F(s->elem(10), shift); \ + d->elem(11) =3D F(s->elem(11), shift); \ + d->elem(12) =3D F(s->elem(12), shift); \ + d->elem(13) =3D F(s->elem(13), shift); \ + d->elem(14) =3D F(s->elem(14), shift); \ + d->elem(15) =3D F(s->elem(15), shift); \ + } \ + } while (0) + +#define FPSRL(x, c) ((x) >> shift) +#define FPSRAW(x, c) ((int16_t)(x) >> shift) +#define FPSRAL(x, c) ((int32_t)(x) >> shift) +#define FPSLL(x, c) ((x) << shift) #endif =20 -void glue(helper_psrlw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +void glue(helper_psrlw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c) { int shift; - - if (s->Q(0) > 15) { + if (c->Q(0) > 15) { d->Q(0) =3D 0; -#if SHIFT =3D=3D 1 - d->Q(1) =3D 0; -#endif + XMM_ONLY(d->Q(1) =3D 
0;) + YMM_ONLY( + d->Q(2) =3D 0; + d->Q(3) =3D 0; + ) } else { - shift =3D s->B(0); - d->W(0) >>=3D shift; - d->W(1) >>=3D shift; - d->W(2) >>=3D shift; - d->W(3) >>=3D shift; -#if SHIFT =3D=3D 1 - d->W(4) >>=3D shift; - d->W(5) >>=3D shift; - d->W(6) >>=3D shift; - d->W(7) >>=3D shift; -#endif + shift =3D c->B(0); + SHIFT_HELPER_BODY(4 << SHIFT, W, FPSRL); } } =20 -void glue(helper_psraw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +void glue(helper_psllw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c) { int shift; - - if (s->Q(0) > 15) { - shift =3D 15; + if (c->Q(0) > 15) { + d->Q(0) =3D 0; + XMM_ONLY(d->Q(1) =3D 0;) + YMM_ONLY( + d->Q(2) =3D 0; + d->Q(3) =3D 0; + ) } else { - shift =3D s->B(0); + shift =3D c->B(0); + SHIFT_HELPER_BODY(4 << SHIFT, W, FPSLL); } - d->W(0) =3D (int16_t)d->W(0) >> shift; - d->W(1) =3D (int16_t)d->W(1) >> shift; - d->W(2) =3D (int16_t)d->W(2) >> shift; - d->W(3) =3D (int16_t)d->W(3) >> shift; -#if SHIFT =3D=3D 1 - d->W(4) =3D (int16_t)d->W(4) >> shift; - d->W(5) =3D (int16_t)d->W(5) >> shift; - d->W(6) =3D (int16_t)d->W(6) >> shift; - d->W(7) =3D (int16_t)d->W(7) >> shift; -#endif } =20 -void glue(helper_psllw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +void glue(helper_psraw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c) { int shift; - - if (s->Q(0) > 15) { - d->Q(0) =3D 0; -#if SHIFT =3D=3D 1 - d->Q(1) =3D 0; -#endif + if (c->Q(0) > 15) { + shift =3D 15; } else { - shift =3D s->B(0); - d->W(0) <<=3D shift; - d->W(1) <<=3D shift; - d->W(2) <<=3D shift; - d->W(3) <<=3D shift; -#if SHIFT =3D=3D 1 - d->W(4) <<=3D shift; - d->W(5) <<=3D shift; - d->W(6) <<=3D shift; - d->W(7) <<=3D shift; -#endif + shift =3D c->B(0); } + SHIFT_HELPER_BODY(4 << SHIFT, W, FPSRAW); } =20 -void glue(helper_psrld, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +void glue(helper_psrld, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c) { int shift; - - if (s->Q(0) > 31) { + if (c->Q(0) > 31) { d->Q(0) =3D 0; -#if SHIFT =3D=3D 1 - d->Q(1) =3D 0; -#endif + 
XMM_ONLY(d->Q(1) =3D 0;) + YMM_ONLY( + d->Q(2) =3D 0; + d->Q(3) =3D 0; + ) } else { - shift =3D s->B(0); - d->L(0) >>=3D shift; - d->L(1) >>=3D shift; -#if SHIFT =3D=3D 1 - d->L(2) >>=3D shift; - d->L(3) >>=3D shift; -#endif + shift =3D c->B(0); + SHIFT_HELPER_BODY(2 << SHIFT, L, FPSRL); } } =20 -void glue(helper_psrad, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +void glue(helper_pslld, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c) { int shift; - - if (s->Q(0) > 31) { - shift =3D 31; + if (c->Q(0) > 31) { + d->Q(0) =3D 0; + XMM_ONLY(d->Q(1) =3D 0;) + YMM_ONLY( + d->Q(2) =3D 0; + d->Q(3) =3D 0; + ) } else { - shift =3D s->B(0); + shift =3D c->B(0); + SHIFT_HELPER_BODY(2 << SHIFT, L, FPSLL); } - d->L(0) =3D (int32_t)d->L(0) >> shift; - d->L(1) =3D (int32_t)d->L(1) >> shift; -#if SHIFT =3D=3D 1 - d->L(2) =3D (int32_t)d->L(2) >> shift; - d->L(3) =3D (int32_t)d->L(3) >> shift; -#endif } =20 -void glue(helper_pslld, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +void glue(helper_psrad, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c) { int shift; - - if (s->Q(0) > 31) { - d->Q(0) =3D 0; -#if SHIFT =3D=3D 1 - d->Q(1) =3D 0; -#endif + if (c->Q(0) > 31) { + shift =3D 31; } else { - shift =3D s->B(0); - d->L(0) <<=3D shift; - d->L(1) <<=3D shift; -#if SHIFT =3D=3D 1 - d->L(2) <<=3D shift; - d->L(3) <<=3D shift; -#endif + shift =3D c->B(0); } + SHIFT_HELPER_BODY(2 << SHIFT, L, FPSRAL); } =20 -void glue(helper_psrlq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +void glue(helper_psrlq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c) { int shift; - - if (s->Q(0) > 63) { + if (c->Q(0) > 63) { d->Q(0) =3D 0; -#if SHIFT =3D=3D 1 - d->Q(1) =3D 0; -#endif + XMM_ONLY(d->Q(1) =3D 0;) + YMM_ONLY( + d->Q(2) =3D 0; + d->Q(3) =3D 0; + ) } else { - shift =3D s->B(0); - d->Q(0) >>=3D shift; -#if SHIFT =3D=3D 1 - d->Q(1) >>=3D shift; -#endif + shift =3D c->B(0); + SHIFT_HELPER_BODY(1 << SHIFT, Q, FPSRL); } } =20 -void glue(helper_psllq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +void 
glue(helper_psllq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c) { int shift; - - if (s->Q(0) > 63) { + if (c->Q(0) > 63) { d->Q(0) =3D 0; -#if SHIFT =3D=3D 1 - d->Q(1) =3D 0; -#endif + XMM_ONLY(d->Q(1) =3D 0;) + YMM_ONLY( + d->Q(2) =3D 0; + d->Q(3) =3D 0; + ) } else { - shift =3D s->B(0); - d->Q(0) <<=3D shift; -#if SHIFT =3D=3D 1 - d->Q(1) <<=3D shift; -#endif + shift =3D c->B(0); + SHIFT_HELPER_BODY(1 << SHIFT, Q, FPSLL); } } =20 -#if SHIFT =3D=3D 1 -void glue(helper_psrldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +#if SHIFT >=3D 1 +void glue(helper_psrldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c) { int shift, i; =20 - shift =3D s->L(0); + shift =3D c->L(0); if (shift > 16) { shift =3D 16; } for (i =3D 0; i < 16 - shift; i++) { - d->B(i) =3D d->B(i + shift); + d->B(i) =3D s->B(i + shift); } for (i =3D 16 - shift; i < 16; i++) { d->B(i) =3D 0; } +#if SHIFT =3D=3D 2 + for (i =3D 0; i < 16 - shift; i++) { + d->B(i + 16) =3D s->B(i + 16 + shift); + } + for (i =3D 16 - shift; i < 16; i++) { + d->B(i + 16) =3D 0; + } +#endif } =20 -void glue(helper_pslldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +void glue(helper_pslldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c) { int shift, i; =20 - shift =3D s->L(0); + shift =3D c->L(0); if (shift > 16) { shift =3D 16; } for (i =3D 15; i >=3D shift; i--) { - d->B(i) =3D d->B(i - shift); + d->B(i) =3D s->B(i - shift); } for (i =3D 0; i < shift; i++) { d->B(i) =3D 0; } +#if SHIFT =3D=3D 2 + for (i =3D 15; i >=3D shift; i--) { + d->B(i + 16) =3D s->B(i + 16 - shift); + } + for (i =3D 0; i < shift; i++) { + d->B(i + 16) =3D 0; + } +#endif } #endif =20 -#define SSE_HELPER_B(name, F) \ +#define SSE_HELPER_1(name, elem, num, F) = \ void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \ { \ - d->B(0) =3D F(d->B(0), s->B(0)); \ - d->B(1) =3D F(d->B(1), s->B(1)); \ - d->B(2) =3D F(d->B(2), s->B(2)); \ - d->B(3) =3D F(d->B(3), s->B(3)); \ - d->B(4) =3D F(d->B(4), s->B(4)); \ - d->B(5) =3D F(d->B(5), s->B(5)); \ - 
d->B(6) =3D F(d->B(6), s->B(6)); \ - d->B(7) =3D F(d->B(7), s->B(7)); \ + d->elem(0) =3D F(s->elem(0)); \ + d->elem(1) =3D F(s->elem(1)); \ + if ((num << SHIFT) > 2) { \ + d->elem(2) =3D F(s->elem(2)); \ + d->elem(3) =3D F(s->elem(3)); \ + } \ + if ((num << SHIFT) > 4) { \ + d->elem(4) =3D F(s->elem(4)); \ + d->elem(5) =3D F(s->elem(5)); \ + d->elem(6) =3D F(s->elem(6)); \ + d->elem(7) =3D F(s->elem(7)); \ + } \ + if ((num << SHIFT) > 8) { \ + d->elem(8) =3D F(s->elem(8)); \ + d->elem(9) =3D F(s->elem(9)); \ + d->elem(10) =3D F(s->elem(10)); \ + d->elem(11) =3D F(s->elem(11)); \ + d->elem(12) =3D F(s->elem(12)); \ + d->elem(13) =3D F(s->elem(13)); \ + d->elem(14) =3D F(s->elem(14)); \ + d->elem(15) =3D F(s->elem(15)); \ + } \ + if ((num << SHIFT) > 16) { \ + d->elem(16) =3D F(s->elem(16)); \ + d->elem(17) =3D F(s->elem(17)); \ + d->elem(18) =3D F(s->elem(18)); \ + d->elem(19) =3D F(s->elem(19)); \ + d->elem(20) =3D F(s->elem(20)); \ + d->elem(21) =3D F(s->elem(21)); \ + d->elem(22) =3D F(s->elem(22)); \ + d->elem(23) =3D F(s->elem(23)); \ + d->elem(24) =3D F(s->elem(24)); \ + d->elem(25) =3D F(s->elem(25)); \ + d->elem(26) =3D F(s->elem(26)); \ + d->elem(27) =3D F(s->elem(27)); \ + d->elem(28) =3D F(s->elem(28)); \ + d->elem(29) =3D F(s->elem(29)); \ + d->elem(30) =3D F(s->elem(30)); \ + d->elem(31) =3D F(s->elem(31)); \ + } \ + } + +#define SSE_HELPER_B(name, F) \ + void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) \ + { \ + d->B(0) =3D F(v->B(0), s->B(0)); \ + d->B(1) =3D F(v->B(1), s->B(1)); \ + d->B(2) =3D F(v->B(2), s->B(2)); \ + d->B(3) =3D F(v->B(3), s->B(3)); \ + d->B(4) =3D F(v->B(4), s->B(4)); \ + d->B(5) =3D F(v->B(5), s->B(5)); \ + d->B(6) =3D F(v->B(6), s->B(6)); \ + d->B(7) =3D F(v->B(7), s->B(7)); \ XMM_ONLY( \ - d->B(8) =3D F(d->B(8), s->B(8)); \ - d->B(9) =3D F(d->B(9), s->B(9)); \ - d->B(10) =3D F(d->B(10), s->B(10)); \ - d->B(11) =3D F(d->B(11), s->B(11)); \ - d->B(12) =3D F(d->B(12), s->B(12)); \ - d->B(13) =3D F(d->B(13), 
s->B(13)); \ - d->B(14) =3D F(d->B(14), s->B(14)); \ - d->B(15) =3D F(d->B(15), s->B(15)); \ + d->B(8) =3D F(v->B(8), s->B(8)); \ + d->B(9) =3D F(v->B(9), s->B(9)); \ + d->B(10) =3D F(v->B(10), s->B(10)); \ + d->B(11) =3D F(v->B(11), s->B(11)); \ + d->B(12) =3D F(v->B(12), s->B(12)); \ + d->B(13) =3D F(v->B(13), s->B(13)); \ + d->B(14) =3D F(v->B(14), s->B(14)); \ + d->B(15) =3D F(v->B(15), s->B(15)); \ + ) \ + YMM_ONLY( \ + d->B(16) =3D F(v->B(16), s->B(16)); \ + d->B(17) =3D F(v->B(17), s->B(17)); \ + d->B(18) =3D F(v->B(18), s->B(18)); \ + d->B(19) =3D F(v->B(19), s->B(19)); \ + d->B(20) =3D F(v->B(20), s->B(20)); \ + d->B(21) =3D F(v->B(21), s->B(21)); \ + d->B(22) =3D F(v->B(22), s->B(22)); \ + d->B(23) =3D F(v->B(23), s->B(23)); \ + d->B(24) =3D F(v->B(24), s->B(24)); \ + d->B(25) =3D F(v->B(25), s->B(25)); \ + d->B(26) =3D F(v->B(26), s->B(26)); \ + d->B(27) =3D F(v->B(27), s->B(27)); \ + d->B(28) =3D F(v->B(28), s->B(28)); \ + d->B(29) =3D F(v->B(29), s->B(29)); \ + d->B(30) =3D F(v->B(30), s->B(30)); \ + d->B(31) =3D F(v->B(31), s->B(31)); \ ) \ } =20 #define SSE_HELPER_W(name, F) \ - void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \ + void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) \ { \ - d->W(0) =3D F(d->W(0), s->W(0)); \ - d->W(1) =3D F(d->W(1), s->W(1)); \ - d->W(2) =3D F(d->W(2), s->W(2)); \ - d->W(3) =3D F(d->W(3), s->W(3)); \ + d->W(0) =3D F(v->W(0), s->W(0)); \ + d->W(1) =3D F(v->W(1), s->W(1)); \ + d->W(2) =3D F(v->W(2), s->W(2)); \ + d->W(3) =3D F(v->W(3), s->W(3)); \ XMM_ONLY( \ - d->W(4) =3D F(d->W(4), s->W(4)); \ - d->W(5) =3D F(d->W(5), s->W(5)); \ - d->W(6) =3D F(d->W(6), s->W(6)); \ - d->W(7) =3D F(d->W(7), s->W(7)); \ + d->W(4) =3D F(v->W(4), s->W(4)); \ + d->W(5) =3D F(v->W(5), s->W(5)); \ + d->W(6) =3D F(v->W(6), s->W(6)); \ + d->W(7) =3D F(v->W(7), s->W(7)); \ + ) \ + YMM_ONLY( \ + d->W(8) =3D F(v->W(8), s->W(8)); \ + d->W(9) =3D F(v->W(9), s->W(9)); \ + d->W(10) =3D F(v->W(10), s->W(10)); \ + d->W(11) =3D 
F(v->W(11), s->W(11)); \ + d->W(12) =3D F(v->W(12), s->W(12)); \ + d->W(13) =3D F(v->W(13), s->W(13)); \ + d->W(14) =3D F(v->W(14), s->W(14)); \ + d->W(15) =3D F(v->W(15), s->W(15)); \ ) \ } =20 #define SSE_HELPER_L(name, F) \ - void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \ + void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) \ { \ - d->L(0) =3D F(d->L(0), s->L(0)); \ - d->L(1) =3D F(d->L(1), s->L(1)); \ + d->L(0) =3D F(v->L(0), s->L(0)); \ + d->L(1) =3D F(v->L(1), s->L(1)); \ XMM_ONLY( \ - d->L(2) =3D F(d->L(2), s->L(2)); \ - d->L(3) =3D F(d->L(3), s->L(3)); \ + d->L(2) =3D F(v->L(2), s->L(2)); \ + d->L(3) =3D F(v->L(3), s->L(3)); \ + ) \ + YMM_ONLY( \ + d->L(4) =3D F(v->L(4), s->L(4)); \ + d->L(5) =3D F(v->L(5), s->L(5)); \ + d->L(6) =3D F(v->L(6), s->L(6)); \ + d->L(7) =3D F(v->L(7), s->L(7)); \ ) \ } =20 #define SSE_HELPER_Q(name, F) \ - void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \ + void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) \ { \ - d->Q(0) =3D F(d->Q(0), s->Q(0)); \ + d->Q(0) =3D F(v->Q(0), s->Q(0)); \ XMM_ONLY( \ - d->Q(1) =3D F(d->Q(1), s->Q(1)); \ + d->Q(1) =3D F(v->Q(1), s->Q(1)); \ + ) \ + YMM_ONLY( \ + d->Q(2) =3D F(v->Q(2), s->Q(2)); \ + d->Q(3) =3D F(v->Q(3), s->Q(3)); \ ) \ } =20 @@ -411,30 +507,41 @@ SSE_HELPER_W(helper_pcmpeqw, FCMPEQ) SSE_HELPER_L(helper_pcmpeql, FCMPEQ) =20 SSE_HELPER_W(helper_pmullw, FMULLW) -#if SHIFT =3D=3D 0 -SSE_HELPER_W(helper_pmulhrw, FMULHRW) -#endif SSE_HELPER_W(helper_pmulhuw, FMULHUW) SSE_HELPER_W(helper_pmulhw, FMULHW) =20 +#if SHIFT =3D=3D 0 +void glue(helper_pmulhrw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +{ + d->W(0) =3D FMULHRW(d->W(0), s->W(0)); + d->W(1) =3D FMULHRW(d->W(1), s->W(1)); + d->W(2) =3D FMULHRW(d->W(2), s->W(2)); + d->W(3) =3D FMULHRW(d->W(3), s->W(3)); +} +#endif + SSE_HELPER_B(helper_pavgb, FAVG) SSE_HELPER_W(helper_pavgw, FAVG) =20 -void glue(helper_pmuludq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +void glue(helper_pmuludq, 
SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) { - d->Q(0) =3D (uint64_t)s->L(0) * (uint64_t)d->L(0); -#if SHIFT =3D=3D 1 - d->Q(1) =3D (uint64_t)s->L(2) * (uint64_t)d->L(2); + d->Q(0) =3D (uint64_t)s->L(0) * (uint64_t)v->L(0); +#if SHIFT >=3D 1 + d->Q(1) =3D (uint64_t)s->L(2) * (uint64_t)v->L(2); +#if SHIFT =3D=3D 2 + d->Q(2) =3D (uint64_t)s->L(4) * (uint64_t)v->L(4); + d->Q(3) =3D (uint64_t)s->L(6) * (uint64_t)v->L(6); +#endif #endif } =20 -void glue(helper_pmaddwd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +void glue(helper_pmaddwd, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) { int i; =20 for (i =3D 0; i < (2 << SHIFT); i++) { - d->L(i) =3D (int16_t)s->W(2 * i) * (int16_t)d->W(2 * i) + - (int16_t)s->W(2 * i + 1) * (int16_t)d->W(2 * i + 1); + d->L(i) =3D (int16_t)s->W(2 * i) * (int16_t)v->W(2 * i) + + (int16_t)s->W(2 * i + 1) * (int16_t)v->W(2 * i + 1); } } =20 @@ -448,34 +555,57 @@ static inline int abs1(int a) } } #endif -void glue(helper_psadbw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +void glue(helper_psadbw, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) { unsigned int val; =20 val =3D 0; - val +=3D abs1(d->B(0) - s->B(0)); - val +=3D abs1(d->B(1) - s->B(1)); - val +=3D abs1(d->B(2) - s->B(2)); - val +=3D abs1(d->B(3) - s->B(3)); - val +=3D abs1(d->B(4) - s->B(4)); - val +=3D abs1(d->B(5) - s->B(5)); - val +=3D abs1(d->B(6) - s->B(6)); - val +=3D abs1(d->B(7) - s->B(7)); + val +=3D abs1(v->B(0) - s->B(0)); + val +=3D abs1(v->B(1) - s->B(1)); + val +=3D abs1(v->B(2) - s->B(2)); + val +=3D abs1(v->B(3) - s->B(3)); + val +=3D abs1(v->B(4) - s->B(4)); + val +=3D abs1(v->B(5) - s->B(5)); + val +=3D abs1(v->B(6) - s->B(6)); + val +=3D abs1(v->B(7) - s->B(7)); d->Q(0) =3D val; -#if SHIFT =3D=3D 1 +#if SHIFT >=3D 1 val =3D 0; - val +=3D abs1(d->B(8) - s->B(8)); - val +=3D abs1(d->B(9) - s->B(9)); - val +=3D abs1(d->B(10) - s->B(10)); - val +=3D abs1(d->B(11) - s->B(11)); - val +=3D abs1(d->B(12) - s->B(12)); - val +=3D abs1(d->B(13) - s->B(13)); - val 
+=3D abs1(d->B(14) - s->B(14)); - val +=3D abs1(d->B(15) - s->B(15)); + val +=3D abs1(v->B(8) - s->B(8)); + val +=3D abs1(v->B(9) - s->B(9)); + val +=3D abs1(v->B(10) - s->B(10)); + val +=3D abs1(v->B(11) - s->B(11)); + val +=3D abs1(v->B(12) - s->B(12)); + val +=3D abs1(v->B(13) - s->B(13)); + val +=3D abs1(v->B(14) - s->B(14)); + val +=3D abs1(v->B(15) - s->B(15)); d->Q(1) =3D val; +#if SHIFT =3D=3D 2 + val =3D 0; + val +=3D abs1(v->B(16) - s->B(16)); + val +=3D abs1(v->B(17) - s->B(17)); + val +=3D abs1(v->B(18) - s->B(18)); + val +=3D abs1(v->B(19) - s->B(19)); + val +=3D abs1(v->B(20) - s->B(20)); + val +=3D abs1(v->B(21) - s->B(21)); + val +=3D abs1(v->B(22) - s->B(22)); + val +=3D abs1(v->B(23) - s->B(23)); + d->Q(2) =3D val; + val =3D 0; + val +=3D abs1(v->B(24) - s->B(24)); + val +=3D abs1(v->B(25) - s->B(25)); + val +=3D abs1(v->B(26) - s->B(26)); + val +=3D abs1(v->B(27) - s->B(27)); + val +=3D abs1(v->B(28) - s->B(28)); + val +=3D abs1(v->B(29) - s->B(29)); + val +=3D abs1(v->B(30) - s->B(30)); + val +=3D abs1(v->B(31) - s->B(31)); + d->Q(3) =3D val; +#endif #endif } =20 +#if SHIFT < 2 void glue(helper_maskmov, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, target_ulong a0) { @@ -487,13 +617,18 @@ void glue(helper_maskmov, SUFFIX)(CPUX86State *env, R= eg *d, Reg *s, } } } +#endif =20 void glue(helper_movl_mm_T0, SUFFIX)(Reg *d, uint32_t val) { d->L(0) =3D val; d->L(1) =3D 0; -#if SHIFT =3D=3D 1 +#if SHIFT >=3D 1 d->Q(1) =3D 0; +#if SHIFT =3D=3D 2 + d->Q(2) =3D 0; + d->Q(3) =3D 0; +#endif #endif } =20 @@ -501,114 +636,152 @@ void glue(helper_movl_mm_T0, SUFFIX)(Reg *d, uint32= _t val) void glue(helper_movq_mm_T0, SUFFIX)(Reg *d, uint64_t val) { d->Q(0) =3D val; -#if SHIFT =3D=3D 1 +#if SHIFT >=3D 1 d->Q(1) =3D 0; +#if SHIFT =3D=3D 2 + d->Q(2) =3D 0; + d->Q(3) =3D 0; +#endif #endif } #endif =20 +#define SHUFFLE4(F, a, b, offset) do { \ + r0 =3D a->F((order & 3) + offset); \ + r1 =3D a->F(((order >> 2) & 3) + offset); \ + r2 =3D b->F(((order >> 4) & 3) + 
offset); \ + r3 =3D b->F(((order >> 6) & 3) + offset); \ + d->F(offset) =3D r0; \ + d->F(offset + 1) =3D r1; \ + d->F(offset + 2) =3D r2; \ + d->F(offset + 3) =3D r3; \ + } while (0) + #if SHIFT =3D=3D 0 void glue(helper_pshufw, SUFFIX)(Reg *d, Reg *s, int order) { - Reg r; + uint16_t r0, r1, r2, r3; =20 - r.W(0) =3D s->W(order & 3); - r.W(1) =3D s->W((order >> 2) & 3); - r.W(2) =3D s->W((order >> 4) & 3); - r.W(3) =3D s->W((order >> 6) & 3); - *d =3D r; + SHUFFLE4(W, s, s, 0); } #else -void helper_shufps(Reg *d, Reg *s, int order) +void glue(helper_shufps, SUFFIX)(Reg *d, Reg *v, Reg *s, int order) { - Reg r; + uint32_t r0, r1, r2, r3; =20 - r.L(0) =3D d->L(order & 3); - r.L(1) =3D d->L((order >> 2) & 3); - r.L(2) =3D s->L((order >> 4) & 3); - r.L(3) =3D s->L((order >> 6) & 3); - *d =3D r; + SHUFFLE4(L, v, s, 0); +#if SHIFT =3D=3D 2 + SHUFFLE4(L, v, s, 4); +#endif } =20 -void helper_shufpd(Reg *d, Reg *s, int order) +void glue(helper_shufpd, SUFFIX)(Reg *d, Reg *v, Reg *s, int order) { - Reg r; + uint64_t r0, r1; =20 - r.Q(0) =3D d->Q(order & 1); - r.Q(1) =3D s->Q((order >> 1) & 1); - *d =3D r; + r0 =3D v->Q(order & 1); + r1 =3D s->Q((order >> 1) & 1); + d->Q(0) =3D r0; + d->Q(1) =3D r1; +#if SHIFT =3D=3D 2 + r0 =3D v->Q(((order >> 2) & 1) + 2); + r1 =3D s->Q(((order >> 3) & 1) + 2); + d->Q(2) =3D r0; + d->Q(3) =3D r1; +#endif } =20 void glue(helper_pshufd, SUFFIX)(Reg *d, Reg *s, int order) { - Reg r; + uint32_t r0, r1, r2, r3; =20 - r.L(0) =3D s->L(order & 3); - r.L(1) =3D s->L((order >> 2) & 3); - r.L(2) =3D s->L((order >> 4) & 3); - r.L(3) =3D s->L((order >> 6) & 3); - *d =3D r; + SHUFFLE4(L, s, s, 0); +#if SHIFT =3D=3D 2 + SHUFFLE4(L, s, s, 4); +#endif } =20 void glue(helper_pshuflw, SUFFIX)(Reg *d, Reg *s, int order) { - Reg r; + uint16_t r0, r1, r2, r3; =20 - r.W(0) =3D s->W(order & 3); - r.W(1) =3D s->W((order >> 2) & 3); - r.W(2) =3D s->W((order >> 4) & 3); - r.W(3) =3D s->W((order >> 6) & 3); - r.Q(1) =3D s->Q(1); - *d =3D r; + SHUFFLE4(W, s, s, 0); + 
d->Q(1) =3D s->Q(1); +#if SHIFT =3D=3D 2 + SHUFFLE4(W, s, s, 8); + d->Q(3) =3D s->Q(3); +#endif } =20 void glue(helper_pshufhw, SUFFIX)(Reg *d, Reg *s, int order) { - Reg r; + uint16_t r0, r1, r2, r3; =20 - r.Q(0) =3D s->Q(0); - r.W(4) =3D s->W(4 + (order & 3)); - r.W(5) =3D s->W(4 + ((order >> 2) & 3)); - r.W(6) =3D s->W(4 + ((order >> 4) & 3)); - r.W(7) =3D s->W(4 + ((order >> 6) & 3)); - *d =3D r; + d->Q(0) =3D s->Q(0); + SHUFFLE4(W, s, s, 4); +#if SHIFT =3D=3D 2 + d->Q(2) =3D s->Q(2); + SHUFFLE4(W, s, s, 12); +#endif } #endif =20 -#if SHIFT =3D=3D 1 +#if SHIFT >=3D 1 /* FPU ops */ /* XXX: not accurate */ =20 -#define SSE_HELPER_S(name, F) \ - void helper_ ## name ## ps(CPUX86State *env, Reg *d, Reg *s) \ +#define SSE_HELPER_P(name, F) \ + void glue(helper_ ## name ## ps, SUFFIX)(CPUX86State *env, \ + Reg *d, Reg *v, Reg *s) \ { \ - d->ZMM_S(0) =3D F(32, d->ZMM_S(0), s->ZMM_S(0)); \ - d->ZMM_S(1) =3D F(32, d->ZMM_S(1), s->ZMM_S(1)); \ - d->ZMM_S(2) =3D F(32, d->ZMM_S(2), s->ZMM_S(2)); \ - d->ZMM_S(3) =3D F(32, d->ZMM_S(3), s->ZMM_S(3)); \ + d->ZMM_S(0) =3D F(32, v->ZMM_S(0), s->ZMM_S(0)); \ + d->ZMM_S(1) =3D F(32, v->ZMM_S(1), s->ZMM_S(1)); \ + d->ZMM_S(2) =3D F(32, v->ZMM_S(2), s->ZMM_S(2)); \ + d->ZMM_S(3) =3D F(32, v->ZMM_S(3), s->ZMM_S(3)); \ + YMM_ONLY( \ + d->ZMM_S(4) =3D F(32, v->ZMM_S(4), s->ZMM_S(4)); \ + d->ZMM_S(5) =3D F(32, v->ZMM_S(5), s->ZMM_S(5)); \ + d->ZMM_S(6) =3D F(32, v->ZMM_S(6), s->ZMM_S(6)); \ + d->ZMM_S(7) =3D F(32, v->ZMM_S(7), s->ZMM_S(7)); \ + ) \ } \ \ - void helper_ ## name ## ss(CPUX86State *env, Reg *d, Reg *s) \ + void glue(helper_ ## name ## pd, SUFFIX)(CPUX86State *env, \ + Reg *d, Reg *v, Reg *s) \ { \ - d->ZMM_S(0) =3D F(32, d->ZMM_S(0), s->ZMM_S(0)); \ - } \ + d->ZMM_D(0) =3D F(64, v->ZMM_D(0), s->ZMM_D(0)); \ + d->ZMM_D(1) =3D F(64, v->ZMM_D(1), s->ZMM_D(1)); \ + YMM_ONLY( \ + d->ZMM_D(2) =3D F(64, v->ZMM_D(2), s->ZMM_D(2)); \ + d->ZMM_D(3) =3D F(64, v->ZMM_D(3), s->ZMM_D(3)); \ + ) \ + } + +#if SHIFT =3D=3D 1 + +#define 
SSE_HELPER_S(name, F) \ + SSE_HELPER_P(name, F) \ \ - void helper_ ## name ## pd(CPUX86State *env, Reg *d, Reg *s) \ + void helper_ ## name ## ss(CPUX86State *env, Reg *d, Reg *v, Reg *s)\ { \ - d->ZMM_D(0) =3D F(64, d->ZMM_D(0), s->ZMM_D(0)); \ - d->ZMM_D(1) =3D F(64, d->ZMM_D(1), s->ZMM_D(1)); \ + d->ZMM_S(0) =3D F(32, v->ZMM_S(0), s->ZMM_S(0)); \ } \ \ - void helper_ ## name ## sd(CPUX86State *env, Reg *d, Reg *s) \ + void helper_ ## name ## sd(CPUX86State *env, Reg *d, Reg *v, Reg *s)\ { \ - d->ZMM_D(0) =3D F(64, d->ZMM_D(0), s->ZMM_D(0)); \ + d->ZMM_D(0) =3D F(64, v->ZMM_D(0), s->ZMM_D(0)); \ } =20 +#else + +#define SSE_HELPER_S(name, F) SSE_HELPER_P(name, F) + +#endif + #define FPU_ADD(size, a, b) float ## size ## _add(a, b, &env->sse_status) #define FPU_SUB(size, a, b) float ## size ## _sub(a, b, &env->sse_status) #define FPU_MUL(size, a, b) float ## size ## _mul(a, b, &env->sse_status) #define FPU_DIV(size, a, b) float ## size ## _div(a, b, &env->sse_status) -#define FPU_SQRT(size, a, b) float ## size ## _sqrt(b, &env->sse_status) =20 /* Note that the choice of comparison op here is important to get the * special cases right: for min and max Intel specifies that (-0,0), @@ -625,27 +798,76 @@ SSE_HELPER_S(mul, FPU_MUL) SSE_HELPER_S(div, FPU_DIV) SSE_HELPER_S(min, FPU_MIN) SSE_HELPER_S(max, FPU_MAX) -SSE_HELPER_S(sqrt, FPU_SQRT) =20 +void glue(helper_sqrtps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +{ + d->ZMM_S(0) =3D float32_sqrt(s->ZMM_S(0), &env->sse_status); + d->ZMM_S(1) =3D float32_sqrt(s->ZMM_S(1), &env->sse_status); + d->ZMM_S(2) =3D float32_sqrt(s->ZMM_S(2), &env->sse_status); + d->ZMM_S(3) =3D float32_sqrt(s->ZMM_S(3), &env->sse_status); +#if SHIFT =3D=3D 2 + d->ZMM_S(4) =3D float32_sqrt(s->ZMM_S(4), &env->sse_status); + d->ZMM_S(5) =3D float32_sqrt(s->ZMM_S(5), &env->sse_status); + d->ZMM_S(6) =3D float32_sqrt(s->ZMM_S(6), &env->sse_status); + d->ZMM_S(7) =3D float32_sqrt(s->ZMM_S(7), &env->sse_status); +#endif +} + +void glue(helper_sqrtpd, 
SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +{ + d->ZMM_D(0) =3D float64_sqrt(s->ZMM_D(0), &env->sse_status); + d->ZMM_D(1) =3D float64_sqrt(s->ZMM_D(1), &env->sse_status); +#if SHIFT =3D=3D 2 + d->ZMM_D(2) =3D float64_sqrt(s->ZMM_D(2), &env->sse_status); + d->ZMM_D(3) =3D float64_sqrt(s->ZMM_D(3), &env->sse_status); +#endif +} + +#if SHIFT =3D=3D 1 +void helper_sqrtss(CPUX86State *env, Reg *d, Reg *s) +{ + d->ZMM_S(0) =3D float32_sqrt(s->ZMM_S(0), &env->sse_status); +} + +void helper_sqrtsd(CPUX86State *env, Reg *d, Reg *s) +{ + d->ZMM_D(0) =3D float64_sqrt(s->ZMM_D(0), &env->sse_status); +} +#endif =20 /* float to float conversions */ -void helper_cvtps2pd(CPUX86State *env, Reg *d, Reg *s) +void glue(helper_cvtps2pd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) { float32 s0, s1; =20 s0 =3D s->ZMM_S(0); s1 =3D s->ZMM_S(1); +#if SHIFT =3D=3D 2 + float32 s2, s3; + s2 =3D s->ZMM_S(2); + s3 =3D s->ZMM_S(3); + d->ZMM_D(2) =3D float32_to_float64(s2, &env->sse_status); + d->ZMM_D(3) =3D float32_to_float64(s3, &env->sse_status); +#endif d->ZMM_D(0) =3D float32_to_float64(s0, &env->sse_status); d->ZMM_D(1) =3D float32_to_float64(s1, &env->sse_status); } =20 -void helper_cvtpd2ps(CPUX86State *env, Reg *d, Reg *s) +void glue(helper_cvtpd2ps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) { d->ZMM_S(0) =3D float64_to_float32(s->ZMM_D(0), &env->sse_status); d->ZMM_S(1) =3D float64_to_float32(s->ZMM_D(1), &env->sse_status); +#if SHIFT =3D=3D 2 + d->ZMM_S(2) =3D float64_to_float32(s->ZMM_D(2), &env->sse_status); + d->ZMM_S(3) =3D float64_to_float32(s->ZMM_D(3), &env->sse_status); + d->Q(2) =3D 0; + d->Q(3) =3D 0; +#else d->Q(1) =3D 0; +#endif } =20 +#if SHIFT =3D=3D 1 void helper_cvtss2sd(CPUX86State *env, Reg *d, Reg *s) { d->ZMM_D(0) =3D float32_to_float64(s->ZMM_S(0), &env->sse_status); @@ -655,26 +877,41 @@ void helper_cvtsd2ss(CPUX86State *env, Reg *d, Reg *s) { d->ZMM_S(0) =3D float64_to_float32(s->ZMM_D(0), &env->sse_status); } +#endif =20 /* integer to float */ -void 
helper_cvtdq2ps(CPUX86State *env, Reg *d, Reg *s) +void glue(helper_cvtdq2ps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) { d->ZMM_S(0) =3D int32_to_float32(s->ZMM_L(0), &env->sse_status); d->ZMM_S(1) =3D int32_to_float32(s->ZMM_L(1), &env->sse_status); d->ZMM_S(2) =3D int32_to_float32(s->ZMM_L(2), &env->sse_status); d->ZMM_S(3) =3D int32_to_float32(s->ZMM_L(3), &env->sse_status); +#if SHIFT =3D=3D 2 + d->ZMM_S(4) =3D int32_to_float32(s->ZMM_L(4), &env->sse_status); + d->ZMM_S(5) =3D int32_to_float32(s->ZMM_L(5), &env->sse_status); + d->ZMM_S(6) =3D int32_to_float32(s->ZMM_L(6), &env->sse_status); + d->ZMM_S(7) =3D int32_to_float32(s->ZMM_L(7), &env->sse_status); +#endif } =20 -void helper_cvtdq2pd(CPUX86State *env, Reg *d, Reg *s) +void glue(helper_cvtdq2pd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) { int32_t l0, l1; =20 l0 =3D (int32_t)s->ZMM_L(0); l1 =3D (int32_t)s->ZMM_L(1); +#if SHIFT =3D=3D 2 + int32_t l2, l3; + l2 =3D (int32_t)s->ZMM_L(2); + l3 =3D (int32_t)s->ZMM_L(3); + d->ZMM_D(2) =3D int32_to_float64(l2, &env->sse_status); + d->ZMM_D(3) =3D int32_to_float64(l3, &env->sse_status); +#endif d->ZMM_D(0) =3D int32_to_float64(l0, &env->sse_status); d->ZMM_D(1) =3D int32_to_float64(l1, &env->sse_status); } =20 +#if SHIFT =3D=3D 1 void helper_cvtpi2ps(CPUX86State *env, ZMMReg *d, MMXReg *s) { d->ZMM_S(0) =3D int32_to_float32(s->MMX_L(0), &env->sse_status); @@ -709,8 +946,11 @@ void helper_cvtsq2sd(CPUX86State *env, ZMMReg *d, uint= 64_t val) } #endif =20 +#endif + /* float to integer */ =20 +#if SHIFT =3D=3D 1 /* * x86 mandates that we return the indefinite integer value for the result * of any float-to-integer conversion that raises the 'invalid' exception. 
@@ -741,22 +981,37 @@ WRAP_FLOATCONV(int64_t, float32_to_int64, float32, IN= T64_MIN) WRAP_FLOATCONV(int64_t, float32_to_int64_round_to_zero, float32, INT64_MIN) WRAP_FLOATCONV(int64_t, float64_to_int64, float64, INT64_MIN) WRAP_FLOATCONV(int64_t, float64_to_int64_round_to_zero, float64, INT64_MIN) +#endif =20 -void helper_cvtps2dq(CPUX86State *env, ZMMReg *d, ZMMReg *s) +void glue(helper_cvtps2dq, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s) { d->ZMM_L(0) =3D x86_float32_to_int32(s->ZMM_S(0), &env->sse_status); d->ZMM_L(1) =3D x86_float32_to_int32(s->ZMM_S(1), &env->sse_status); d->ZMM_L(2) =3D x86_float32_to_int32(s->ZMM_S(2), &env->sse_status); d->ZMM_L(3) =3D x86_float32_to_int32(s->ZMM_S(3), &env->sse_status); +#if SHIFT =3D=3D 2 + d->ZMM_L(4) =3D x86_float32_to_int32(s->ZMM_S(4), &env->sse_status); + d->ZMM_L(5) =3D x86_float32_to_int32(s->ZMM_S(5), &env->sse_status); + d->ZMM_L(6) =3D x86_float32_to_int32(s->ZMM_S(6), &env->sse_status); + d->ZMM_L(7) =3D x86_float32_to_int32(s->ZMM_S(7), &env->sse_status); +#endif } =20 -void helper_cvtpd2dq(CPUX86State *env, ZMMReg *d, ZMMReg *s) +void glue(helper_cvtpd2dq, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s) { d->ZMM_L(0) =3D x86_float64_to_int32(s->ZMM_D(0), &env->sse_status); d->ZMM_L(1) =3D x86_float64_to_int32(s->ZMM_D(1), &env->sse_status); +#if SHIFT =3D=3D 2 + d->ZMM_L(2) =3D x86_float64_to_int32(s->ZMM_D(2), &env->sse_status); + d->ZMM_L(3) =3D x86_float64_to_int32(s->ZMM_D(3), &env->sse_status); + d->Q(2) =3D 0; + d->Q(3) =3D 0; +#else d->ZMM_Q(1) =3D 0; +#endif } =20 +#if SHIFT =3D=3D 1 void helper_cvtps2pi(CPUX86State *env, MMXReg *d, ZMMReg *s) { d->MMX_L(0) =3D x86_float32_to_int32(s->ZMM_S(0), &env->sse_status); @@ -790,33 +1045,64 @@ int64_t helper_cvtsd2sq(CPUX86State *env, ZMMReg *s) return x86_float64_to_int64(s->ZMM_D(0), &env->sse_status); } #endif +#endif =20 /* float to integer truncated */ -void helper_cvttps2dq(CPUX86State *env, ZMMReg *d, ZMMReg *s) -{ - d->ZMM_L(0) =3D 
x86_float32_to_int32_round_to_zero(s->ZMM_S(0), &env->= sse_status); - d->ZMM_L(1) =3D x86_float32_to_int32_round_to_zero(s->ZMM_S(1), &env->= sse_status); - d->ZMM_L(2) =3D x86_float32_to_int32_round_to_zero(s->ZMM_S(2), &env->= sse_status); - d->ZMM_L(3) =3D x86_float32_to_int32_round_to_zero(s->ZMM_S(3), &env->= sse_status); +void glue(helper_cvttps2dq, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s) +{ + d->ZMM_L(0) =3D x86_float32_to_int32_round_to_zero(s->ZMM_S(0), + &env->sse_status); + d->ZMM_L(1) =3D x86_float32_to_int32_round_to_zero(s->ZMM_S(1), + &env->sse_status); + d->ZMM_L(2) =3D x86_float32_to_int32_round_to_zero(s->ZMM_S(2), + &env->sse_status); + d->ZMM_L(3) =3D x86_float32_to_int32_round_to_zero(s->ZMM_S(3), + &env->sse_status); +#if SHIFT =3D=3D 2 + d->ZMM_L(4) =3D x86_float32_to_int32_round_to_zero(s->ZMM_S(4), + &env->sse_status); + d->ZMM_L(5) =3D x86_float32_to_int32_round_to_zero(s->ZMM_S(5), + &env->sse_status); + d->ZMM_L(6) =3D x86_float32_to_int32_round_to_zero(s->ZMM_S(6), + &env->sse_status); + d->ZMM_L(7) =3D x86_float32_to_int32_round_to_zero(s->ZMM_S(7), + &env->sse_status); +#endif } =20 -void helper_cvttpd2dq(CPUX86State *env, ZMMReg *d, ZMMReg *s) +void glue(helper_cvttpd2dq, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s) { - d->ZMM_L(0) =3D x86_float64_to_int32_round_to_zero(s->ZMM_D(0), &env->= sse_status); - d->ZMM_L(1) =3D x86_float64_to_int32_round_to_zero(s->ZMM_D(1), &env->= sse_status); + d->ZMM_L(0) =3D x86_float64_to_int32_round_to_zero(s->ZMM_D(0), + &env->sse_status); + d->ZMM_L(1) =3D x86_float64_to_int32_round_to_zero(s->ZMM_D(1), + &env->sse_status); +#if SHIFT =3D=3D 2 + d->ZMM_L(2) =3D x86_float64_to_int32_round_to_zero(s->ZMM_D(2), + &env->sse_status); + d->ZMM_L(3) =3D x86_float64_to_int32_round_to_zero(s->ZMM_D(3), + &env->sse_status); + d->ZMM_Q(2) =3D 0; + d->ZMM_Q(3) =3D 0; +#else d->ZMM_Q(1) =3D 0; +#endif } =20 +#if SHIFT =3D=3D 1 void helper_cvttps2pi(CPUX86State *env, MMXReg *d, ZMMReg *s) { - 
d->MMX_L(0) =3D x86_float32_to_int32_round_to_zero(s->ZMM_S(0), &env->= sse_status); - d->MMX_L(1) =3D x86_float32_to_int32_round_to_zero(s->ZMM_S(1), &env->= sse_status); + d->MMX_L(0) =3D x86_float32_to_int32_round_to_zero(s->ZMM_S(0), + &env->sse_status); + d->MMX_L(1) =3D x86_float32_to_int32_round_to_zero(s->ZMM_S(1), + &env->sse_status); } =20 void helper_cvttpd2pi(CPUX86State *env, MMXReg *d, ZMMReg *s) { - d->MMX_L(0) =3D x86_float64_to_int32_round_to_zero(s->ZMM_D(0), &env->= sse_status); - d->MMX_L(1) =3D x86_float64_to_int32_round_to_zero(s->ZMM_D(1), &env->= sse_status); + d->MMX_L(0) =3D x86_float64_to_int32_round_to_zero(s->ZMM_D(0), + &env->sse_status); + d->MMX_L(1) =3D x86_float64_to_int32_round_to_zero(s->ZMM_D(1), + &env->sse_status); } =20 int32_t helper_cvttss2si(CPUX86State *env, ZMMReg *s) @@ -840,8 +1126,9 @@ int64_t helper_cvttsd2sq(CPUX86State *env, ZMMReg *s) return x86_float64_to_int64_round_to_zero(s->ZMM_D(0), &env->sse_statu= s); } #endif +#endif =20 -void helper_rsqrtps(CPUX86State *env, ZMMReg *d, ZMMReg *s) +void glue(helper_rsqrtps, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s) { uint8_t old_flags =3D get_float_exception_flags(&env->sse_status); d->ZMM_S(0) =3D float32_div(float32_one, @@ -856,9 +1143,24 @@ void helper_rsqrtps(CPUX86State *env, ZMMReg *d, ZMMR= eg *s) d->ZMM_S(3) =3D float32_div(float32_one, float32_sqrt(s->ZMM_S(3), &env->sse_status), &env->sse_status); +#if SHIFT =3D=3D 2 + d->ZMM_S(4) =3D float32_div(float32_one, + float32_sqrt(s->ZMM_S(4), &env->sse_status), + &env->sse_status); + d->ZMM_S(5) =3D float32_div(float32_one, + float32_sqrt(s->ZMM_S(5), &env->sse_status), + &env->sse_status); + d->ZMM_S(6) =3D float32_div(float32_one, + float32_sqrt(s->ZMM_S(6), &env->sse_status), + &env->sse_status); + d->ZMM_S(7) =3D float32_div(float32_one, + float32_sqrt(s->ZMM_S(7), &env->sse_status), + &env->sse_status); +#endif set_float_exception_flags(old_flags, &env->sse_status); } =20 +#if SHIFT =3D=3D 1 void 
helper_rsqrtss(CPUX86State *env, ZMMReg *d, ZMMReg *s) { uint8_t old_flags =3D get_float_exception_flags(&env->sse_status); @@ -867,24 +1169,34 @@ void helper_rsqrtss(CPUX86State *env, ZMMReg *d, ZMM= Reg *s) &env->sse_status); set_float_exception_flags(old_flags, &env->sse_status); } +#endif =20 -void helper_rcpps(CPUX86State *env, ZMMReg *d, ZMMReg *s) +void glue(helper_rcpps, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s) { uint8_t old_flags =3D get_float_exception_flags(&env->sse_status); d->ZMM_S(0) =3D float32_div(float32_one, s->ZMM_S(0), &env->sse_status= ); d->ZMM_S(1) =3D float32_div(float32_one, s->ZMM_S(1), &env->sse_status= ); d->ZMM_S(2) =3D float32_div(float32_one, s->ZMM_S(2), &env->sse_status= ); d->ZMM_S(3) =3D float32_div(float32_one, s->ZMM_S(3), &env->sse_status= ); +#if SHIFT =3D=3D 2 + d->ZMM_S(4) =3D float32_div(float32_one, s->ZMM_S(4), &env->sse_status= ); + d->ZMM_S(5) =3D float32_div(float32_one, s->ZMM_S(5), &env->sse_status= ); + d->ZMM_S(6) =3D float32_div(float32_one, s->ZMM_S(6), &env->sse_status= ); + d->ZMM_S(7) =3D float32_div(float32_one, s->ZMM_S(7), &env->sse_status= ); +#endif set_float_exception_flags(old_flags, &env->sse_status); } =20 +#if SHIFT =3D=3D 1 void helper_rcpss(CPUX86State *env, ZMMReg *d, ZMMReg *s) { uint8_t old_flags =3D get_float_exception_flags(&env->sse_status); d->ZMM_S(0) =3D float32_div(float32_one, s->ZMM_S(0), &env->sse_status= ); set_float_exception_flags(old_flags, &env->sse_status); } +#endif =20 +#if SHIFT =3D=3D 1 static inline uint64_t helper_extrq(uint64_t src, int shift, int len) { uint64_t mask; @@ -928,113 +1240,213 @@ void helper_insertq_i(CPUX86State *env, ZMMReg *d,= int index, int length) { d->ZMM_Q(0) =3D helper_insertq(d->ZMM_Q(0), index, length); } +#endif =20 -void helper_haddps(CPUX86State *env, ZMMReg *d, ZMMReg *s) -{ - ZMMReg r; - - r.ZMM_S(0) =3D float32_add(d->ZMM_S(0), d->ZMM_S(1), &env->sse_status); - r.ZMM_S(1) =3D float32_add(d->ZMM_S(2), d->ZMM_S(3), &env->sse_status); - 
r.ZMM_S(2) =3D float32_add(s->ZMM_S(0), s->ZMM_S(1), &env->sse_status); - r.ZMM_S(3) =3D float32_add(s->ZMM_S(2), s->ZMM_S(3), &env->sse_status); - *d =3D r; +void glue(helper_haddps, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) +{ + float32 r0, r1, r2, r3; + + r0 =3D float32_add(v->ZMM_S(0), v->ZMM_S(1), &env->sse_status); + r1 =3D float32_add(v->ZMM_S(2), v->ZMM_S(3), &env->sse_status); + r2 =3D float32_add(s->ZMM_S(0), s->ZMM_S(1), &env->sse_status); + r3 =3D float32_add(s->ZMM_S(2), s->ZMM_S(3), &env->sse_status); + d->ZMM_S(0) =3D r0; + d->ZMM_S(1) =3D r1; + d->ZMM_S(2) =3D r2; + d->ZMM_S(3) =3D r3; +#if SHIFT =3D=3D 2 + r0 =3D float32_add(v->ZMM_S(4), v->ZMM_S(5), &env->sse_status); + r1 =3D float32_add(v->ZMM_S(6), v->ZMM_S(7), &env->sse_status); + r2 =3D float32_add(s->ZMM_S(4), s->ZMM_S(5), &env->sse_status); + r3 =3D float32_add(s->ZMM_S(6), s->ZMM_S(7), &env->sse_status); + d->ZMM_S(4) =3D r0; + d->ZMM_S(5) =3D r1; + d->ZMM_S(6) =3D r2; + d->ZMM_S(7) =3D r3; +#endif } =20 -void helper_haddpd(CPUX86State *env, ZMMReg *d, ZMMReg *s) +void glue(helper_haddpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) { - ZMMReg r; + float64 r0, r1; =20 - r.ZMM_D(0) =3D float64_add(d->ZMM_D(0), d->ZMM_D(1), &env->sse_status); - r.ZMM_D(1) =3D float64_add(s->ZMM_D(0), s->ZMM_D(1), &env->sse_status); - *d =3D r; + r0 =3D float64_add(v->ZMM_D(0), v->ZMM_D(1), &env->sse_status); + r1 =3D float64_add(s->ZMM_D(0), s->ZMM_D(1), &env->sse_status); + d->ZMM_D(0) =3D r0; + d->ZMM_D(1) =3D r1; +#if SHIFT =3D=3D 2 + r0 =3D float64_add(v->ZMM_D(2), v->ZMM_D(3), &env->sse_status); + r1 =3D float64_add(s->ZMM_D(2), s->ZMM_D(3), &env->sse_status); + d->ZMM_D(2) =3D r0; + d->ZMM_D(3) =3D r1; +#endif } =20 -void helper_hsubps(CPUX86State *env, ZMMReg *d, ZMMReg *s) -{ - ZMMReg r; - - r.ZMM_S(0) =3D float32_sub(d->ZMM_S(0), d->ZMM_S(1), &env->sse_status); - r.ZMM_S(1) =3D float32_sub(d->ZMM_S(2), d->ZMM_S(3), &env->sse_status); - r.ZMM_S(2) =3D float32_sub(s->ZMM_S(0), 
s->ZMM_S(1), &env->sse_status); - r.ZMM_S(3) =3D float32_sub(s->ZMM_S(2), s->ZMM_S(3), &env->sse_status); - *d =3D r; +void glue(helper_hsubps, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) +{ + float32 r0, r1, r2, r3; + + r0 =3D float32_sub(v->ZMM_S(0), v->ZMM_S(1), &env->sse_status); + r1 =3D float32_sub(v->ZMM_S(2), v->ZMM_S(3), &env->sse_status); + r2 =3D float32_sub(s->ZMM_S(0), s->ZMM_S(1), &env->sse_status); + r3 =3D float32_sub(s->ZMM_S(2), s->ZMM_S(3), &env->sse_status); + d->ZMM_S(0) =3D r0; + d->ZMM_S(1) =3D r1; + d->ZMM_S(2) =3D r2; + d->ZMM_S(3) =3D r3; +#if SHIFT =3D=3D 2 + r0 =3D float32_sub(v->ZMM_S(4), v->ZMM_S(5), &env->sse_status); + r1 =3D float32_sub(v->ZMM_S(6), v->ZMM_S(7), &env->sse_status); + r2 =3D float32_sub(s->ZMM_S(4), s->ZMM_S(5), &env->sse_status); + r3 =3D float32_sub(s->ZMM_S(6), s->ZMM_S(7), &env->sse_status); + d->ZMM_S(4) =3D r0; + d->ZMM_S(5) =3D r1; + d->ZMM_S(6) =3D r2; + d->ZMM_S(7) =3D r3; +#endif } =20 -void helper_hsubpd(CPUX86State *env, ZMMReg *d, ZMMReg *s) +void glue(helper_hsubpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) { - ZMMReg r; + float64 r0, r1; =20 - r.ZMM_D(0) =3D float64_sub(d->ZMM_D(0), d->ZMM_D(1), &env->sse_status); - r.ZMM_D(1) =3D float64_sub(s->ZMM_D(0), s->ZMM_D(1), &env->sse_status); - *d =3D r; + r0 =3D float64_sub(v->ZMM_D(0), v->ZMM_D(1), &env->sse_status); + r1 =3D float64_sub(s->ZMM_D(0), s->ZMM_D(1), &env->sse_status); + d->ZMM_D(0) =3D r0; + d->ZMM_D(1) =3D r1; +#if SHIFT =3D=3D 2 + r0 =3D float64_sub(v->ZMM_D(2), v->ZMM_D(3), &env->sse_status); + r1 =3D float64_sub(s->ZMM_D(2), s->ZMM_D(3), &env->sse_status); + d->ZMM_D(2) =3D r0; + d->ZMM_D(3) =3D r1; +#endif } =20 -void helper_addsubps(CPUX86State *env, ZMMReg *d, ZMMReg *s) +void glue(helper_addsubps, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *= s) { - d->ZMM_S(0) =3D float32_sub(d->ZMM_S(0), s->ZMM_S(0), &env->sse_status= ); - d->ZMM_S(1) =3D float32_add(d->ZMM_S(1), s->ZMM_S(1), &env->sse_status= ); - d->ZMM_S(2) =3D 
float32_sub(d->ZMM_S(2), s->ZMM_S(2), &env->sse_status= ); - d->ZMM_S(3) =3D float32_add(d->ZMM_S(3), s->ZMM_S(3), &env->sse_status= ); + d->ZMM_S(0) =3D float32_sub(v->ZMM_S(0), s->ZMM_S(0), &env->sse_status= ); + d->ZMM_S(1) =3D float32_add(v->ZMM_S(1), s->ZMM_S(1), &env->sse_status= ); + d->ZMM_S(2) =3D float32_sub(v->ZMM_S(2), s->ZMM_S(2), &env->sse_status= ); + d->ZMM_S(3) =3D float32_add(v->ZMM_S(3), s->ZMM_S(3), &env->sse_status= ); +#if SHIFT =3D=3D 2 + d->ZMM_S(4) =3D float32_sub(v->ZMM_S(4), s->ZMM_S(4), &env->sse_status= ); + d->ZMM_S(5) =3D float32_add(v->ZMM_S(5), s->ZMM_S(5), &env->sse_status= ); + d->ZMM_S(6) =3D float32_sub(v->ZMM_S(6), s->ZMM_S(6), &env->sse_status= ); + d->ZMM_S(7) =3D float32_add(v->ZMM_S(7), s->ZMM_S(7), &env->sse_status= ); +#endif } =20 -void helper_addsubpd(CPUX86State *env, ZMMReg *d, ZMMReg *s) +void glue(helper_addsubpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *= s) { - d->ZMM_D(0) =3D float64_sub(d->ZMM_D(0), s->ZMM_D(0), &env->sse_status= ); - d->ZMM_D(1) =3D float64_add(d->ZMM_D(1), s->ZMM_D(1), &env->sse_status= ); + d->ZMM_D(0) =3D float64_sub(v->ZMM_D(0), s->ZMM_D(0), &env->sse_status= ); + d->ZMM_D(1) =3D float64_add(v->ZMM_D(1), s->ZMM_D(1), &env->sse_status= ); +#if SHIFT =3D=3D 2 + d->ZMM_D(2) =3D float64_sub(v->ZMM_D(2), s->ZMM_D(2), &env->sse_status= ); + d->ZMM_D(3) =3D float64_add(v->ZMM_D(3), s->ZMM_D(3), &env->sse_status= ); +#endif } =20 -/* XXX: unordered */ -#define SSE_HELPER_CMP(name, F) \ - void helper_ ## name ## ps(CPUX86State *env, Reg *d, Reg *s) \ - { \ - d->ZMM_L(0) =3D F(32, d->ZMM_S(0), s->ZMM_S(0)); \ - d->ZMM_L(1) =3D F(32, d->ZMM_S(1), s->ZMM_S(1)); \ - d->ZMM_L(2) =3D F(32, d->ZMM_S(2), s->ZMM_S(2)); \ - d->ZMM_L(3) =3D F(32, d->ZMM_S(3), s->ZMM_S(3)); \ - } \ - \ - void helper_ ## name ## ss(CPUX86State *env, Reg *d, Reg *s) \ - { \ - d->ZMM_L(0) =3D F(32, d->ZMM_S(0), s->ZMM_S(0)); \ - } \ - \ - void helper_ ## name ## pd(CPUX86State *env, Reg *d, Reg *s) \ +#define 
SSE_HELPER_CMP_P(name, F, C) \ + void glue(helper_ ## name ## ps, SUFFIX)(CPUX86State *env, \ + Reg *d, Reg *v, Reg *s) \ { \ - d->ZMM_Q(0) =3D F(64, d->ZMM_D(0), s->ZMM_D(0)); \ - d->ZMM_Q(1) =3D F(64, d->ZMM_D(1), s->ZMM_D(1)); \ + d->ZMM_L(0) =3D F(32, C, v->ZMM_S(0), s->ZMM_S(0)); \ + d->ZMM_L(1) =3D F(32, C, v->ZMM_S(1), s->ZMM_S(1)); \ + d->ZMM_L(2) =3D F(32, C, v->ZMM_S(2), s->ZMM_S(2)); \ + d->ZMM_L(3) =3D F(32, C, v->ZMM_S(3), s->ZMM_S(3)); \ + YMM_ONLY( \ + d->ZMM_L(4) =3D F(32, C, v->ZMM_S(4), s->ZMM_S(4)); \ + d->ZMM_L(5) =3D F(32, C, v->ZMM_S(5), s->ZMM_S(5)); \ + d->ZMM_L(6) =3D F(32, C, v->ZMM_S(6), s->ZMM_S(6)); \ + d->ZMM_L(7) =3D F(32, C, v->ZMM_S(7), s->ZMM_S(7)); \ + ) \ } \ \ - void helper_ ## name ## sd(CPUX86State *env, Reg *d, Reg *s) \ + void glue(helper_ ## name ## pd, SUFFIX)(CPUX86State *env, \ + Reg *d, Reg *v, Reg *s) \ { \ - d->ZMM_Q(0) =3D F(64, d->ZMM_D(0), s->ZMM_D(0)); \ - } - -#define FPU_CMPEQ(size, a, b) \ - (float ## size ## _eq_quiet(a, b, &env->sse_status) ? -1 : 0) -#define FPU_CMPLT(size, a, b) \ - (float ## size ## _lt(a, b, &env->sse_status) ? -1 : 0) -#define FPU_CMPLE(size, a, b) \ - (float ## size ## _le(a, b, &env->sse_status) ? -1 : 0) -#define FPU_CMPUNORD(size, a, b) \ - (float ## size ## _unordered_quiet(a, b, &env->sse_status) ? -1 : 0) -#define FPU_CMPNEQ(size, a, b) \ - (float ## size ## _eq_quiet(a, b, &env->sse_status) ? 0 : -1) -#define FPU_CMPNLT(size, a, b) \ - (float ## size ## _lt(a, b, &env->sse_status) ? 0 : -1) -#define FPU_CMPNLE(size, a, b) \ - (float ## size ## _le(a, b, &env->sse_status) ? 0 : -1) -#define FPU_CMPORD(size, a, b) \ - (float ## size ## _unordered_quiet(a, b, &env->sse_status) ? 
0 : -1) - -SSE_HELPER_CMP(cmpeq, FPU_CMPEQ) -SSE_HELPER_CMP(cmplt, FPU_CMPLT) -SSE_HELPER_CMP(cmple, FPU_CMPLE) -SSE_HELPER_CMP(cmpunord, FPU_CMPUNORD) -SSE_HELPER_CMP(cmpneq, FPU_CMPNEQ) -SSE_HELPER_CMP(cmpnlt, FPU_CMPNLT) -SSE_HELPER_CMP(cmpnle, FPU_CMPNLE) -SSE_HELPER_CMP(cmpord, FPU_CMPORD) + d->ZMM_Q(0) =3D F(64, C, v->ZMM_D(0), s->ZMM_D(0)); \ + d->ZMM_Q(1) =3D F(64, C, v->ZMM_D(1), s->ZMM_D(1)); \ + YMM_ONLY( \ + d->ZMM_Q(2) =3D F(64, C, v->ZMM_D(2), s->ZMM_D(2)); \ + d->ZMM_Q(3) =3D F(64, C, v->ZMM_D(3), s->ZMM_D(3)); \ + ) \ + } + +#if SHIFT =3D=3D 1 +#define SSE_HELPER_CMP(name, F, C) = \ + SSE_HELPER_CMP_P(name, F, C) = \ + void helper_ ## name ## ss(CPUX86State *env, Reg *d, Reg *v, Reg *s) = \ + { = \ + d->ZMM_L(0) =3D F(32, C, v->ZMM_S(0), s->ZMM_S(0)); = \ + } = \ + = \ + void helper_ ## name ## sd(CPUX86State *env, Reg *d, Reg *v, Reg *s) = \ + { = \ + d->ZMM_Q(0) =3D F(64, C, v->ZMM_D(0), s->ZMM_D(0)); = \ + } + +static inline bool FPU_EQU(FloatRelation x) +{ + return (x =3D=3D float_relation_equal || x =3D=3D float_relation_unord= ered); +} +static inline bool FPU_GE(FloatRelation x) +{ + return (x =3D=3D float_relation_equal || x =3D=3D float_relation_great= er); +} +#define FPU_EQ(x) (x =3D=3D float_relation_equal) +#define FPU_LT(x) (x =3D=3D float_relation_less) +#define FPU_LE(x) (x <=3D float_relation_equal) +#define FPU_GT(x) (x =3D=3D float_relation_greater) +#define FPU_UNORD(x) (x =3D=3D float_relation_unordered) +#define FPU_FALSE(x) 0 + +#define FPU_CMPQ(size, COND, a, b) \ + (COND(float ## size ## _compare_quiet(a, b, &env->sse_status)) ? -1 : = 0) +#define FPU_CMPS(size, COND, a, b) \ + (COND(float ## size ## _compare(a, b, &env->sse_status)) ? 
-1 : 0) + +#else +#define SSE_HELPER_CMP(name, F, C) SSE_HELPER_CMP_P(name, F, C) +#endif =20 +SSE_HELPER_CMP(cmpeq, FPU_CMPQ, FPU_EQ) +SSE_HELPER_CMP(cmplt, FPU_CMPS, FPU_LT) +SSE_HELPER_CMP(cmple, FPU_CMPS, FPU_LE) +SSE_HELPER_CMP(cmpunord, FPU_CMPQ, FPU_UNORD) +SSE_HELPER_CMP(cmpneq, FPU_CMPQ, !FPU_EQ) +SSE_HELPER_CMP(cmpnlt, FPU_CMPS, !FPU_LT) +SSE_HELPER_CMP(cmpnle, FPU_CMPS, !FPU_LE) +SSE_HELPER_CMP(cmpord, FPU_CMPQ, !FPU_UNORD) + +SSE_HELPER_CMP(cmpequ, FPU_CMPQ, FPU_EQU) +SSE_HELPER_CMP(cmpnge, FPU_CMPS, !FPU_GE) +SSE_HELPER_CMP(cmpngt, FPU_CMPS, !FPU_GT) +SSE_HELPER_CMP(cmpfalse, FPU_CMPQ, FPU_FALSE) +SSE_HELPER_CMP(cmpnequ, FPU_CMPQ, !FPU_EQU) +SSE_HELPER_CMP(cmpge, FPU_CMPS, FPU_GE) +SSE_HELPER_CMP(cmpgt, FPU_CMPS, FPU_GT) +SSE_HELPER_CMP(cmptrue, FPU_CMPQ, !FPU_FALSE) + +SSE_HELPER_CMP(cmpeqs, FPU_CMPS, FPU_EQ) +SSE_HELPER_CMP(cmpltq, FPU_CMPQ, FPU_LT) +SSE_HELPER_CMP(cmpleq, FPU_CMPQ, FPU_LE) +SSE_HELPER_CMP(cmpunords, FPU_CMPS, FPU_UNORD) +SSE_HELPER_CMP(cmpneqq, FPU_CMPS, !FPU_EQ) +SSE_HELPER_CMP(cmpnltq, FPU_CMPQ, !FPU_LT) +SSE_HELPER_CMP(cmpnleq, FPU_CMPQ, !FPU_LE) +SSE_HELPER_CMP(cmpords, FPU_CMPS, !FPU_UNORD) + +SSE_HELPER_CMP(cmpequs, FPU_CMPS, FPU_EQU) +SSE_HELPER_CMP(cmpngeq, FPU_CMPQ, !FPU_GE) +SSE_HELPER_CMP(cmpngtq, FPU_CMPQ, !FPU_GT) +SSE_HELPER_CMP(cmpfalses, FPU_CMPS, FPU_FALSE) +SSE_HELPER_CMP(cmpnequs, FPU_CMPS, !FPU_EQU) +SSE_HELPER_CMP(cmpgeq, FPU_CMPQ, FPU_GE) +SSE_HELPER_CMP(cmpgtq, FPU_CMPQ, FPU_GT) +SSE_HELPER_CMP(cmptrues, FPU_CMPS, !FPU_FALSE) + +#if SHIFT =3D=3D 1 static const int comis_eflags[4] =3D {CC_C, CC_Z, 0, CC_Z | CC_P | CC_C}; =20 void helper_ucomiss(CPUX86State *env, Reg *d, Reg *s) @@ -1080,25 +1492,38 @@ void helper_comisd(CPUX86State *env, Reg *d, Reg *s) ret =3D float64_compare(d0, d1, &env->sse_status); CC_SRC =3D comis_eflags[ret + 1]; } +#endif =20 -uint32_t helper_movmskps(CPUX86State *env, Reg *s) +uint32_t glue(helper_movmskps, SUFFIX)(CPUX86State *env, Reg *s) { - int b0, b1, b2, b3; + uint32_t mask; =20 
- b0 =3D s->ZMM_L(0) >> 31; - b1 =3D s->ZMM_L(1) >> 31; - b2 =3D s->ZMM_L(2) >> 31; - b3 =3D s->ZMM_L(3) >> 31; - return b0 | (b1 << 1) | (b2 << 2) | (b3 << 3); + mask =3D 0; + mask |=3D (s->ZMM_L(0) >> (31 - 0)) & (1 << 0); + mask |=3D (s->ZMM_L(1) >> (31 - 1)) & (1 << 1); + mask |=3D (s->ZMM_L(2) >> (31 - 2)) & (1 << 2); + mask |=3D (s->ZMM_L(3) >> (31 - 3)) & (1 << 3); +#if SHIFT =3D=3D 2 + mask |=3D (s->ZMM_L(4) >> (31 - 4)) & (1 << 4); + mask |=3D (s->ZMM_L(5) >> (31 - 5)) & (1 << 5); + mask |=3D (s->ZMM_L(6) >> (31 - 6)) & (1 << 6); + mask |=3D (s->ZMM_L(7) >> (31 - 7)) & (1 << 7); +#endif + return mask; } =20 -uint32_t helper_movmskpd(CPUX86State *env, Reg *s) +uint32_t glue(helper_movmskpd, SUFFIX)(CPUX86State *env, Reg *s) { - int b0, b1; + uint32_t mask; =20 - b0 =3D s->ZMM_L(1) >> 31; - b1 =3D s->ZMM_L(3) >> 31; - return b0 | (b1 << 1); + mask =3D 0; + mask |=3D (s->ZMM_L(1) >> (31 - 0)) & (1 << 0); + mask |=3D (s->ZMM_L(3) >> (31 - 1)) & (1 << 1); +#if SHIFT =3D=3D 2 + mask |=3D (s->ZMM_L(5) >> (31 - 2)) & (1 << 2); + mask |=3D (s->ZMM_L(7) >> (31 - 3)) & (1 << 3); +#endif + return mask; } =20 #endif @@ -1116,7 +1541,7 @@ uint32_t glue(helper_pmovmskb, SUFFIX)(CPUX86State *e= nv, Reg *s) val |=3D (s->B(5) >> 2) & 0x20; val |=3D (s->B(6) >> 1) & 0x40; val |=3D (s->B(7)) & 0x80; -#if SHIFT =3D=3D 1 +#if SHIFT >=3D 1 val |=3D (s->B(8) << 1) & 0x0100; val |=3D (s->B(9) << 2) & 0x0200; val |=3D (s->B(10) << 3) & 0x0400; @@ -1125,160 +1550,243 @@ uint32_t glue(helper_pmovmskb, SUFFIX)(CPUX86Stat= e *env, Reg *s) val |=3D (s->B(13) << 6) & 0x2000; val |=3D (s->B(14) << 7) & 0x4000; val |=3D (s->B(15) << 8) & 0x8000; +#if SHIFT =3D=3D 2 + val |=3D ((uint32_t)s->B(16) << 9) & 0x00010000; + val |=3D ((uint32_t)s->B(17) << 10) & 0x00020000; + val |=3D ((uint32_t)s->B(18) << 11) & 0x00040000; + val |=3D ((uint32_t)s->B(19) << 12) & 0x00080000; + val |=3D ((uint32_t)s->B(20) << 13) & 0x00100000; + val |=3D ((uint32_t)s->B(21) << 14) & 0x00200000; + val |=3D 
((uint32_t)s->B(22) << 15) & 0x00400000; + val |=3D ((uint32_t)s->B(23) << 16) & 0x00800000; + val |=3D ((uint32_t)s->B(24) << 17) & 0x01000000; + val |=3D ((uint32_t)s->B(25) << 18) & 0x02000000; + val |=3D ((uint32_t)s->B(26) << 19) & 0x04000000; + val |=3D ((uint32_t)s->B(27) << 20) & 0x08000000; + val |=3D ((uint32_t)s->B(28) << 21) & 0x10000000; + val |=3D ((uint32_t)s->B(29) << 22) & 0x20000000; + val |=3D ((uint32_t)s->B(30) << 23) & 0x40000000; + val |=3D ((uint32_t)s->B(31) << 24) & 0x80000000; +#endif #endif return val; } =20 -void glue(helper_packsswb, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) -{ - Reg r; - - r.B(0) =3D satsb((int16_t)d->W(0)); - r.B(1) =3D satsb((int16_t)d->W(1)); - r.B(2) =3D satsb((int16_t)d->W(2)); - r.B(3) =3D satsb((int16_t)d->W(3)); -#if SHIFT =3D=3D 1 - r.B(4) =3D satsb((int16_t)d->W(4)); - r.B(5) =3D satsb((int16_t)d->W(5)); - r.B(6) =3D satsb((int16_t)d->W(6)); - r.B(7) =3D satsb((int16_t)d->W(7)); -#endif - r.B((4 << SHIFT) + 0) =3D satsb((int16_t)s->W(0)); - r.B((4 << SHIFT) + 1) =3D satsb((int16_t)s->W(1)); - r.B((4 << SHIFT) + 2) =3D satsb((int16_t)s->W(2)); - r.B((4 << SHIFT) + 3) =3D satsb((int16_t)s->W(3)); -#if SHIFT =3D=3D 1 - r.B(12) =3D satsb((int16_t)s->W(4)); - r.B(13) =3D satsb((int16_t)s->W(5)); - r.B(14) =3D satsb((int16_t)s->W(6)); - r.B(15) =3D satsb((int16_t)s->W(7)); +#if SHIFT =3D=3D 0 +#define PACK_WIDTH 4 +#else +#define PACK_WIDTH 8 #endif - *d =3D r; -} - -void glue(helper_packuswb, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) -{ - Reg r; =20 - r.B(0) =3D satub((int16_t)d->W(0)); - r.B(1) =3D satub((int16_t)d->W(1)); - r.B(2) =3D satub((int16_t)d->W(2)); - r.B(3) =3D satub((int16_t)d->W(3)); -#if SHIFT =3D=3D 1 - r.B(4) =3D satub((int16_t)d->W(4)); - r.B(5) =3D satub((int16_t)d->W(5)); - r.B(6) =3D satub((int16_t)d->W(6)); - r.B(7) =3D satub((int16_t)d->W(7)); -#endif - r.B((4 << SHIFT) + 0) =3D satub((int16_t)s->W(0)); - r.B((4 << SHIFT) + 1) =3D satub((int16_t)s->W(1)); - r.B((4 << SHIFT) + 2) =3D 
satub((int16_t)s->W(2)); - r.B((4 << SHIFT) + 3) =3D satub((int16_t)s->W(3)); -#if SHIFT =3D=3D 1 - r.B(12) =3D satub((int16_t)s->W(4)); - r.B(13) =3D satub((int16_t)s->W(5)); - r.B(14) =3D satub((int16_t)s->W(6)); - r.B(15) =3D satub((int16_t)s->W(7)); -#endif - *d =3D r; +#define PACK4(F, to, reg, from) do { \ + r[to + 0] =3D F((int16_t)reg->W(from + 0)); \ + r[to + 1] =3D F((int16_t)reg->W(from + 1)); \ + r[to + 2] =3D F((int16_t)reg->W(from + 2)); \ + r[to + 3] =3D F((int16_t)reg->W(from + 3)); \ + } while (0) + +#define PACK_HELPER_B(name, F) \ +void glue(helper_pack ## name, SUFFIX)(CPUX86State *env, \ + Reg *d, Reg *v, Reg *s) \ +{ \ + uint8_t r[PACK_WIDTH * 2]; \ + int i; \ + PACK4(F, 0, v, 0); \ + PACK4(F, PACK_WIDTH, s, 0); \ + XMM_ONLY( \ + PACK4(F, 4, v, 4); \ + PACK4(F, 12, s, 4); \ + ) \ + for (i =3D 0; i < PACK_WIDTH * 2; i++) { \ + d->B(i) =3D r[i]; \ + } \ + YMM_ONLY( \ + PACK4(F, 0, v, 8); \ + PACK4(F, 4, v, 12); \ + PACK4(F, 8, s, 8); \ + PACK4(F, 12, s, 12); \ + for (i =3D 0; i < 16; i++) { \ + d->B(i + 16) =3D r[i]; \ + } \ + ) \ } =20 -void glue(helper_packssdw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +PACK_HELPER_B(sswb, satsb) +PACK_HELPER_B(uswb, satub) + +void glue(helper_packssdw, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *= s) { - Reg r; + uint16_t r[PACK_WIDTH]; + int i; =20 - r.W(0) =3D satsw(d->L(0)); - r.W(1) =3D satsw(d->L(1)); -#if SHIFT =3D=3D 1 - r.W(2) =3D satsw(d->L(2)); - r.W(3) =3D satsw(d->L(3)); + r[0] =3D satsw(v->L(0)); + r[1] =3D satsw(v->L(1)); + r[PACK_WIDTH / 2 + 0] =3D satsw(s->L(0)); + r[PACK_WIDTH / 2 + 1] =3D satsw(s->L(1)); +#if SHIFT >=3D 1 + r[2] =3D satsw(v->L(2)); + r[3] =3D satsw(v->L(3)); + r[6] =3D satsw(s->L(2)); + r[7] =3D satsw(s->L(3)); #endif - r.W((2 << SHIFT) + 0) =3D satsw(s->L(0)); - r.W((2 << SHIFT) + 1) =3D satsw(s->L(1)); -#if SHIFT =3D=3D 1 - r.W(6) =3D satsw(s->L(2)); - r.W(7) =3D satsw(s->L(3)); + for (i =3D 0; i < PACK_WIDTH; i++) { + d->W(i) =3D r[i]; + } +#if SHIFT =3D=3D 2 + r[0] 
=3D satsw(v->L(4)); + r[1] =3D satsw(v->L(5)); + r[2] =3D satsw(v->L(6)); + r[3] =3D satsw(v->L(7)); + r[4] =3D satsw(s->L(4)); + r[5] =3D satsw(s->L(5)); + r[6] =3D satsw(s->L(6)); + r[7] =3D satsw(s->L(7)); + for (i =3D 0; i < 8; i++) { + d->W(i + 8) =3D r[i]; + } #endif - *d =3D r; } =20 #define UNPCK_OP(base_name, base) \ \ void glue(helper_punpck ## base_name ## bw, SUFFIX)(CPUX86State *env,\ - Reg *d, Reg *s) \ + Reg *d, Reg *v, Reg *s) \ { \ - Reg r; \ + uint8_t r[PACK_WIDTH * 2]; \ + int i; \ \ - r.B(0) =3D d->B((base << (SHIFT + 2)) + 0); \ - r.B(1) =3D s->B((base << (SHIFT + 2)) + 0); \ - r.B(2) =3D d->B((base << (SHIFT + 2)) + 1); \ - r.B(3) =3D s->B((base << (SHIFT + 2)) + 1); \ - r.B(4) =3D d->B((base << (SHIFT + 2)) + 2); \ - r.B(5) =3D s->B((base << (SHIFT + 2)) + 2); \ - r.B(6) =3D d->B((base << (SHIFT + 2)) + 3); \ - r.B(7) =3D s->B((base << (SHIFT + 2)) + 3); \ + r[0] =3D v->B((base * PACK_WIDTH) + 0); \ + r[1] =3D s->B((base * PACK_WIDTH) + 0); \ + r[2] =3D v->B((base * PACK_WIDTH) + 1); \ + r[3] =3D s->B((base * PACK_WIDTH) + 1); \ + r[4] =3D v->B((base * PACK_WIDTH) + 2); \ + r[5] =3D s->B((base * PACK_WIDTH) + 2); \ + r[6] =3D v->B((base * PACK_WIDTH) + 3); \ + r[7] =3D s->B((base * PACK_WIDTH) + 3); \ XMM_ONLY( \ - r.B(8) =3D d->B((base << (SHIFT + 2)) + 4); \ - r.B(9) =3D s->B((base << (SHIFT + 2)) + 4); \ - r.B(10) =3D d->B((base << (SHIFT + 2)) + 5); \ - r.B(11) =3D s->B((base << (SHIFT + 2)) + 5); \ - r.B(12) =3D d->B((base << (SHIFT + 2)) + 6); \ - r.B(13) =3D s->B((base << (SHIFT + 2)) + 6); \ - r.B(14) =3D d->B((base << (SHIFT + 2)) + 7); \ - r.B(15) =3D s->B((base << (SHIFT + 2)) + 7); \ + r[8] =3D v->B((base * PACK_WIDTH) + 4); \ + r[9] =3D s->B((base * PACK_WIDTH) + 4); \ + r[10] =3D v->B((base * PACK_WIDTH) + 5); \ + r[11] =3D s->B((base * PACK_WIDTH) + 5); \ + r[12] =3D v->B((base * PACK_WIDTH) + 6); \ + r[13] =3D s->B((base * PACK_WIDTH) + 6); \ + r[14] =3D v->B((base * PACK_WIDTH) + 7); \ + r[15] =3D s->B((base * PACK_WIDTH) + 
7); \ + ) \ + for (i =3D 0; i < PACK_WIDTH * 2; i++) { \ + d->B(i) =3D r[i]; \ + } \ + YMM_ONLY( \ + r[0] =3D v->B((base * 8) + 16); \ + r[1] =3D s->B((base * 8) + 16); \ + r[2] =3D v->B((base * 8) + 17); \ + r[3] =3D s->B((base * 8) + 17); \ + r[4] =3D v->B((base * 8) + 18); \ + r[5] =3D s->B((base * 8) + 18); \ + r[6] =3D v->B((base * 8) + 19); \ + r[7] =3D s->B((base * 8) + 19); \ + r[8] =3D v->B((base * 8) + 20); \ + r[9] =3D s->B((base * 8) + 20); \ + r[10] =3D v->B((base * 8) + 21); \ + r[11] =3D s->B((base * 8) + 21); \ + r[12] =3D v->B((base * 8) + 22); \ + r[13] =3D s->B((base * 8) + 22); \ + r[14] =3D v->B((base * 8) + 23); \ + r[15] =3D s->B((base * 8) + 23); \ + for (i =3D 0; i < PACK_WIDTH * 2; i++) { \ + d->B(16 + i) =3D r[i]; \ + } \ ) \ - *d =3D r; \ } \ \ void glue(helper_punpck ## base_name ## wd, SUFFIX)(CPUX86State *env,\ - Reg *d, Reg *s) \ + Reg *d, Reg *v, Reg *s) \ { \ - Reg r; \ + uint16_t r[PACK_WIDTH]; \ + int i; \ \ - r.W(0) =3D d->W((base << (SHIFT + 1)) + 0); \ - r.W(1) =3D s->W((base << (SHIFT + 1)) + 0); \ - r.W(2) =3D d->W((base << (SHIFT + 1)) + 1); \ - r.W(3) =3D s->W((base << (SHIFT + 1)) + 1); \ + r[0] =3D v->W((base * (PACK_WIDTH / 2)) + 0); \ + r[1] =3D s->W((base * (PACK_WIDTH / 2)) + 0); \ + r[2] =3D v->W((base * (PACK_WIDTH / 2)) + 1); \ + r[3] =3D s->W((base * (PACK_WIDTH / 2)) + 1); \ XMM_ONLY( \ - r.W(4) =3D d->W((base << (SHIFT + 1)) + 2); \ - r.W(5) =3D s->W((base << (SHIFT + 1)) + 2); \ - r.W(6) =3D d->W((base << (SHIFT + 1)) + 3); \ - r.W(7) =3D s->W((base << (SHIFT + 1)) + 3); \ + r[4] =3D v->W((base * 4) + 2); \ + r[5] =3D s->W((base * 4) + 2); \ + r[6] =3D v->W((base * 4) + 3); \ + r[7] =3D s->W((base * 4) + 3); \ + ) \ + for (i =3D 0; i < PACK_WIDTH; i++) { \ + d->W(i) =3D r[i]; \ + } \ + YMM_ONLY( \ + r[0] =3D v->W((base * 4) + 8); \ + r[1] =3D s->W((base * 4) + 8); \ + r[2] =3D v->W((base * 4) + 9); \ + r[3] =3D s->W((base * 4) + 9); \ + r[4] =3D v->W((base * 4) + 10); \ + r[5] =3D s->W((base * 4) + 10); \ + 
r[6] =3D v->W((base * 4) + 11); \ + r[7] =3D s->W((base * 4) + 11); \ + for (i =3D 0; i < PACK_WIDTH; i++) { \ + d->W(i + 8) =3D r[i]; \ + } \ ) \ - *d =3D r; \ } \ \ void glue(helper_punpck ## base_name ## dq, SUFFIX)(CPUX86State *env,\ - Reg *d, Reg *s) \ + Reg *d, Reg *v, Reg *s) \ { \ - Reg r; \ + uint32_t r[4]; \ \ - r.L(0) =3D d->L((base << SHIFT) + 0); \ - r.L(1) =3D s->L((base << SHIFT) + 0); \ + r[0] =3D v->L((base * (PACK_WIDTH / 4)) + 0); \ + r[1] =3D s->L((base * (PACK_WIDTH / 4)) + 0); \ XMM_ONLY( \ - r.L(2) =3D d->L((base << SHIFT) + 1); \ - r.L(3) =3D s->L((base << SHIFT) + 1); \ + r[2] =3D v->L((base * 2) + 1); \ + r[3] =3D s->L((base * 2) + 1); \ + d->L(2) =3D r[2]; \ + d->L(3) =3D r[3]; \ + ) \ + d->L(0) =3D r[0]; \ + d->L(1) =3D r[1]; \ + YMM_ONLY( \ + r[0] =3D v->L((base * 2) + 4); \ + r[1] =3D s->L((base * 2) + 4); \ + r[2] =3D v->L((base * 2) + 5); \ + r[3] =3D s->L((base * 2) + 5); \ + d->L(4) =3D r[0]; \ + d->L(5) =3D r[1]; \ + d->L(6) =3D r[2]; \ + d->L(7) =3D r[3]; \ ) \ - *d =3D r; \ } \ \ XMM_ONLY( \ - void glue(helper_punpck ## base_name ## qdq, SUFFIX)(CPUX86St= ate \ - *env, \ - Reg *d, \ - Reg *s) \ + void glue(helper_punpck ## base_name ## qdq, SUFFIX)( \ + CPUX86State *env, Reg *d, Reg *v, Reg *s) \ { \ - Reg r; \ + uint64_t r[2]; \ \ - r.Q(0) =3D d->Q(base); \ - r.Q(1) =3D s->Q(base); \ - *d =3D r; \ + r[0] =3D v->Q(base); \ + r[1] =3D s->Q(base); \ + d->Q(0) =3D r[0]; \ + d->Q(1) =3D r[1]; \ + YMM_ONLY( \ + r[0] =3D v->Q(base + 2); \ + r[1] =3D s->Q(base + 2); \ + d->Q(2) =3D r[0]; \ + d->Q(3) =3D r[1]; \ + ) \ } \ ) =20 UNPCK_OP(l, 0) UNPCK_OP(h, 1) =20 +#undef PACK_WIDTH +#undef PACK_HELPER_B +#undef PACK4 + + /* 3DNow! 
float ops */ #if SHIFT =3D=3D 0 void helper_pi2fd(CPUX86State *env, MMXReg *d, MMXReg *s) @@ -1429,123 +1937,176 @@ void helper_pswapd(CPUX86State *env, MMXReg *d, M= MXReg *s) #endif =20 /* SSSE3 op helpers */ -void glue(helper_pshufb, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +void glue(helper_pshufb, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) { int i; - Reg r; +#if SHIFT =3D=3D 0 + uint8_t r[8]; =20 - for (i =3D 0; i < (8 << SHIFT); i++) { - r.B(i) =3D (s->B(i) & 0x80) ? 0 : (d->B(s->B(i) & ((8 << SHIFT) - = 1))); + for (i =3D 0; i < 8; i++) { + r[i] =3D (s->B(i) & 0x80) ? 0 : (v->B(s->B(i) & 7)); } + for (i =3D 0; i < 8; i++) { + d->B(i) =3D r[i]; + } +#else + uint8_t r[16]; =20 - *d =3D r; + for (i =3D 0; i < 16; i++) { + r[i] =3D (s->B(i) & 0x80) ? 0 : (v->B(s->B(i) & 0xf)); + } + for (i =3D 0; i < 16; i++) { + d->B(i) =3D r[i]; + } +#if SHIFT =3D=3D 2 + for (i =3D 0; i < 16; i++) { + r[i] =3D (s->B(i + 16) & 0x80) ? 0 : (v->B((s->B(i + 16) & 0xf) + = 16)); + } + for (i =3D 0; i < 16; i++) { + d->B(i + 16) =3D r[i]; + } +#endif +#endif } =20 -void glue(helper_phaddw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) -{ +#if SHIFT =3D=3D 0 =20 - Reg r; +#define SSE_HELPER_HW(name, F) \ +void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *= s) \ +{ \ + uint16_t r[4]; \ + r[0] =3D F(v->W(0), v->W(1)); \ + r[1] =3D F(v->W(2), v->W(3)); \ + r[2] =3D F(s->W(0), s->W(1)); \ + r[3] =3D F(s->W(3), s->W(3)); \ + d->W(0) =3D r[0]; \ + d->W(1) =3D r[1]; \ + d->W(2) =3D r[2]; \ + d->W(3) =3D r[3]; \ +} + +#define SSE_HELPER_HL(name, F) \ +void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *= s) \ +{ \ + uint32_t r0, r1; \ + r0 =3D F(v->L(0), v->L(1)); \ + r1 =3D F(s->L(0), s->L(1)); \ + d->W(0) =3D r0; \ + d->W(1) =3D r1; \ +} =20 - r.W(0) =3D (int16_t)d->W(0) + (int16_t)d->W(1); - r.W(1) =3D (int16_t)d->W(2) + (int16_t)d->W(3); - XMM_ONLY(r.W(2) =3D (int16_t)d->W(4) + (int16_t)d->W(5)); - XMM_ONLY(r.W(3) =3D (int16_t)d->W(6) + 
(int16_t)d->W(7)); - r.W((2 << SHIFT) + 0) =3D (int16_t)s->W(0) + (int16_t)s->W(1); - r.W((2 << SHIFT) + 1) =3D (int16_t)s->W(2) + (int16_t)s->W(3); - XMM_ONLY(r.W(6) =3D (int16_t)s->W(4) + (int16_t)s->W(5)); - XMM_ONLY(r.W(7) =3D (int16_t)s->W(6) + (int16_t)s->W(7)); +#else =20 - *d =3D r; +#define SSE_HELPER_HW(name, F) \ +void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *= s) \ +{ \ + int32_t r[8]; \ + r[0] =3D F(v->W(0), v->W(1)); \ + r[1] =3D F(v->W(2), v->W(3)); \ + r[2] =3D F(v->W(4), v->W(5)); \ + r[3] =3D F(v->W(6), v->W(7)); \ + r[4] =3D F(s->W(0), s->W(1)); \ + r[5] =3D F(s->W(2), s->W(3)); \ + r[6] =3D F(s->W(4), s->W(5)); \ + r[7] =3D F(s->W(6), s->W(7)); \ + d->W(0) =3D r[0]; \ + d->W(1) =3D r[1]; \ + d->W(2) =3D r[2]; \ + d->W(3) =3D r[3]; \ + d->W(4) =3D r[4]; \ + d->W(5) =3D r[5]; \ + d->W(6) =3D r[6]; \ + d->W(7) =3D r[7]; \ + YMM_ONLY( \ + r[0] =3D F(v->W(8), v->W(9)); \ + r[1] =3D F(v->W(10), v->W(11)); \ + r[2] =3D F(v->W(12), v->W(13)); \ + r[3] =3D F(v->W(14), v->W(15)); \ + r[4] =3D F(s->W(8), s->W(9)); \ + r[5] =3D F(s->W(10), s->W(11)); \ + r[6] =3D F(s->W(12), s->W(13)); \ + r[7] =3D F(s->W(14), s->W(15)); \ + d->W(8) =3D r[0]; \ + d->W(9) =3D r[1]; \ + d->W(10) =3D r[2]; \ + d->W(11) =3D r[3]; \ + d->W(12) =3D r[4]; \ + d->W(13) =3D r[5]; \ + d->W(14) =3D r[6]; \ + d->W(15) =3D r[7]; \ + ) \ +} + +#define SSE_HELPER_HL(name, F) \ +void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *= s) \ +{ \ + int32_t r0, r1, r2, r3; \ + r0 =3D F(v->L(0), v->L(1)); \ + r1 =3D F(v->L(2), v->L(3)); \ + r2 =3D F(s->L(0), s->L(1)); \ + r3 =3D F(s->L(2), s->L(3)); \ + d->L(0) =3D r0; \ + d->L(1) =3D r1; \ + d->L(2) =3D r2; \ + d->L(3) =3D r3; \ + YMM_ONLY( \ + r0 =3D F(v->L(4), v->L(5)); \ + r1 =3D F(v->L(6), v->L(7)); \ + r2 =3D F(s->L(4), s->L(5)); \ + r3 =3D F(s->L(6), s->L(7)); \ + d->L(4) =3D r0; \ + d->L(5) =3D r1; \ + d->L(6) =3D r2; \ + d->L(7) =3D r3; \ + ) \ } - -void glue(helper_phaddd, 
SUFFIX)(CPUX86State *env, Reg *d, Reg *s) -{ - Reg r; - - r.L(0) =3D (int32_t)d->L(0) + (int32_t)d->L(1); - XMM_ONLY(r.L(1) =3D (int32_t)d->L(2) + (int32_t)d->L(3)); - r.L((1 << SHIFT) + 0) =3D (int32_t)s->L(0) + (int32_t)s->L(1); - XMM_ONLY(r.L(3) =3D (int32_t)s->L(2) + (int32_t)s->L(3)); - - *d =3D r; -} - -void glue(helper_phaddsw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) -{ - Reg r; - - r.W(0) =3D satsw((int16_t)d->W(0) + (int16_t)d->W(1)); - r.W(1) =3D satsw((int16_t)d->W(2) + (int16_t)d->W(3)); - XMM_ONLY(r.W(2) =3D satsw((int16_t)d->W(4) + (int16_t)d->W(5))); - XMM_ONLY(r.W(3) =3D satsw((int16_t)d->W(6) + (int16_t)d->W(7))); - r.W((2 << SHIFT) + 0) =3D satsw((int16_t)s->W(0) + (int16_t)s->W(1)); - r.W((2 << SHIFT) + 1) =3D satsw((int16_t)s->W(2) + (int16_t)s->W(3)); - XMM_ONLY(r.W(6) =3D satsw((int16_t)s->W(4) + (int16_t)s->W(5))); - XMM_ONLY(r.W(7) =3D satsw((int16_t)s->W(6) + (int16_t)s->W(7))); - - *d =3D r; -} - -void glue(helper_pmaddubsw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) -{ - d->W(0) =3D satsw((int8_t)s->B(0) * (uint8_t)d->B(0) + - (int8_t)s->B(1) * (uint8_t)d->B(1)); - d->W(1) =3D satsw((int8_t)s->B(2) * (uint8_t)d->B(2) + - (int8_t)s->B(3) * (uint8_t)d->B(3)); - d->W(2) =3D satsw((int8_t)s->B(4) * (uint8_t)d->B(4) + - (int8_t)s->B(5) * (uint8_t)d->B(5)); - d->W(3) =3D satsw((int8_t)s->B(6) * (uint8_t)d->B(6) + - (int8_t)s->B(7) * (uint8_t)d->B(7)); -#if SHIFT =3D=3D 1 - d->W(4) =3D satsw((int8_t)s->B(8) * (uint8_t)d->B(8) + - (int8_t)s->B(9) * (uint8_t)d->B(9)); - d->W(5) =3D satsw((int8_t)s->B(10) * (uint8_t)d->B(10) + - (int8_t)s->B(11) * (uint8_t)d->B(11)); - d->W(6) =3D satsw((int8_t)s->B(12) * (uint8_t)d->B(12) + - (int8_t)s->B(13) * (uint8_t)d->B(13)); - d->W(7) =3D satsw((int8_t)s->B(14) * (uint8_t)d->B(14) + - (int8_t)s->B(15) * (uint8_t)d->B(15)); #endif -} =20 -void glue(helper_phsubw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) -{ - d->W(0) =3D (int16_t)d->W(0) - (int16_t)d->W(1); - d->W(1) =3D (int16_t)d->W(2) - (int16_t)d->W(3); 
- XMM_ONLY(d->W(2) =3D (int16_t)d->W(4) - (int16_t)d->W(5)); - XMM_ONLY(d->W(3) =3D (int16_t)d->W(6) - (int16_t)d->W(7)); - d->W((2 << SHIFT) + 0) =3D (int16_t)s->W(0) - (int16_t)s->W(1); - d->W((2 << SHIFT) + 1) =3D (int16_t)s->W(2) - (int16_t)s->W(3); - XMM_ONLY(d->W(6) =3D (int16_t)s->W(4) - (int16_t)s->W(5)); - XMM_ONLY(d->W(7) =3D (int16_t)s->W(6) - (int16_t)s->W(7)); -} - -void glue(helper_phsubd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) -{ - d->L(0) =3D (int32_t)d->L(0) - (int32_t)d->L(1); - XMM_ONLY(d->L(1) =3D (int32_t)d->L(2) - (int32_t)d->L(3)); - d->L((1 << SHIFT) + 0) =3D (int32_t)s->L(0) - (int32_t)s->L(1); - XMM_ONLY(d->L(3) =3D (int32_t)s->L(2) - (int32_t)s->L(3)); -} - -void glue(helper_phsubsw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) -{ - d->W(0) =3D satsw((int16_t)d->W(0) - (int16_t)d->W(1)); - d->W(1) =3D satsw((int16_t)d->W(2) - (int16_t)d->W(3)); - XMM_ONLY(d->W(2) =3D satsw((int16_t)d->W(4) - (int16_t)d->W(5))); - XMM_ONLY(d->W(3) =3D satsw((int16_t)d->W(6) - (int16_t)d->W(7))); - d->W((2 << SHIFT) + 0) =3D satsw((int16_t)s->W(0) - (int16_t)s->W(1)); - d->W((2 << SHIFT) + 1) =3D satsw((int16_t)s->W(2) - (int16_t)s->W(3)); - XMM_ONLY(d->W(6) =3D satsw((int16_t)s->W(4) - (int16_t)s->W(5))); - XMM_ONLY(d->W(7) =3D satsw((int16_t)s->W(6) - (int16_t)s->W(7))); +SSE_HELPER_HW(phaddw, FADD) +SSE_HELPER_HW(phsubw, FSUB) +SSE_HELPER_HW(phaddsw, FADDSW) +SSE_HELPER_HW(phsubsw, FSUBSW) +SSE_HELPER_HL(phaddd, FADD) +SSE_HELPER_HL(phsubd, FSUB) + +#undef SSE_HELPER_HW +#undef SSE_HELPER_HL + +void glue(helper_pmaddubsw, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg = *s) +{ + d->W(0) =3D satsw((int8_t)s->B(0) * (uint8_t)v->B(0) + + (int8_t)s->B(1) * (uint8_t)v->B(1)); + d->W(1) =3D satsw((int8_t)s->B(2) * (uint8_t)v->B(2) + + (int8_t)s->B(3) * (uint8_t)v->B(3)); + d->W(2) =3D satsw((int8_t)s->B(4) * (uint8_t)v->B(4) + + (int8_t)s->B(5) * (uint8_t)v->B(5)); + d->W(3) =3D satsw((int8_t)s->B(6) * (uint8_t)v->B(6) + + (int8_t)s->B(7) * (uint8_t)v->B(7)); 
+#if SHIFT >=3D 1 + d->W(4) =3D satsw((int8_t)s->B(8) * (uint8_t)v->B(8) + + (int8_t)s->B(9) * (uint8_t)v->B(9)); + d->W(5) =3D satsw((int8_t)s->B(10) * (uint8_t)v->B(10) + + (int8_t)s->B(11) * (uint8_t)v->B(11)); + d->W(6) =3D satsw((int8_t)s->B(12) * (uint8_t)v->B(12) + + (int8_t)s->B(13) * (uint8_t)v->B(13)); + d->W(7) =3D satsw((int8_t)s->B(14) * (uint8_t)v->B(14) + + (int8_t)s->B(15) * (uint8_t)v->B(15)); +#if SHIFT =3D=3D 2 + int i; + for (i =3D 8; i < 16; i++) { + d->W(i) =3D satsw((int8_t)s->B(i * 2) * (uint8_t)v->B(i * 2) + + (int8_t)s->B(i * 2 + 1) * (uint8_t)v->B(i * 2 + 1)= ); + } +#endif +#endif } =20 -#define FABSB(_, x) (x > INT8_MAX ? -(int8_t)x : x) -#define FABSW(_, x) (x > INT16_MAX ? -(int16_t)x : x) -#define FABSL(_, x) (x > INT32_MAX ? -(int32_t)x : x) -SSE_HELPER_B(helper_pabsb, FABSB) -SSE_HELPER_W(helper_pabsw, FABSW) -SSE_HELPER_L(helper_pabsd, FABSL) +#define FABSB(x) (x > INT8_MAX ? -(int8_t)x : x) +#define FABSW(x) (x > INT16_MAX ? -(int16_t)x : x) +#define FABSL(x) (x > INT32_MAX ? -(int32_t)x : x) +SSE_HELPER_1(helper_pabsb, B, 8, FABSB) +SSE_HELPER_1(helper_pabsw, W, 4, FABSW) +SSE_HELPER_1(helper_pabsd, L, 2, FABSL) =20 #define FMULHRSW(d, s) (((int16_t) d * (int16_t)s + 0x4000) >> 15) SSE_HELPER_W(helper_pmulhrsw, FMULHRSW) @@ -1557,104 +2118,119 @@ SSE_HELPER_B(helper_psignb, FSIGNB) SSE_HELPER_W(helper_psignw, FSIGNW) SSE_HELPER_L(helper_psignd, FSIGNL) =20 -void glue(helper_palignr, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, +void glue(helper_palignr, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s, int32_t shift) { - Reg r; - /* XXX could be checked during translation */ - if (shift >=3D (16 << SHIFT)) { - r.Q(0) =3D 0; - XMM_ONLY(r.Q(1) =3D 0); + if (shift >=3D (SHIFT ? 32 : 16)) { + d->Q(0) =3D 0; + XMM_ONLY(d->Q(1) =3D 0); +#if SHIFT =3D=3D 2 + d->Q(2) =3D 0; + d->Q(3) =3D 0; +#endif } else { shift <<=3D 3; #define SHR(v, i) (i < 64 && i > -64 ? i > 0 ? 
v >> (i) : (v << -(i)) : 0) #if SHIFT =3D=3D 0 - r.Q(0) =3D SHR(s->Q(0), shift - 0) | - SHR(d->Q(0), shift - 64); + d->Q(0) =3D SHR(s->Q(0), shift - 0) | + SHR(v->Q(0), shift - 64); #else - r.Q(0) =3D SHR(s->Q(0), shift - 0) | - SHR(s->Q(1), shift - 64) | - SHR(d->Q(0), shift - 128) | - SHR(d->Q(1), shift - 192); - r.Q(1) =3D SHR(s->Q(0), shift + 64) | - SHR(s->Q(1), shift - 0) | - SHR(d->Q(0), shift - 64) | - SHR(d->Q(1), shift - 128); + uint64_t r0, r1; + + r0 =3D SHR(s->Q(0), shift - 0) | + SHR(s->Q(1), shift - 64) | + SHR(v->Q(0), shift - 128) | + SHR(v->Q(1), shift - 192); + r1 =3D SHR(s->Q(0), shift + 64) | + SHR(s->Q(1), shift - 0) | + SHR(v->Q(0), shift - 64) | + SHR(v->Q(1), shift - 128); + d->Q(0) =3D r0; + d->Q(1) =3D r1; +#if SHIFT =3D=3D 2 + r0 =3D SHR(s->Q(2), shift - 0) | + SHR(s->Q(3), shift - 64) | + SHR(v->Q(2), shift - 128) | + SHR(v->Q(3), shift - 192); + r1 =3D SHR(s->Q(2), shift + 64) | + SHR(s->Q(3), shift - 0) | + SHR(v->Q(2), shift - 64) | + SHR(v->Q(3), shift - 128); + d->Q(2) =3D r0; + d->Q(3) =3D r1; +#endif #endif #undef SHR } - - *d =3D r; } =20 -#define XMM0 (env->xmm_regs[0]) +#if SHIFT >=3D 1 + +#define BLEND_V128(elem, num, F, b) do { = \ + d->elem(b + 0) =3D F(v->elem(b + 0), s->elem(b + 0), m->elem(b + 0)); = \ + d->elem(b + 1) =3D F(v->elem(b + 1), s->elem(b + 1), m->elem(b + 1)); = \ + if (num > 2) { = \ + d->elem(b + 2) =3D F(v->elem(b + 2), s->elem(b + 2), m->elem(b + 2= )); \ + d->elem(b + 3) =3D F(v->elem(b + 3), s->elem(b + 3), m->elem(b + 3= )); \ + } = \ + if (num > 4) { = \ + d->elem(b + 4) =3D F(v->elem(b + 4), s->elem(b + 4), m->elem(b + 4= )); \ + d->elem(b + 5) =3D F(v->elem(b + 5), s->elem(b + 5), m->elem(b + 5= )); \ + d->elem(b + 6) =3D F(v->elem(b + 6), s->elem(b + 6), m->elem(b + 6= )); \ + d->elem(b + 7) =3D F(v->elem(b + 7), s->elem(b + 7), m->elem(b + 7= )); \ + } = \ + if (num > 8) { = \ + d->elem(b + 8) =3D F(v->elem(b + 8), s->elem(b + 8), m->elem(b + 8= )); \ + d->elem(b + 9) =3D F(v->elem(b + 9), 
s->elem(b + 9), m->elem(b + 9= )); \ + d->elem(b + 10) =3D F(v->elem(b + 10), s->elem(b + 10), m->elem(b = + 10));\ + d->elem(b + 11) =3D F(v->elem(b + 11), s->elem(b + 11), m->elem(b = + 11));\ + d->elem(b + 12) =3D F(v->elem(b + 12), s->elem(b + 12), m->elem(b = + 12));\ + d->elem(b + 13) =3D F(v->elem(b + 13), s->elem(b + 13), m->elem(b = + 13));\ + d->elem(b + 14) =3D F(v->elem(b + 14), s->elem(b + 14), m->elem(b = + 14));\ + d->elem(b + 15) =3D F(v->elem(b + 15), s->elem(b + 15), m->elem(b = + 15));\ + } \ + } while (0) =20 -#if SHIFT =3D=3D 1 #define SSE_HELPER_V(name, elem, num, F) \ - void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \ + void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s, \ + Reg *m) \ { \ - d->elem(0) =3D F(d->elem(0), s->elem(0), XMM0.elem(0)); \ - d->elem(1) =3D F(d->elem(1), s->elem(1), XMM0.elem(1)); \ - if (num > 2) { \ - d->elem(2) =3D F(d->elem(2), s->elem(2), XMM0.elem(2)); \ - d->elem(3) =3D F(d->elem(3), s->elem(3), XMM0.elem(3)); \ - if (num > 4) { \ - d->elem(4) =3D F(d->elem(4), s->elem(4), XMM0.elem(4)); \ - d->elem(5) =3D F(d->elem(5), s->elem(5), XMM0.elem(5)); \ - d->elem(6) =3D F(d->elem(6), s->elem(6), XMM0.elem(6)); \ - d->elem(7) =3D F(d->elem(7), s->elem(7), XMM0.elem(7)); \ - if (num > 8) { \ - d->elem(8) =3D F(d->elem(8), s->elem(8), XMM0.elem(8))= ; \ - d->elem(9) =3D F(d->elem(9), s->elem(9), XMM0.elem(9))= ; \ - d->elem(10) =3D F(d->elem(10), s->elem(10), XMM0.elem(= 10)); \ - d->elem(11) =3D F(d->elem(11), s->elem(11), XMM0.elem(= 11)); \ - d->elem(12) =3D F(d->elem(12), s->elem(12), XMM0.elem(= 12)); \ - d->elem(13) =3D F(d->elem(13), s->elem(13), XMM0.elem(= 13)); \ - d->elem(14) =3D F(d->elem(14), s->elem(14), XMM0.elem(= 14)); \ - d->elem(15) =3D F(d->elem(15), s->elem(15), XMM0.elem(= 15)); \ - } \ - } \ - } \ - } + BLEND_V128(elem, num, F, 0); \ + YMM_ONLY(BLEND_V128(elem, num, F, num);) \ + } + +#define BLEND_I128(elem, num, F, b) do { = \ + d->elem(b + 0) =3D F(v->elem(b + 0), 
s->elem(b + 0), ((imm >> 0) & 1))= ; \ + d->elem(b + 1) =3D F(v->elem(b + 1), s->elem(b + 1), ((imm >> 1) & 1))= ; \ + if (num > 2) { = \ + d->elem(b + 2) =3D F(v->elem(b + 2), s->elem(b + 2), ((imm >> 2) &= 1)); \ + d->elem(b + 3) =3D F(v->elem(b + 3), s->elem(b + 3), ((imm >> 3) &= 1)); \ + } = \ + if (num > 4) { = \ + d->elem(b + 4) =3D F(v->elem(b + 4), s->elem(b + 4), ((imm >> 4) &= 1)); \ + d->elem(b + 5) =3D F(v->elem(b + 5), s->elem(b + 5), ((imm >> 5) &= 1)); \ + d->elem(b + 6) =3D F(v->elem(b + 6), s->elem(b + 6), ((imm >> 6) &= 1)); \ + d->elem(b + 7) =3D F(v->elem(b + 7), s->elem(b + 7), ((imm >> 7) &= 1)); \ + } = \ + } while (0) =20 #define SSE_HELPER_I(name, elem, num, F) \ - void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, uint32_t imm= ) \ + void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s, \ + uint32_t imm) \ { \ - d->elem(0) =3D F(d->elem(0), s->elem(0), ((imm >> 0) & 1)); \ - d->elem(1) =3D F(d->elem(1), s->elem(1), ((imm >> 1) & 1)); \ - if (num > 2) { \ - d->elem(2) =3D F(d->elem(2), s->elem(2), ((imm >> 2) & 1)); \ - d->elem(3) =3D F(d->elem(3), s->elem(3), ((imm >> 3) & 1)); \ - if (num > 4) { \ - d->elem(4) =3D F(d->elem(4), s->elem(4), ((imm >> 4) & 1))= ; \ - d->elem(5) =3D F(d->elem(5), s->elem(5), ((imm >> 5) & 1))= ; \ - d->elem(6) =3D F(d->elem(6), s->elem(6), ((imm >> 6) & 1))= ; \ - d->elem(7) =3D F(d->elem(7), s->elem(7), ((imm >> 7) & 1))= ; \ - if (num > 8) { \ - d->elem(8) =3D F(d->elem(8), s->elem(8), ((imm >> 8) &= 1)); \ - d->elem(9) =3D F(d->elem(9), s->elem(9), ((imm >> 9) &= 1)); \ - d->elem(10) =3D F(d->elem(10), s->elem(10), \ - ((imm >> 10) & 1)); \ - d->elem(11) =3D F(d->elem(11), s->elem(11), \ - ((imm >> 11) & 1)); \ - d->elem(12) =3D F(d->elem(12), s->elem(12), \ - ((imm >> 12) & 1)); \ - d->elem(13) =3D F(d->elem(13), s->elem(13), \ - ((imm >> 13) & 1)); \ - d->elem(14) =3D F(d->elem(14), s->elem(14), \ - ((imm >> 14) & 1)); \ - d->elem(15) =3D F(d->elem(15), s->elem(15), \ - ((imm >> 15) & 
1)); \ - } \ - } \ - } \ + BLEND_I128(elem, num, F, 0); \ + YMM_ONLY( \ + if (num < 8) \ + imm >>=3D num; \ + BLEND_I128(elem, num, F, num); \ + ) \ } =20 /* SSE4.1 op helpers */ -#define FBLENDVB(d, s, m) ((m & 0x80) ? s : d) -#define FBLENDVPS(d, s, m) ((m & 0x80000000) ? s : d) -#define FBLENDVPD(d, s, m) ((m & 0x8000000000000000LL) ? s : d) +#define FBLENDVB(v, s, m) ((m & 0x80) ? s : v) +#define FBLENDVPS(v, s, m) ((m & 0x80000000) ? s : v) +#define FBLENDVPD(v, s, m) ((m & 0x8000000000000000LL) ? s : v) SSE_HELPER_V(helper_pblendvb, B, 16, FBLENDVB) SSE_HELPER_V(helper_blendvps, L, 4, FBLENDVPS) SSE_HELPER_V(helper_blendvpd, Q, 2, FBLENDVPD) @@ -1664,14 +2240,28 @@ void glue(helper_ptest, SUFFIX)(CPUX86State *env, R= eg *d, Reg *s) uint64_t zf =3D (s->Q(0) & d->Q(0)) | (s->Q(1) & d->Q(1)); uint64_t cf =3D (s->Q(0) & ~d->Q(0)) | (s->Q(1) & ~d->Q(1)); =20 +#if SHIFT =3D=3D 2 + zf |=3D (s->Q(2) & d->Q(2)) | (s->Q(3) & d->Q(3)); + cf |=3D (s->Q(2) & ~d->Q(2)) | (s->Q(3) & ~d->Q(3)); +#endif CC_SRC =3D (zf ? 0 : CC_Z) | (cf ? 
0 : CC_C); } =20 #define SSE_HELPER_F(name, elem, num, F) \ void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \ { \ - if (num > 2) { \ - if (num > 4) { \ + if (num * SHIFT > 2) { \ + if (num * SHIFT > 8) { \ + d->elem(15) =3D F(15); \ + d->elem(14) =3D F(14); \ + d->elem(13) =3D F(13); \ + d->elem(12) =3D F(12); \ + d->elem(11) =3D F(11); \ + d->elem(10) =3D F(10); \ + d->elem(9) =3D F(9); \ + d->elem(8) =3D F(8); \ + } \ + if (num * SHIFT > 4) { \ d->elem(7) =3D F(7); \ d->elem(6) =3D F(6); \ d->elem(5) =3D F(5); \ @@ -1697,28 +2287,57 @@ SSE_HELPER_F(helper_pmovzxwd, L, 4, s->W) SSE_HELPER_F(helper_pmovzxwq, Q, 2, s->W) SSE_HELPER_F(helper_pmovzxdq, Q, 2, s->L) =20 -void glue(helper_pmuldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +void glue(helper_pmuldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) { - d->Q(0) =3D (int64_t)(int32_t) d->L(0) * (int32_t) s->L(0); - d->Q(1) =3D (int64_t)(int32_t) d->L(2) * (int32_t) s->L(2); + d->Q(0) =3D (int64_t)(int32_t) v->L(0) * (int32_t) s->L(0); + d->Q(1) =3D (int64_t)(int32_t) v->L(2) * (int32_t) s->L(2); +#if SHIFT =3D=3D 2 + d->Q(2) =3D (int64_t)(int32_t) v->L(4) * (int32_t) s->L(4); + d->Q(3) =3D (int64_t)(int32_t) v->L(6) * (int32_t) s->L(6); +#endif } =20 #define FCMPEQQ(d, s) (d =3D=3D s ? 
-1 : 0) SSE_HELPER_Q(helper_pcmpeqq, FCMPEQQ) =20 -void glue(helper_packusdw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) -{ - Reg r; - - r.W(0) =3D satuw((int32_t) d->L(0)); - r.W(1) =3D satuw((int32_t) d->L(1)); - r.W(2) =3D satuw((int32_t) d->L(2)); - r.W(3) =3D satuw((int32_t) d->L(3)); - r.W(4) =3D satuw((int32_t) s->L(0)); - r.W(5) =3D satuw((int32_t) s->L(1)); - r.W(6) =3D satuw((int32_t) s->L(2)); - r.W(7) =3D satuw((int32_t) s->L(3)); - *d =3D r; +void glue(helper_packusdw, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *= s) +{ + uint16_t r[8]; + + r[0] =3D satuw((int32_t) v->L(0)); + r[1] =3D satuw((int32_t) v->L(1)); + r[2] =3D satuw((int32_t) v->L(2)); + r[3] =3D satuw((int32_t) v->L(3)); + r[4] =3D satuw((int32_t) s->L(0)); + r[5] =3D satuw((int32_t) s->L(1)); + r[6] =3D satuw((int32_t) s->L(2)); + r[7] =3D satuw((int32_t) s->L(3)); + d->W(0) =3D r[0]; + d->W(1) =3D r[1]; + d->W(2) =3D r[2]; + d->W(3) =3D r[3]; + d->W(4) =3D r[4]; + d->W(5) =3D r[5]; + d->W(6) =3D r[6]; + d->W(7) =3D r[7]; +#if SHIFT =3D=3D 2 + r[0] =3D satuw((int32_t) v->L(4)); + r[1] =3D satuw((int32_t) v->L(5)); + r[2] =3D satuw((int32_t) v->L(6)); + r[3] =3D satuw((int32_t) v->L(7)); + r[4] =3D satuw((int32_t) s->L(4)); + r[5] =3D satuw((int32_t) s->L(5)); + r[6] =3D satuw((int32_t) s->L(6)); + r[7] =3D satuw((int32_t) s->L(7)); + d->W(8) =3D r[0]; + d->W(9) =3D r[1]; + d->W(10) =3D r[2]; + d->W(11) =3D r[3]; + d->W(12) =3D r[4]; + d->W(13) =3D r[5]; + d->W(14) =3D r[6]; + d->W(15) =3D r[7]; +#endif } =20 #define FMINSB(d, s) MIN((int8_t)d, (int8_t)s) @@ -1737,6 +2356,7 @@ SSE_HELPER_L(helper_pmaxud, MAX) #define FMULLD(d, s) ((int32_t)d * (int32_t)s) SSE_HELPER_L(helper_pmulld, FMULLD) =20 +#if SHIFT =3D=3D 1 void glue(helper_phminposuw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) { int idx =3D 0; @@ -1768,6 +2388,7 @@ void glue(helper_phminposuw, SUFFIX)(CPUX86State *env= , Reg *d, Reg *s) d->L(1) =3D 0; d->Q(1) =3D 0; } +#endif =20 void glue(helper_roundps, SUFFIX)(CPUX86State *env, 
Reg *d, Reg *s, uint32_t mode) @@ -1797,6 +2418,12 @@ void glue(helper_roundps, SUFFIX)(CPUX86State *env, = Reg *d, Reg *s, d->ZMM_S(1) =3D float32_round_to_int(s->ZMM_S(1), &env->sse_status); d->ZMM_S(2) =3D float32_round_to_int(s->ZMM_S(2), &env->sse_status); d->ZMM_S(3) =3D float32_round_to_int(s->ZMM_S(3), &env->sse_status); +#if SHIFT =3D=3D 2 + d->ZMM_S(4) =3D float32_round_to_int(s->ZMM_S(4), &env->sse_status); + d->ZMM_S(5) =3D float32_round_to_int(s->ZMM_S(5), &env->sse_status); + d->ZMM_S(6) =3D float32_round_to_int(s->ZMM_S(6), &env->sse_status); + d->ZMM_S(7) =3D float32_round_to_int(s->ZMM_S(7), &env->sse_status); +#endif =20 if (mode & (1 << 3) && !(old_flags & float_flag_inexact)) { set_float_exception_flags(get_float_exception_flags(&env->sse_stat= us) & @@ -1832,6 +2459,10 @@ void glue(helper_roundpd, SUFFIX)(CPUX86State *env, = Reg *d, Reg *s, =20 d->ZMM_D(0) =3D float64_round_to_int(s->ZMM_D(0), &env->sse_status); d->ZMM_D(1) =3D float64_round_to_int(s->ZMM_D(1), &env->sse_status); +#if SHIFT =3D=3D 2 + d->ZMM_D(2) =3D float64_round_to_int(s->ZMM_D(2), &env->sse_status); + d->ZMM_D(3) =3D float64_round_to_int(s->ZMM_D(3), &env->sse_status); +#endif =20 if (mode & (1 << 3) && !(old_flags & float_flag_inexact)) { set_float_exception_flags(get_float_exception_flags(&env->sse_stat= us) & @@ -1841,7 +2472,8 @@ void glue(helper_roundpd, SUFFIX)(CPUX86State *env, R= eg *d, Reg *s, env->sse_status.float_rounding_mode =3D prev_rounding_mode; } =20 -void glue(helper_roundss, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, +#if SHIFT =3D=3D 1 +void helper_roundss_xmm(CPUX86State *env, Reg *d, Reg *s, uint32_t mode) { uint8_t old_flags =3D get_float_exception_flags(&env->sse_status); @@ -1875,7 +2507,7 @@ void glue(helper_roundss, SUFFIX)(CPUX86State *env, R= eg *d, Reg *s, env->sse_status.float_rounding_mode =3D prev_rounding_mode; } =20 -void glue(helper_roundsd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, +void helper_roundsd_xmm(CPUX86State *env, Reg *d, Reg *s, 
uint32_t mode) { uint8_t old_flags =3D get_float_exception_flags(&env->sse_status); @@ -1908,99 +2540,158 @@ void glue(helper_roundsd, SUFFIX)(CPUX86State *env= , Reg *d, Reg *s, } env->sse_status.float_rounding_mode =3D prev_rounding_mode; } +#endif =20 -#define FBLENDP(d, s, m) (m ? s : d) +#define FBLENDP(v, s, m) (m ? s : v) SSE_HELPER_I(helper_blendps, L, 4, FBLENDP) SSE_HELPER_I(helper_blendpd, Q, 2, FBLENDP) SSE_HELPER_I(helper_pblendw, W, 8, FBLENDP) =20 -void glue(helper_dpps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, uint32_t = mask) +void glue(helper_dpps, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s, + uint32_t mask) { - float32 iresult =3D float32_zero; + float32 prod, iresult, iresult2; =20 + /* + * We must evaluate (A+B)+(C+D), not ((A+B)+C)+D + * to correctly round the intermediate results + */ if (mask & (1 << 4)) { - iresult =3D float32_add(iresult, - float32_mul(d->ZMM_S(0), s->ZMM_S(0), - &env->sse_status), - &env->sse_status); + iresult =3D float32_mul(v->ZMM_S(0), s->ZMM_S(0), &env->sse_status= ); + } else { + iresult =3D float32_zero; } if (mask & (1 << 5)) { - iresult =3D float32_add(iresult, - float32_mul(d->ZMM_S(1), s->ZMM_S(1), - &env->sse_status), - &env->sse_status); + prod =3D float32_mul(v->ZMM_S(1), s->ZMM_S(1), &env->sse_status); + } else { + prod =3D float32_zero; } + iresult =3D float32_add(iresult, prod, &env->sse_status); if (mask & (1 << 6)) { - iresult =3D float32_add(iresult, - float32_mul(d->ZMM_S(2), s->ZMM_S(2), - &env->sse_status), - &env->sse_status); + iresult2 =3D float32_mul(v->ZMM_S(2), s->ZMM_S(2), &env->sse_statu= s); + } else { + iresult2 =3D float32_zero; } if (mask & (1 << 7)) { - iresult =3D float32_add(iresult, - float32_mul(d->ZMM_S(3), s->ZMM_S(3), - &env->sse_status), - &env->sse_status); + prod =3D float32_mul(v->ZMM_S(3), s->ZMM_S(3), &env->sse_status); + } else { + prod =3D float32_zero; } + iresult2 =3D float32_add(iresult2, prod, &env->sse_status); + iresult =3D float32_add(iresult, iresult2, 
&env->sse_status); + d->ZMM_S(0) =3D (mask & (1 << 0)) ? iresult : float32_zero; d->ZMM_S(1) =3D (mask & (1 << 1)) ? iresult : float32_zero; d->ZMM_S(2) =3D (mask & (1 << 2)) ? iresult : float32_zero; d->ZMM_S(3) =3D (mask & (1 << 3)) ? iresult : float32_zero; +#if SHIFT =3D=3D 2 + if (mask & (1 << 4)) { + iresult =3D float32_mul(v->ZMM_S(4), s->ZMM_S(4), &env->sse_status= ); + } else { + iresult =3D float32_zero; + } + if (mask & (1 << 5)) { + prod =3D float32_mul(v->ZMM_S(5), s->ZMM_S(5), &env->sse_status); + } else { + prod =3D float32_zero; + } + iresult =3D float32_add(iresult, prod, &env->sse_status); + if (mask & (1 << 6)) { + iresult2 =3D float32_mul(v->ZMM_S(6), s->ZMM_S(6), &env->sse_statu= s); + } else { + iresult2 =3D float32_zero; + } + if (mask & (1 << 7)) { + prod =3D float32_mul(v->ZMM_S(7), s->ZMM_S(7), &env->sse_status); + } else { + prod =3D float32_zero; + } + iresult2 =3D float32_add(iresult2, prod, &env->sse_status); + iresult =3D float32_add(iresult, iresult2, &env->sse_status); + + d->ZMM_S(4) =3D (mask & (1 << 0)) ? iresult : float32_zero; + d->ZMM_S(5) =3D (mask & (1 << 1)) ? iresult : float32_zero; + d->ZMM_S(6) =3D (mask & (1 << 2)) ? iresult : float32_zero; + d->ZMM_S(7) =3D (mask & (1 << 3)) ? 
iresult : float32_zero; +#endif } =20 -void glue(helper_dppd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, uint32_t = mask) +#if SHIFT =3D=3D 1 +/* Oddly, there is no ymm version of dppd */ +void glue(helper_dppd, SUFFIX)(CPUX86State *env, + Reg *d, Reg *v, Reg *s, uint32_t mask) { - float64 iresult =3D float64_zero; + float64 iresult; =20 if (mask & (1 << 4)) { - iresult =3D float64_add(iresult, - float64_mul(d->ZMM_D(0), s->ZMM_D(0), - &env->sse_status), - &env->sse_status); + iresult =3D float64_mul(v->ZMM_D(0), s->ZMM_D(0), &env->sse_status= ); + } else { + iresult =3D float64_zero; } + if (mask & (1 << 5)) { iresult =3D float64_add(iresult, - float64_mul(d->ZMM_D(1), s->ZMM_D(1), + float64_mul(v->ZMM_D(1), s->ZMM_D(1), &env->sse_status), &env->sse_status); } d->ZMM_D(0) =3D (mask & (1 << 0)) ? iresult : float64_zero; d->ZMM_D(1) =3D (mask & (1 << 1)) ? iresult : float64_zero; } +#endif =20 -void glue(helper_mpsadbw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, +void glue(helper_mpsadbw, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s, uint32_t offset) { int s0 =3D (offset & 3) << 2; int d0 =3D (offset & 4) << 0; int i; - Reg r; + uint16_t r[8]; =20 for (i =3D 0; i < 8; i++, d0++) { - r.W(i) =3D 0; - r.W(i) +=3D abs1(d->B(d0 + 0) - s->B(s0 + 0)); - r.W(i) +=3D abs1(d->B(d0 + 1) - s->B(s0 + 1)); - r.W(i) +=3D abs1(d->B(d0 + 2) - s->B(s0 + 2)); - r.W(i) +=3D abs1(d->B(d0 + 3) - s->B(s0 + 3)); + r[i] =3D 0; + r[i] +=3D abs1(v->B(d0 + 0) - s->B(s0 + 0)); + r[i] +=3D abs1(v->B(d0 + 1) - s->B(s0 + 1)); + r[i] +=3D abs1(v->B(d0 + 2) - s->B(s0 + 2)); + r[i] +=3D abs1(v->B(d0 + 3) - s->B(s0 + 3)); } + for (i =3D 0; i < 8; i++) { + d->W(i) =3D r[i]; + } +#if SHIFT =3D=3D 2 + s0 =3D ((offset & 0x18) >> 1) + 16; + d0 =3D ((offset & 0x20) >> 3) + 16; =20 - *d =3D r; + for (i =3D 0; i < 8; i++, d0++) { + r[i] =3D 0; + r[i] +=3D abs1(v->B(d0 + 0) - s->B(s0 + 0)); + r[i] +=3D abs1(v->B(d0 + 1) - s->B(s0 + 1)); + r[i] +=3D abs1(v->B(d0 + 2) - s->B(s0 + 2)); + r[i] +=3D abs1(v->B(d0 
+ 3) - s->B(s0 + 3)); + } + for (i =3D 0; i < 8; i++) { + d->W(i + 8) =3D r[i]; + } +#endif } =20 /* SSE4.2 op helpers */ #define FCMPGTQ(d, s) ((int64_t)d > (int64_t)s ? -1 : 0) SSE_HELPER_Q(helper_pcmpgtq, FCMPGTQ) =20 +#if SHIFT =3D=3D 1 static inline int pcmp_elen(CPUX86State *env, int reg, uint32_t ctrl) { - int val; + int64_t val; =20 /* Presence of REX.W is indicated by a bit higher than 7 set */ if (ctrl >> 8) { - val =3D abs1((int64_t)env->regs[reg]); + val =3D env->regs[reg]; } else { - val =3D abs1((int32_t)env->regs[reg]); + val =3D (int32_t)env->regs[reg]; } + if (val < 0) + val =3D 16; =20 if (ctrl & 1) { if (val > 8) { @@ -2213,14 +2904,16 @@ target_ulong helper_crc32(uint32_t crc1, target_ulo= ng msg, uint32_t len) return crc; } =20 -void glue(helper_pclmulqdq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, - uint32_t ctrl) +#endif + +#if SHIFT =3D=3D 1 +static void clmulq(uint64_t *dest_l, uint64_t *dest_h, + uint64_t a, uint64_t b) { - uint64_t ah, al, b, resh, resl; + uint64_t al, ah, resh, resl; =20 ah =3D 0; - al =3D d->Q((ctrl & 1) !=3D 0); - b =3D s->Q((ctrl & 16) !=3D 0); + al =3D a; resh =3D resl =3D 0; =20 while (b) { @@ -2233,71 +2926,115 @@ void glue(helper_pclmulqdq, SUFFIX)(CPUX86State *e= nv, Reg *d, Reg *s, b >>=3D 1; } =20 - d->Q(0) =3D resl; - d->Q(1) =3D resh; + *dest_l =3D resl; + *dest_h =3D resh; } +#endif =20 -void glue(helper_aesdec, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +void glue(helper_pclmulqdq, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg = *s, + uint32_t ctrl) +{ + uint64_t a, b; + + a =3D v->Q((ctrl & 1) !=3D 0); + b =3D s->Q((ctrl & 16) !=3D 0); + clmulq(&d->Q(0), &d->Q(1), a, b); +#if SHIFT =3D=3D 2 + a =3D v->Q(((ctrl & 1) !=3D 0) + 2); + b =3D s->Q(((ctrl & 16) !=3D 0) + 2); + clmulq(&d->Q(2), &d->Q(3), a, b); +#endif +} + +void glue(helper_aesdec, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) { int i; - Reg st =3D *d; + Reg st =3D *v; Reg rk =3D *s; =20 for (i =3D 0 ; i < 4 ; i++) { - d->L(i) =3D rk.L(i) ^ 
bswap32(AES_Td0[st.B(AES_ishifts[4*i+0])] ^ - AES_Td1[st.B(AES_ishifts[4*i+1])] ^ - AES_Td2[st.B(AES_ishifts[4*i+2])] ^ - AES_Td3[st.B(AES_ishifts[4*i+3])]); + d->L(i) =3D rk.L(i) ^ bswap32(AES_Td0[st.B(AES_ishifts[4 * i + 0])= ] ^ + AES_Td1[st.B(AES_ishifts[4 * i + 1])] ^ + AES_Td2[st.B(AES_ishifts[4 * i + 2])] ^ + AES_Td3[st.B(AES_ishifts[4 * i + 3])]); } +#if SHIFT =3D=3D 2 + for (i =3D 0 ; i < 4 ; i++) { + d->L(i + 4) =3D rk.L(i + 4) ^ bswap32( + AES_Td0[st.B(AES_ishifts[4 * i + 0] + 16)] ^ + AES_Td1[st.B(AES_ishifts[4 * i + 1] + 16)] ^ + AES_Td2[st.B(AES_ishifts[4 * i + 2] + 16)] ^ + AES_Td3[st.B(AES_ishifts[4 * i + 3] + 16)]); + } +#endif } =20 -void glue(helper_aesdeclast, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +void glue(helper_aesdeclast, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg= *s) { int i; - Reg st =3D *d; + Reg st =3D *v; Reg rk =3D *s; =20 for (i =3D 0; i < 16; i++) { d->B(i) =3D rk.B(i) ^ (AES_isbox[st.B(AES_ishifts[i])]); } +#if SHIFT =3D=3D 2 + for (i =3D 0; i < 16; i++) { + d->B(i + 16) =3D rk.B(i + 16) ^ (AES_isbox[st.B(AES_ishifts[i] + 1= 6)]); + } +#endif } =20 -void glue(helper_aesenc, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +void glue(helper_aesenc, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) { int i; - Reg st =3D *d; + Reg st =3D *v; Reg rk =3D *s; =20 for (i =3D 0 ; i < 4 ; i++) { - d->L(i) =3D rk.L(i) ^ bswap32(AES_Te0[st.B(AES_shifts[4*i+0])] ^ - AES_Te1[st.B(AES_shifts[4*i+1])] ^ - AES_Te2[st.B(AES_shifts[4*i+2])] ^ - AES_Te3[st.B(AES_shifts[4*i+3])]); + d->L(i) =3D rk.L(i) ^ bswap32(AES_Te0[st.B(AES_shifts[4 * i + 0])]= ^ + AES_Te1[st.B(AES_shifts[4 * i + 1])] ^ + AES_Te2[st.B(AES_shifts[4 * i + 2])] ^ + AES_Te3[st.B(AES_shifts[4 * i + 3])]); + } +#if SHIFT =3D=3D 2 + for (i =3D 0 ; i < 4 ; i++) { + d->L(i + 4) =3D rk.L(i + 4) ^ bswap32( + AES_Te0[st.B(AES_shifts[4 * i + 0] + 16)] ^ + AES_Te1[st.B(AES_shifts[4 * i + 1] + 16)] ^ + AES_Te2[st.B(AES_shifts[4 * i + 2] + 16)] ^ + AES_Te3[st.B(AES_shifts[4 * i + 3] + 16)]); } 
+#endif } =20 -void glue(helper_aesenclast, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +void glue(helper_aesenclast, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg= *s) { int i; - Reg st =3D *d; + Reg st =3D *v; Reg rk =3D *s; =20 for (i =3D 0; i < 16; i++) { d->B(i) =3D rk.B(i) ^ (AES_sbox[st.B(AES_shifts[i])]); } - +#if SHIFT =3D=3D 2 + for (i =3D 0; i < 16; i++) { + d->B(i + 16) =3D rk.B(i + 16) ^ (AES_sbox[st.B(AES_shifts[i] + 16)= ]); + } +#endif } =20 +#if SHIFT =3D=3D 1 void glue(helper_aesimc, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) { int i; Reg tmp =3D *s; =20 for (i =3D 0 ; i < 4 ; i++) { - d->L(i) =3D bswap32(AES_imc[tmp.B(4*i+0)][0] ^ - AES_imc[tmp.B(4*i+1)][1] ^ - AES_imc[tmp.B(4*i+2)][2] ^ - AES_imc[tmp.B(4*i+3)][3]); + d->L(i) =3D bswap32(AES_imc[tmp.B(4 * i + 0)][0] ^ + AES_imc[tmp.B(4 * i + 1)][1] ^ + AES_imc[tmp.B(4 * i + 2)][2] ^ + AES_imc[tmp.B(4 * i + 3)][3]); } } =20 @@ -2315,9 +3052,430 @@ void glue(helper_aeskeygenassist, SUFFIX)(CPUX86Sta= te *env, Reg *d, Reg *s, d->L(3) =3D (d->L(2) << 24 | d->L(2) >> 8) ^ ctrl; } #endif +#endif + +#if SHIFT >=3D 1 +void glue(helper_vbroadcastb, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +{ + uint8_t val =3D s->B(0); + int i; + + for (i =3D 0; i < 16 * SHIFT; i++) { + d->B(i) =3D val; + } +} + +void glue(helper_vbroadcastw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +{ + uint16_t val =3D s->W(0); + int i; + + for (i =3D 0; i < 8 * SHIFT; i++) { + d->W(i) =3D val; + } +} + +void glue(helper_vbroadcastl, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +{ + uint32_t val =3D s->L(0); + int i; + + for (i =3D 0; i < 8 * SHIFT; i++) { + d->L(i) =3D val; + } +} + +void glue(helper_vbroadcastq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +{ + uint64_t val =3D s->Q(0); + d->Q(0) =3D val; + d->Q(1) =3D val; +#if SHIFT =3D=3D 2 + d->Q(2) =3D val; + d->Q(3) =3D val; +#endif +} + +void glue(helper_vpermilpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg = *s) +{ + uint64_t r0, r1; + + r0 =3D v->Q((s->Q(0) >> 1) & 1); + r1 =3D 
v->Q((s->Q(1) >> 1) & 1); + d->Q(0) =3D r0; + d->Q(1) =3D r1; +#if SHIFT =3D=3D 2 + r0 =3D v->Q(((s->Q(2) >> 1) & 1) + 2); + r1 =3D v->Q(((s->Q(3) >> 1) & 1) + 2); + d->Q(2) =3D r0; + d->Q(3) =3D r1; +#endif +} + +void glue(helper_vpermilps, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg = *s) +{ + uint32_t r0, r1, r2, r3; + + r0 =3D v->L(s->L(0) & 3); + r1 =3D v->L(s->L(1) & 3); + r2 =3D v->L(s->L(2) & 3); + r3 =3D v->L(s->L(3) & 3); + d->L(0) =3D r0; + d->L(1) =3D r1; + d->L(2) =3D r2; + d->L(3) =3D r3; +#if SHIFT =3D=3D 2 + r0 =3D v->L((s->L(4) & 3) + 4); + r1 =3D v->L((s->L(5) & 3) + 4); + r2 =3D v->L((s->L(6) & 3) + 4); + r3 =3D v->L((s->L(7) & 3) + 4); + d->L(4) =3D r0; + d->L(5) =3D r1; + d->L(6) =3D r2; + d->L(7) =3D r3; +#endif +} + +void glue(helper_vpermilpd_imm, SUFFIX)(CPUX86State *env, + Reg *d, Reg *s, uint32_t order) +{ + uint64_t r0, r1; + + r0 =3D s->Q((order >> 0) & 1); + r1 =3D s->Q((order >> 1) & 1); + d->Q(0) =3D r0; + d->Q(1) =3D r1; +#if SHIFT =3D=3D 2 + r0 =3D s->Q(((order >> 2) & 1) + 2); + r1 =3D s->Q(((order >> 3) & 1) + 2); + d->Q(2) =3D r0; + d->Q(3) =3D r1; +#endif +} + +void glue(helper_vpermilps_imm, SUFFIX)(CPUX86State *env, + Reg *d, Reg *s, uint32_t order) +{ + uint32_t r0, r1, r2, r3; + + r0 =3D s->L((order >> 0) & 3); + r1 =3D s->L((order >> 2) & 3); + r2 =3D s->L((order >> 4) & 3); + r3 =3D s->L((order >> 6) & 3); + d->L(0) =3D r0; + d->L(1) =3D r1; + d->L(2) =3D r2; + d->L(3) =3D r3; +#if SHIFT =3D=3D 2 + r0 =3D s->L(((order >> 0) & 3) + 4); + r1 =3D s->L(((order >> 2) & 3) + 4); + r2 =3D s->L(((order >> 4) & 3) + 4); + r3 =3D s->L(((order >> 6) & 3) + 4); + d->L(4) =3D r0; + d->L(5) =3D r1; + d->L(6) =3D r2; + d->L(7) =3D r3; +#endif +} + +#if SHIFT =3D=3D 1 +#define FPSRLVD(x, c) (c < 32 ? ((x) >> c) : 0) +#define FPSRLVQ(x, c) (c < 64 ? ((x) >> c) : 0) +#define FPSRAVD(x, c) ((int32_t)(x) >> (c < 32 ? c : 31)) +#define FPSRAVQ(x, c) ((int64_t)(x) >> (c < 64 ? c : 63)) +#define FPSLLVD(x, c) (c < 32 ? 
((x) << c) : 0) +#define FPSLLVQ(x, c) (c < 64 ? ((x) << c) : 0) +#endif + +SSE_HELPER_L(helper_vpsrlvd, FPSRLVD) +SSE_HELPER_L(helper_vpsravd, FPSRAVD) +SSE_HELPER_L(helper_vpsllvd, FPSLLVD) + +SSE_HELPER_Q(helper_vpsrlvq, FPSRLVQ) +SSE_HELPER_Q(helper_vpsravq, FPSRAVQ) +SSE_HELPER_Q(helper_vpsllvq, FPSLLVQ) + +void glue(helper_vtestps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +{ + uint32_t zf =3D (s->L(0) & d->L(0)) | (s->L(1) & d->L(1)); + uint32_t cf =3D (s->L(0) & ~d->L(0)) | (s->L(1) & ~d->L(1)); + + zf |=3D (s->L(2) & d->L(2)) | (s->L(3) & d->L(3)); + cf |=3D (s->L(2) & ~d->L(2)) | (s->L(3) & ~d->L(3)); +#if SHIFT =3D=3D 2 + zf |=3D (s->L(4) & d->L(4)) | (s->L(5) & d->L(5)); + cf |=3D (s->L(4) & ~d->L(4)) | (s->L(5) & ~d->L(5)); + zf |=3D (s->L(6) & d->L(6)) | (s->L(7) & d->L(7)); + cf |=3D (s->L(6) & ~d->L(6)) | (s->L(7) & ~d->L(7)); +#endif + CC_SRC =3D ((zf >> 31) ? 0 : CC_Z) | ((cf >> 31) ? 0 : CC_C); +} + +void glue(helper_vtestpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +{ + uint64_t zf =3D (s->Q(0) & d->Q(0)) | (s->Q(1) & d->Q(1)); + uint64_t cf =3D (s->Q(0) & ~d->Q(0)) | (s->Q(1) & ~d->Q(1)); + +#if SHIFT =3D=3D 2 + zf |=3D (s->Q(2) & d->Q(2)) | (s->Q(3) & d->Q(3)); + cf |=3D (s->Q(2) & ~d->Q(2)) | (s->Q(3) & ~d->Q(3)); +#endif + CC_SRC =3D ((zf >> 63) ? 0 : CC_Z) | ((cf >> 63) ? 0 : CC_C); +} + +void glue(helper_vpmaskmovd_st, SUFFIX)(CPUX86State *env, + Reg *s, Reg *v, target_ulong a0) +{ + int i; + + for (i =3D 0; i < (2 << SHIFT); i++) { + if (v->L(i) >> 31) { + cpu_stl_data_ra(env, a0 + i * 4, s->L(i), GETPC()); + } + } +} + +void glue(helper_vpmaskmovq_st, SUFFIX)(CPUX86State *env, + Reg *s, Reg *v, target_ulong a0) +{ + int i; + + for (i =3D 0; i < (1 << SHIFT); i++) { + if (v->Q(i) >> 63) { + cpu_stq_data_ra(env, a0 + i * 8, s->Q(i), GETPC()); + } + } +} + +void glue(helper_vpmaskmovd, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg= *s) +{ + d->L(0) =3D (v->L(0) >> 31) ? s->L(0) : 0; + d->L(1) =3D (v->L(1) >> 31) ? 
s->L(1) : 0; + d->L(2) =3D (v->L(2) >> 31) ? s->L(2) : 0; + d->L(3) =3D (v->L(3) >> 31) ? s->L(3) : 0; +#if SHIFT =3D=3D 2 + d->L(4) =3D (v->L(4) >> 31) ? s->L(4) : 0; + d->L(5) =3D (v->L(5) >> 31) ? s->L(5) : 0; + d->L(6) =3D (v->L(6) >> 31) ? s->L(6) : 0; + d->L(7) =3D (v->L(7) >> 31) ? s->L(7) : 0; +#endif +} + +void glue(helper_vpmaskmovq, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg= *s) +{ + d->Q(0) =3D (v->Q(0) >> 63) ? s->Q(0) : 0; + d->Q(1) =3D (v->Q(1) >> 63) ? s->Q(1) : 0; +#if SHIFT =3D=3D 2 + d->Q(2) =3D (v->Q(2) >> 63) ? s->Q(2) : 0; + d->Q(3) =3D (v->Q(3) >> 63) ? s->Q(3) : 0; +#endif +} + +#define VGATHER_HELPER(scale) \ +void glue(helper_vpgatherdd ## scale, SUFFIX)(CPUX86State *env, \ + Reg *d, Reg *v, Reg *s, target_ulong a0) \ +{ \ + int i; \ + for (i =3D 0; i < (2 << SHIFT); i++) { \ + if (v->L(i) >> 31) { \ + target_ulong addr =3D a0 \ + + ((target_ulong)(int32_t)s->L(i) << scale); \ + d->L(i) =3D cpu_ldl_data_ra(env, addr, GETPC()); \ + } \ + v->L(i) =3D 0; \ + } \ +} \ +void glue(helper_vpgatherdq ## scale, SUFFIX)(CPUX86State *env, \ + Reg *d, Reg *v, Reg *s, target_ulong a0) \ +{ \ + int i; \ + for (i =3D 0; i < (1 << SHIFT); i++) { \ + if (v->Q(i) >> 63) { \ + target_ulong addr =3D a0 \ + + ((target_ulong)(int32_t)s->L(i) << scale); \ + d->Q(i) =3D cpu_ldq_data_ra(env, addr, GETPC()); \ + } \ + v->Q(i) =3D 0; \ + } \ +} \ +void glue(helper_vpgatherqd ## scale, SUFFIX)(CPUX86State *env, \ + Reg *d, Reg *v, Reg *s, target_ulong a0) \ +{ \ + int i; \ + for (i =3D 0; i < (1 << SHIFT); i++) { \ + if (v->L(i) >> 31) { \ + target_ulong addr =3D a0 \ + + ((target_ulong)(int64_t)s->Q(i) << scale); \ + d->L(i) =3D cpu_ldl_data_ra(env, addr, GETPC()); \ + } \ + v->L(i) =3D 0; \ + } \ + d->Q(SHIFT) =3D 0; \ + v->Q(SHIFT) =3D 0; \ + YMM_ONLY( \ + d->Q(3) =3D 0; \ + v->Q(3) =3D 0; \ + ) \ +} \ +void glue(helper_vpgatherqq ## scale, SUFFIX)(CPUX86State *env, \ + Reg *d, Reg *v, Reg *s, target_ulong a0) \ +{ \ + int i; \ + for (i =3D 0; i < (1 << 
SHIFT); i++) { \ + if (v->Q(i) >> 63) { \ + target_ulong addr =3D a0 \ + + ((target_ulong)(int64_t)s->Q(i) << scale); \ + d->Q(i) =3D cpu_ldq_data_ra(env, addr, GETPC()); \ + } \ + v->Q(i) =3D 0; \ + } \ +} + +VGATHER_HELPER(0) +VGATHER_HELPER(1) +VGATHER_HELPER(2) +VGATHER_HELPER(3) + +#if SHIFT =3D=3D 2 +void glue(helper_vbroadcastdq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +{ + d->Q(0) =3D s->Q(0); + d->Q(1) =3D s->Q(1); + d->Q(2) =3D s->Q(0); + d->Q(3) =3D s->Q(1); +} + +void helper_vzeroall(CPUX86State *env) +{ + int i; + + for (i =3D 0; i < 8; i++) { + env->xmm_regs[i].ZMM_Q(0) =3D 0; + env->xmm_regs[i].ZMM_Q(1) =3D 0; + env->xmm_regs[i].ZMM_Q(2) =3D 0; + env->xmm_regs[i].ZMM_Q(3) =3D 0; + } +} + +void helper_vzeroupper(CPUX86State *env) +{ + int i; + + for (i =3D 0; i < 8; i++) { + env->xmm_regs[i].ZMM_Q(2) =3D 0; + env->xmm_regs[i].ZMM_Q(3) =3D 0; + } +} + +#ifdef TARGET_X86_64 +void helper_vzeroall_hi8(CPUX86State *env) +{ + int i; + + for (i =3D 8; i < 16; i++) { + env->xmm_regs[i].ZMM_Q(0) =3D 0; + env->xmm_regs[i].ZMM_Q(1) =3D 0; + env->xmm_regs[i].ZMM_Q(2) =3D 0; + env->xmm_regs[i].ZMM_Q(3) =3D 0; + } +} + +void helper_vzeroupper_hi8(CPUX86State *env) +{ + int i; + + for (i =3D 8; i < 16; i++) { + env->xmm_regs[i].ZMM_Q(2) =3D 0; + env->xmm_regs[i].ZMM_Q(3) =3D 0; + } +} +#endif + +void helper_vpermdq_ymm(CPUX86State *env, + Reg *d, Reg *v, Reg *s, uint32_t order) +{ + uint64_t r0, r1, r2, r3; + + switch (order & 3) { + case 0: + r0 =3D v->Q(0); + r1 =3D v->Q(1); + break; + case 1: + r0 =3D v->Q(2); + r1 =3D v->Q(3); + break; + case 2: + r0 =3D s->Q(0); + r1 =3D s->Q(1); + break; + case 3: + r0 =3D s->Q(2); + r1 =3D s->Q(3); + break; + } + switch ((order >> 4) & 3) { + case 0: + r2 =3D v->Q(0); + r3 =3D v->Q(1); + break; + case 1: + r2 =3D v->Q(2); + r3 =3D v->Q(3); + break; + case 2: + r2 =3D s->Q(0); + r3 =3D s->Q(1); + break; + case 3: + r2 =3D s->Q(2); + r3 =3D s->Q(3); + break; + } + d->Q(0) =3D r0; + d->Q(1) =3D r1; + d->Q(2) =3D r2; + d->Q(3) 
=3D r3; +} + +void helper_vpermq_ymm(CPUX86State *env, Reg *d, Reg *s, uint32_t order) +{ + uint64_t r0, r1, r2, r3; + r0 =3D s->Q(order & 3); + r1 =3D s->Q((order >> 2) & 3); + r2 =3D s->Q((order >> 4) & 3); + r3 =3D s->Q((order >> 6) & 3); + d->Q(0) =3D r0; + d->Q(1) =3D r1; + d->Q(2) =3D r2; + d->Q(3) =3D r3; +} + +void helper_vpermd_ymm(CPUX86State *env, Reg *d, Reg *v, Reg *s) +{ + uint32_t r[8]; + int i; + + for (i =3D 0; i < 8; i++) { + r[i] =3D s->L(v->L(i) & 7); + } + for (i =3D 0; i < 8; i++) { + d->L(i) =3D r[i]; + } +} + +#endif +#endif + +#undef SHIFT_HELPER_W +#undef SHIFT_HELPER_L +#undef SHIFT_HELPER_Q +#undef SSE_HELPER_S +#undef SSE_HELPER_CMP =20 #undef SHIFT #undef XMM_ONLY +#undef YMM_ONLY #undef Reg #undef B #undef W diff --git a/target/i386/ops_sse_header.h b/target/i386/ops_sse_header.h index cef28f2aae..83efb8ab41 100644 --- a/target/i386/ops_sse_header.h +++ b/target/i386/ops_sse_header.h @@ -21,7 +21,11 @@ #define SUFFIX _mmx #else #define Reg ZMMReg +#if SHIFT =3D=3D 1 #define SUFFIX _xmm +#else +#define SUFFIX _ymm +#endif #endif =20 #define dh_alias_Reg ptr @@ -34,31 +38,31 @@ #define dh_typecode_ZMMReg dh_typecode_ptr #define dh_typecode_MMXReg dh_typecode_ptr =20 -DEF_HELPER_3(glue(psrlw, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(psraw, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(psllw, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(psrld, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(psrad, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(pslld, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(psrlq, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(psllq, SUFFIX), void, env, Reg, Reg) - -#if SHIFT =3D=3D 1 -DEF_HELPER_3(glue(psrldq, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(pslldq, SUFFIX), void, env, Reg, Reg) +DEF_HELPER_4(glue(psrlw, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(psraw, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(psllw, SUFFIX), void, env, Reg, Reg, Reg) 
+DEF_HELPER_4(glue(psrld, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(psrad, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(pslld, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(psrlq, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(psllq, SUFFIX), void, env, Reg, Reg, Reg) + +#if SHIFT >=3D 1 +DEF_HELPER_4(glue(psrldq, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(pslldq, SUFFIX), void, env, Reg, Reg, Reg) #endif =20 #define SSE_HELPER_B(name, F)\ - DEF_HELPER_3(glue(name, SUFFIX), void, env, Reg, Reg) + DEF_HELPER_4(glue(name, SUFFIX), void, env, Reg, Reg, Reg) =20 #define SSE_HELPER_W(name, F)\ - DEF_HELPER_3(glue(name, SUFFIX), void, env, Reg, Reg) + DEF_HELPER_4(glue(name, SUFFIX), void, env, Reg, Reg, Reg) =20 #define SSE_HELPER_L(name, F)\ - DEF_HELPER_3(glue(name, SUFFIX), void, env, Reg, Reg) + DEF_HELPER_4(glue(name, SUFFIX), void, env, Reg, Reg, Reg) =20 #define SSE_HELPER_Q(name, F)\ - DEF_HELPER_3(glue(name, SUFFIX), void, env, Reg, Reg) + DEF_HELPER_4(glue(name, SUFFIX), void, env, Reg, Reg, Reg) =20 SSE_HELPER_B(paddb, FADD) SSE_HELPER_W(paddw, FADD) @@ -101,7 +105,7 @@ SSE_HELPER_L(pcmpeql, FCMPEQ) =20 SSE_HELPER_W(pmullw, FMULLW) #if SHIFT =3D=3D 0 -SSE_HELPER_W(pmulhrw, FMULHRW) +DEF_HELPER_3(glue(pmulhrw, SUFFIX), void, env, Reg, Reg) #endif SSE_HELPER_W(pmulhuw, FMULHUW) SSE_HELPER_W(pmulhw, FMULHW) @@ -109,11 +113,13 @@ SSE_HELPER_W(pmulhw, FMULHW) SSE_HELPER_B(pavgb, FAVG) SSE_HELPER_W(pavgw, FAVG) =20 -DEF_HELPER_3(glue(pmuludq, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(pmaddwd, SUFFIX), void, env, Reg, Reg) +DEF_HELPER_4(glue(pmuludq, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(pmaddwd, SUFFIX), void, env, Reg, Reg, Reg) =20 -DEF_HELPER_3(glue(psadbw, SUFFIX), void, env, Reg, Reg) +DEF_HELPER_4(glue(psadbw, SUFFIX), void, env, Reg, Reg, Reg) +#if SHIFT < 2 DEF_HELPER_4(glue(maskmov, SUFFIX), void, env, Reg, Reg, tl) +#endif DEF_HELPER_2(glue(movl_mm_T0, SUFFIX), void, Reg, i32) #ifdef 
TARGET_X86_64 DEF_HELPER_2(glue(movq_mm_T0, SUFFIX), void, Reg, i64) @@ -122,38 +128,63 @@ DEF_HELPER_2(glue(movq_mm_T0, SUFFIX), void, Reg, i64) #if SHIFT =3D=3D 0 DEF_HELPER_3(glue(pshufw, SUFFIX), void, Reg, Reg, int) #else -DEF_HELPER_3(shufps, void, Reg, Reg, int) -DEF_HELPER_3(shufpd, void, Reg, Reg, int) DEF_HELPER_3(glue(pshufd, SUFFIX), void, Reg, Reg, int) DEF_HELPER_3(glue(pshuflw, SUFFIX), void, Reg, Reg, int) DEF_HELPER_3(glue(pshufhw, SUFFIX), void, Reg, Reg, int) #endif =20 -#if SHIFT =3D=3D 1 +#if SHIFT >=3D 1 /* FPU ops */ /* XXX: not accurate */ =20 -#define SSE_HELPER_S(name, F) \ - DEF_HELPER_3(name ## ps, void, env, Reg, Reg) \ - DEF_HELPER_3(name ## ss, void, env, Reg, Reg) \ - DEF_HELPER_3(name ## pd, void, env, Reg, Reg) \ - DEF_HELPER_3(name ## sd, void, env, Reg, Reg) +#define SSE_HELPER_P4(name, ...) \ + DEF_HELPER_4(glue(name ## ps, SUFFIX), __VA_ARGS__) \ + DEF_HELPER_4(glue(name ## pd, SUFFIX), __VA_ARGS__) + +#define SSE_HELPER_P3(name, ...) \ + DEF_HELPER_3(glue(name ## ps, SUFFIX), __VA_ARGS__) \ + DEF_HELPER_3(glue(name ## pd, SUFFIX), __VA_ARGS__) + +#if SHIFT =3D=3D 1 +#define SSE_HELPER_S4(name, ...) \ + SSE_HELPER_P4(name, __VA_ARGS__) \ + DEF_HELPER_4(name ## ss, __VA_ARGS__) \ + DEF_HELPER_4(name ## sd, __VA_ARGS__) +#define SSE_HELPER_S3(name, ...) \ + SSE_HELPER_P3(name, __VA_ARGS__) \ + DEF_HELPER_3(name ## ss, __VA_ARGS__) \ + DEF_HELPER_3(name ## sd, __VA_ARGS__) +#else +#define SSE_HELPER_S4(name, ...) SSE_HELPER_P4(name, __VA_ARGS__) +#define SSE_HELPER_S3(name, ...) 
SSE_HELPER_P3(name, __VA_ARGS__) +#endif + +DEF_HELPER_4(glue(shufps, SUFFIX), void, Reg, Reg, Reg, int) +DEF_HELPER_4(glue(shufpd, SUFFIX), void, Reg, Reg, Reg, int) + +SSE_HELPER_S4(add, void, env, Reg, Reg, Reg) +SSE_HELPER_S4(sub, void, env, Reg, Reg, Reg) +SSE_HELPER_S4(mul, void, env, Reg, Reg, Reg) +SSE_HELPER_S4(div, void, env, Reg, Reg, Reg) +SSE_HELPER_S4(min, void, env, Reg, Reg, Reg) +SSE_HELPER_S4(max, void, env, Reg, Reg, Reg) + +SSE_HELPER_S3(sqrt, void, env, Reg, Reg) =20 -SSE_HELPER_S(add, FPU_ADD) -SSE_HELPER_S(sub, FPU_SUB) -SSE_HELPER_S(mul, FPU_MUL) -SSE_HELPER_S(div, FPU_DIV) -SSE_HELPER_S(min, FPU_MIN) -SSE_HELPER_S(max, FPU_MAX) -SSE_HELPER_S(sqrt, FPU_SQRT) +DEF_HELPER_3(glue(cvtps2pd, SUFFIX), void, env, Reg, Reg) +DEF_HELPER_3(glue(cvtpd2ps, SUFFIX), void, env, Reg, Reg) +DEF_HELPER_3(glue(cvtdq2ps, SUFFIX), void, env, Reg, Reg) +DEF_HELPER_3(glue(cvtdq2pd, SUFFIX), void, env, Reg, Reg) =20 +DEF_HELPER_3(glue(cvtps2dq, SUFFIX), void, env, ZMMReg, ZMMReg) +DEF_HELPER_3(glue(cvtpd2dq, SUFFIX), void, env, ZMMReg, ZMMReg) =20 -DEF_HELPER_3(cvtps2pd, void, env, Reg, Reg) -DEF_HELPER_3(cvtpd2ps, void, env, Reg, Reg) +DEF_HELPER_3(glue(cvttps2dq, SUFFIX), void, env, ZMMReg, ZMMReg) +DEF_HELPER_3(glue(cvttpd2dq, SUFFIX), void, env, ZMMReg, ZMMReg) + +#if SHIFT =3D=3D 1 DEF_HELPER_3(cvtss2sd, void, env, Reg, Reg) DEF_HELPER_3(cvtsd2ss, void, env, Reg, Reg) -DEF_HELPER_3(cvtdq2ps, void, env, Reg, Reg) -DEF_HELPER_3(cvtdq2pd, void, env, Reg, Reg) DEF_HELPER_3(cvtpi2ps, void, env, ZMMReg, MMXReg) DEF_HELPER_3(cvtpi2pd, void, env, ZMMReg, MMXReg) DEF_HELPER_3(cvtsi2ss, void, env, ZMMReg, i32) @@ -164,8 +195,6 @@ DEF_HELPER_3(cvtsq2ss, void, env, ZMMReg, i64) DEF_HELPER_3(cvtsq2sd, void, env, ZMMReg, i64) #endif =20 -DEF_HELPER_3(cvtps2dq, void, env, ZMMReg, ZMMReg) -DEF_HELPER_3(cvtpd2dq, void, env, ZMMReg, ZMMReg) DEF_HELPER_3(cvtps2pi, void, env, MMXReg, ZMMReg) DEF_HELPER_3(cvtpd2pi, void, env, MMXReg, ZMMReg) DEF_HELPER_2(cvtss2si, s32, env, 
ZMMReg) @@ -175,8 +204,6 @@ DEF_HELPER_2(cvtss2sq, s64, env, ZMMReg) DEF_HELPER_2(cvtsd2sq, s64, env, ZMMReg) #endif =20 -DEF_HELPER_3(cvttps2dq, void, env, ZMMReg, ZMMReg) -DEF_HELPER_3(cvttpd2dq, void, env, ZMMReg, ZMMReg) DEF_HELPER_3(cvttps2pi, void, env, MMXReg, ZMMReg) DEF_HELPER_3(cvttpd2pi, void, env, MMXReg, ZMMReg) DEF_HELPER_2(cvttss2si, s32, env, ZMMReg) @@ -185,60 +212,88 @@ DEF_HELPER_2(cvttsd2si, s32, env, ZMMReg) DEF_HELPER_2(cvttss2sq, s64, env, ZMMReg) DEF_HELPER_2(cvttsd2sq, s64, env, ZMMReg) #endif +#endif =20 -DEF_HELPER_3(rsqrtps, void, env, ZMMReg, ZMMReg) +DEF_HELPER_3(glue(rsqrtps, SUFFIX), void, env, ZMMReg, ZMMReg) +DEF_HELPER_3(glue(rcpps, SUFFIX), void, env, ZMMReg, ZMMReg) + +#if SHIFT =3D=3D 1 DEF_HELPER_3(rsqrtss, void, env, ZMMReg, ZMMReg) -DEF_HELPER_3(rcpps, void, env, ZMMReg, ZMMReg) DEF_HELPER_3(rcpss, void, env, ZMMReg, ZMMReg) DEF_HELPER_3(extrq_r, void, env, ZMMReg, ZMMReg) DEF_HELPER_4(extrq_i, void, env, ZMMReg, int, int) DEF_HELPER_3(insertq_r, void, env, ZMMReg, ZMMReg) DEF_HELPER_4(insertq_i, void, env, ZMMReg, int, int) -DEF_HELPER_3(haddps, void, env, ZMMReg, ZMMReg) -DEF_HELPER_3(haddpd, void, env, ZMMReg, ZMMReg) -DEF_HELPER_3(hsubps, void, env, ZMMReg, ZMMReg) -DEF_HELPER_3(hsubpd, void, env, ZMMReg, ZMMReg) -DEF_HELPER_3(addsubps, void, env, ZMMReg, ZMMReg) -DEF_HELPER_3(addsubpd, void, env, ZMMReg, ZMMReg) - -#define SSE_HELPER_CMP(name, F) \ - DEF_HELPER_3(name ## ps, void, env, Reg, Reg) \ - DEF_HELPER_3(name ## ss, void, env, Reg, Reg) \ - DEF_HELPER_3(name ## pd, void, env, Reg, Reg) \ - DEF_HELPER_3(name ## sd, void, env, Reg, Reg) - -SSE_HELPER_CMP(cmpeq, FPU_CMPEQ) -SSE_HELPER_CMP(cmplt, FPU_CMPLT) -SSE_HELPER_CMP(cmple, FPU_CMPLE) -SSE_HELPER_CMP(cmpunord, FPU_CMPUNORD) -SSE_HELPER_CMP(cmpneq, FPU_CMPNEQ) -SSE_HELPER_CMP(cmpnlt, FPU_CMPNLT) -SSE_HELPER_CMP(cmpnle, FPU_CMPNLE) -SSE_HELPER_CMP(cmpord, FPU_CMPORD) +#endif + +SSE_HELPER_P4(hadd, void, env, Reg, Reg, Reg) +SSE_HELPER_P4(hsub, void, env, Reg, 
Reg, Reg) +SSE_HELPER_P4(addsub, void, env, Reg, Reg, Reg) + +#define SSE_HELPER_CMP(name, F, C) SSE_HELPER_S4(name, void, env, Reg, Reg= , Reg) + +SSE_HELPER_CMP(cmpeq, FPU_CMPQ, FPU_EQ) +SSE_HELPER_CMP(cmplt, FPU_CMPS, FPU_LT) +SSE_HELPER_CMP(cmple, FPU_CMPS, FPU_LE) +SSE_HELPER_CMP(cmpunord, FPU_CMPQ, FPU_UNORD) +SSE_HELPER_CMP(cmpneq, FPU_CMPQ, !FPU_EQ) +SSE_HELPER_CMP(cmpnlt, FPU_CMPS, !FPU_LT) +SSE_HELPER_CMP(cmpnle, FPU_CMPS, !FPU_LE) +SSE_HELPER_CMP(cmpord, FPU_CMPQ, !FPU_UNORD) + +SSE_HELPER_CMP(cmpequ, FPU_CMPQ, FPU_EQU) +SSE_HELPER_CMP(cmpnge, FPU_CMPS, !FPU_GE) +SSE_HELPER_CMP(cmpngt, FPU_CMPS, !FPU_GT) +SSE_HELPER_CMP(cmpfalse, FPU_CMPQ, FPU_FALSE) +SSE_HELPER_CMP(cmpnequ, FPU_CMPQ, FPU_EQU) +SSE_HELPER_CMP(cmpge, FPU_CMPS, FPU_GE) +SSE_HELPER_CMP(cmpgt, FPU_CMPS, FPU_GT) +SSE_HELPER_CMP(cmptrue, FPU_CMPQ, !FPU_FALSE) + +SSE_HELPER_CMP(cmpeqs, FPU_CMPS, FPU_EQ) +SSE_HELPER_CMP(cmpltq, FPU_CMPQ, FPU_LT) +SSE_HELPER_CMP(cmpleq, FPU_CMPQ, FPU_LE) +SSE_HELPER_CMP(cmpunords, FPU_CMPS, FPU_UNORD) +SSE_HELPER_CMP(cmpneqq, FPU_CMPS, !FPU_EQ) +SSE_HELPER_CMP(cmpnltq, FPU_CMPQ, !FPU_LT) +SSE_HELPER_CMP(cmpnleq, FPU_CMPQ, !FPU_LE) +SSE_HELPER_CMP(cmpords, FPU_CMPS, !FPU_UNORD) + +SSE_HELPER_CMP(cmpequs, FPU_CMPS, FPU_EQU) +SSE_HELPER_CMP(cmpngeq, FPU_CMPQ, !FPU_GE) +SSE_HELPER_CMP(cmpngtq, FPU_CMPQ, !FPU_GT) +SSE_HELPER_CMP(cmpfalses, FPU_CMPS, FPU_FALSE) +SSE_HELPER_CMP(cmpnequs, FPU_CMPS, FPU_EQU) +SSE_HELPER_CMP(cmpgeq, FPU_CMPQ, FPU_GE) +SSE_HELPER_CMP(cmpgtq, FPU_CMPQ, FPU_GT) +SSE_HELPER_CMP(cmptrues, FPU_CMPS, !FPU_FALSE) =20 +#if SHIFT =3D=3D 1 DEF_HELPER_3(ucomiss, void, env, Reg, Reg) DEF_HELPER_3(comiss, void, env, Reg, Reg) DEF_HELPER_3(ucomisd, void, env, Reg, Reg) DEF_HELPER_3(comisd, void, env, Reg, Reg) -DEF_HELPER_2(movmskps, i32, env, Reg) -DEF_HELPER_2(movmskpd, i32, env, Reg) +#endif + +DEF_HELPER_2(glue(movmskps, SUFFIX), i32, env, Reg) +DEF_HELPER_2(glue(movmskpd, SUFFIX), i32, env, Reg) #endif =20 DEF_HELPER_2(glue(pmovmskb, SUFFIX), i32, 
env, Reg) -DEF_HELPER_3(glue(packsswb, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(packuswb, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(packssdw, SUFFIX), void, env, Reg, Reg) -#define UNPCK_OP(base_name, base) \ - DEF_HELPER_3(glue(punpck ## base_name ## bw, SUFFIX), void, env, Reg, = Reg) \ - DEF_HELPER_3(glue(punpck ## base_name ## wd, SUFFIX), void, env, Reg, = Reg) \ - DEF_HELPER_3(glue(punpck ## base_name ## dq, SUFFIX), void, env, Reg, = Reg) +DEF_HELPER_4(glue(packsswb, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(packuswb, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(packssdw, SUFFIX), void, env, Reg, Reg, Reg) +#define UNPCK_OP(name, base) \ + DEF_HELPER_4(glue(punpck ## name ## bw, SUFFIX), void, env, Reg, Reg, = Reg) \ + DEF_HELPER_4(glue(punpck ## name ## wd, SUFFIX), void, env, Reg, Reg, = Reg) \ + DEF_HELPER_4(glue(punpck ## name ## dq, SUFFIX), void, env, Reg, Reg, = Reg) =20 UNPCK_OP(l, 0) UNPCK_OP(h, 1) =20 -#if SHIFT =3D=3D 1 -DEF_HELPER_3(glue(punpcklqdq, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(punpckhqdq, SUFFIX), void, env, Reg, Reg) +#if SHIFT >=3D 1 +DEF_HELPER_4(glue(punpcklqdq, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(punpckhqdq, SUFFIX), void, env, Reg, Reg, Reg) #endif =20 /* 3DNow! 
float ops */ @@ -265,28 +320,28 @@ DEF_HELPER_3(pswapd, void, env, MMXReg, MMXReg) #endif =20 /* SSSE3 op helpers */ -DEF_HELPER_3(glue(phaddw, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(phaddd, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(phaddsw, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(phsubw, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(phsubd, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(phsubsw, SUFFIX), void, env, Reg, Reg) +DEF_HELPER_4(glue(phaddw, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(phaddd, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(phaddsw, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(phsubw, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(phsubd, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(phsubsw, SUFFIX), void, env, Reg, Reg, Reg) DEF_HELPER_3(glue(pabsb, SUFFIX), void, env, Reg, Reg) DEF_HELPER_3(glue(pabsw, SUFFIX), void, env, Reg, Reg) DEF_HELPER_3(glue(pabsd, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(pmaddubsw, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(pmulhrsw, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(pshufb, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(psignb, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(psignw, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(psignd, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_4(glue(palignr, SUFFIX), void, env, Reg, Reg, s32) +DEF_HELPER_4(glue(pmaddubsw, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(pmulhrsw, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(pshufb, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(psignb, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(psignw, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(psignd, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_5(glue(palignr, SUFFIX), void, env, Reg, Reg, Reg, s32) =20 /* SSE4.1 op helpers */ -#if SHIFT =3D=3D 1 -DEF_HELPER_3(glue(pblendvb, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(blendvps, SUFFIX), 
void, env, Reg, Reg) -DEF_HELPER_3(glue(blendvpd, SUFFIX), void, env, Reg, Reg) +#if SHIFT >=3D 1 +DEF_HELPER_5(glue(pblendvb, SUFFIX), void, env, Reg, Reg, Reg, Reg) +DEF_HELPER_5(glue(blendvps, SUFFIX), void, env, Reg, Reg, Reg, Reg) +DEF_HELPER_5(glue(blendvpd, SUFFIX), void, env, Reg, Reg, Reg, Reg) DEF_HELPER_3(glue(ptest, SUFFIX), void, env, Reg, Reg) DEF_HELPER_3(glue(pmovsxbw, SUFFIX), void, env, Reg, Reg) DEF_HELPER_3(glue(pmovsxbd, SUFFIX), void, env, Reg, Reg) @@ -300,34 +355,42 @@ DEF_HELPER_3(glue(pmovzxbq, SUFFIX), void, env, Reg, = Reg) DEF_HELPER_3(glue(pmovzxwd, SUFFIX), void, env, Reg, Reg) DEF_HELPER_3(glue(pmovzxwq, SUFFIX), void, env, Reg, Reg) DEF_HELPER_3(glue(pmovzxdq, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(pmuldq, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(pcmpeqq, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(packusdw, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(pminsb, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(pminsd, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(pminuw, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(pminud, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(pmaxsb, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(pmaxsd, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(pmaxuw, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(pmaxud, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(pmulld, SUFFIX), void, env, Reg, Reg) +DEF_HELPER_4(glue(pmuldq, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(pcmpeqq, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(packusdw, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(pminsb, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(pminsd, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(pminuw, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(pminud, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(pmaxsb, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(pmaxsd, SUFFIX), void, env, Reg, Reg, Reg) 
+DEF_HELPER_4(glue(pmaxuw, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(pmaxud, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(pmulld, SUFFIX), void, env, Reg, Reg, Reg) +#if SHIFT =3D=3D 1 DEF_HELPER_3(glue(phminposuw, SUFFIX), void, env, Reg, Reg) +#endif DEF_HELPER_4(glue(roundps, SUFFIX), void, env, Reg, Reg, i32) DEF_HELPER_4(glue(roundpd, SUFFIX), void, env, Reg, Reg, i32) -DEF_HELPER_4(glue(roundss, SUFFIX), void, env, Reg, Reg, i32) -DEF_HELPER_4(glue(roundsd, SUFFIX), void, env, Reg, Reg, i32) -DEF_HELPER_4(glue(blendps, SUFFIX), void, env, Reg, Reg, i32) -DEF_HELPER_4(glue(blendpd, SUFFIX), void, env, Reg, Reg, i32) -DEF_HELPER_4(glue(pblendw, SUFFIX), void, env, Reg, Reg, i32) -DEF_HELPER_4(glue(dpps, SUFFIX), void, env, Reg, Reg, i32) -DEF_HELPER_4(glue(dppd, SUFFIX), void, env, Reg, Reg, i32) -DEF_HELPER_4(glue(mpsadbw, SUFFIX), void, env, Reg, Reg, i32) +#if SHIFT =3D=3D 1 +DEF_HELPER_4(roundss_xmm, void, env, Reg, Reg, i32) +DEF_HELPER_4(roundsd_xmm, void, env, Reg, Reg, i32) +#endif +DEF_HELPER_5(glue(blendps, SUFFIX), void, env, Reg, Reg, Reg, i32) +DEF_HELPER_5(glue(blendpd, SUFFIX), void, env, Reg, Reg, Reg, i32) +DEF_HELPER_5(glue(pblendw, SUFFIX), void, env, Reg, Reg, Reg, i32) +DEF_HELPER_5(glue(dpps, SUFFIX), void, env, Reg, Reg, Reg, i32) +#if SHIFT =3D=3D 1 +DEF_HELPER_5(glue(dppd, SUFFIX), void, env, Reg, Reg, Reg, i32) +#endif +DEF_HELPER_5(glue(mpsadbw, SUFFIX), void, env, Reg, Reg, Reg, i32) #endif =20 /* SSE4.2 op helpers */ +#if SHIFT >=3D 1 +DEF_HELPER_4(glue(pcmpgtq, SUFFIX), void, env, Reg, Reg, Reg) +#endif #if SHIFT =3D=3D 1 -DEF_HELPER_3(glue(pcmpgtq, SUFFIX), void, env, Reg, Reg) DEF_HELPER_4(glue(pcmpestri, SUFFIX), void, env, Reg, Reg, i32) DEF_HELPER_4(glue(pcmpestrm, SUFFIX), void, env, Reg, Reg, i32) DEF_HELPER_4(glue(pcmpistri, SUFFIX), void, env, Reg, Reg, i32) @@ -336,14 +399,68 @@ DEF_HELPER_3(crc32, tl, i32, tl, i32) #endif =20 /* AES-NI op helpers */ +#if SHIFT >=3D 1 +DEF_HELPER_4(glue(aesdec, 
SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(aesdeclast, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(aesenc, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(aesenclast, SUFFIX), void, env, Reg, Reg, Reg) #if SHIFT =3D=3D 1 -DEF_HELPER_3(glue(aesdec, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(aesdeclast, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(aesenc, SUFFIX), void, env, Reg, Reg) -DEF_HELPER_3(glue(aesenclast, SUFFIX), void, env, Reg, Reg) DEF_HELPER_3(glue(aesimc, SUFFIX), void, env, Reg, Reg) DEF_HELPER_4(glue(aeskeygenassist, SUFFIX), void, env, Reg, Reg, i32) -DEF_HELPER_4(glue(pclmulqdq, SUFFIX), void, env, Reg, Reg, i32) +#endif +DEF_HELPER_5(glue(pclmulqdq, SUFFIX), void, env, Reg, Reg, Reg, i32) +#endif + +/* AVX helpers */ +#if SHIFT >=3D 1 +DEF_HELPER_3(glue(vbroadcastb, SUFFIX), void, env, Reg, Reg) +DEF_HELPER_3(glue(vbroadcastw, SUFFIX), void, env, Reg, Reg) +DEF_HELPER_3(glue(vbroadcastl, SUFFIX), void, env, Reg, Reg) +DEF_HELPER_3(glue(vbroadcastq, SUFFIX), void, env, Reg, Reg) +DEF_HELPER_4(glue(vpermilpd, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(vpermilps, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(vpermilpd_imm, SUFFIX), void, env, Reg, Reg, i32) +DEF_HELPER_4(glue(vpermilps_imm, SUFFIX), void, env, Reg, Reg, i32) +DEF_HELPER_4(glue(vpsrlvd, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(vpsravd, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(vpsllvd, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(vpsrlvq, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(vpsravq, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_4(glue(vpsllvq, SUFFIX), void, env, Reg, Reg, Reg) +DEF_HELPER_3(glue(vtestps, SUFFIX), void, env, Reg, Reg) +DEF_HELPER_3(glue(vtestpd, SUFFIX), void, env, Reg, Reg) +DEF_HELPER_4(glue(vpmaskmovd_st, SUFFIX), void, env, Reg, Reg, tl) +DEF_HELPER_4(glue(vpmaskmovq_st, SUFFIX), void, env, Reg, Reg, tl) +DEF_HELPER_4(glue(vpmaskmovd, SUFFIX), void, env, 
Reg, Reg, Reg) +DEF_HELPER_5(glue(vpgatherdd0, SUFFIX), void, env, Reg, Reg, Reg, tl) +DEF_HELPER_5(glue(vpgatherdq0, SUFFIX), void, env, Reg, Reg, Reg, tl) +DEF_HELPER_5(glue(vpgatherqd0, SUFFIX), void, env, Reg, Reg, Reg, tl) +DEF_HELPER_5(glue(vpgatherqq0, SUFFIX), void, env, Reg, Reg, Reg, tl) +DEF_HELPER_5(glue(vpgatherdd1, SUFFIX), void, env, Reg, Reg, Reg, tl) +DEF_HELPER_5(glue(vpgatherdq1, SUFFIX), void, env, Reg, Reg, Reg, tl) +DEF_HELPER_5(glue(vpgatherqd1, SUFFIX), void, env, Reg, Reg, Reg, tl) +DEF_HELPER_5(glue(vpgatherqq1, SUFFIX), void, env, Reg, Reg, Reg, tl) +DEF_HELPER_5(glue(vpgatherdd2, SUFFIX), void, env, Reg, Reg, Reg, tl) +DEF_HELPER_5(glue(vpgatherdq2, SUFFIX), void, env, Reg, Reg, Reg, tl) +DEF_HELPER_5(glue(vpgatherqd2, SUFFIX), void, env, Reg, Reg, Reg, tl) +DEF_HELPER_5(glue(vpgatherqq2, SUFFIX), void, env, Reg, Reg, Reg, tl) +DEF_HELPER_5(glue(vpgatherdd3, SUFFIX), void, env, Reg, Reg, Reg, tl) +DEF_HELPER_5(glue(vpgatherdq3, SUFFIX), void, env, Reg, Reg, Reg, tl) +DEF_HELPER_5(glue(vpgatherqd3, SUFFIX), void, env, Reg, Reg, Reg, tl) +DEF_HELPER_5(glue(vpgatherqq3, SUFFIX), void, env, Reg, Reg, Reg, tl) +DEF_HELPER_4(glue(vpmaskmovq, SUFFIX), void, env, Reg, Reg, Reg) +#if SHIFT =3D=3D 2 +DEF_HELPER_3(glue(vbroadcastdq, SUFFIX), void, env, Reg, Reg) +DEF_HELPER_1(vzeroall, void, env) +DEF_HELPER_1(vzeroupper, void, env) +#ifdef TARGET_X86_64 +DEF_HELPER_1(vzeroall_hi8, void, env) +DEF_HELPER_1(vzeroupper_hi8, void, env) +#endif +DEF_HELPER_5(vpermdq_ymm, void, env, Reg, Reg, Reg, i32) +DEF_HELPER_4(vpermq_ymm, void, env, Reg, Reg, i32) +DEF_HELPER_4(vpermd_ymm, void, env, Reg, Reg, Reg) +#endif #endif =20 #undef SHIFT @@ -354,6 +471,9 @@ DEF_HELPER_4(glue(pclmulqdq, SUFFIX), void, env, Reg, R= eg, i32) #undef SSE_HELPER_W #undef SSE_HELPER_L #undef SSE_HELPER_Q -#undef SSE_HELPER_S +#undef SSE_HELPER_S3 +#undef SSE_HELPER_S4 +#undef SSE_HELPER_P3 +#undef SSE_HELPER_P4 #undef SSE_HELPER_CMP #undef UNPCK_OP diff --git 
a/target/i386/tcg/fpu_helper.c b/target/i386/tcg/fpu_helper.c index b391b69635..74cf86c986 100644 --- a/target/i386/tcg/fpu_helper.c +++ b/target/i386/tcg/fpu_helper.c @@ -3053,3 +3053,6 @@ void helper_movq(CPUX86State *env, void *d, void *s) =20 #define SHIFT 1 #include "ops_sse.h" + +#define SHIFT 2 +#include "ops_sse.h" diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c index c393913fe0..f1c7ab4455 100644 --- a/target/i386/tcg/translate.c +++ b/target/i386/tcg/translate.c @@ -125,6 +125,7 @@ typedef struct DisasContext { TCGv tmp4; TCGv_ptr ptr0; TCGv_ptr ptr1; + TCGv_ptr ptr2; TCGv_i32 tmp2_i32; TCGv_i32 tmp3_i32; TCGv_i64 tmp1_i64; @@ -2739,6 +2740,29 @@ static inline void gen_ldo_env_A0(DisasContext *s, i= nt offset) tcg_gen_st_i64(s->tmp1_i64, cpu_env, offset + offsetof(ZMMReg, ZMM_Q(1= ))); } =20 +static inline void gen_ldo_env_A0_ymmh(DisasContext *s, int offset) +{ + int mem_index =3D s->mem_index; + tcg_gen_qemu_ld_i64(s->tmp1_i64, s->A0, mem_index, MO_LEUQ); + tcg_gen_st_i64(s->tmp1_i64, cpu_env, offset + offsetof(ZMMReg, ZMM_Q(2= ))); + tcg_gen_addi_tl(s->tmp0, s->A0, 8); + tcg_gen_qemu_ld_i64(s->tmp1_i64, s->tmp0, mem_index, MO_LEUQ); + tcg_gen_st_i64(s->tmp1_i64, cpu_env, offset + offsetof(ZMMReg, ZMM_Q(3= ))); +} + +/* Load 256-bit ymm register value */ +static inline void gen_ldy_env_A0(DisasContext *s, int offset) +{ + int mem_index =3D s->mem_index; + gen_ldo_env_A0(s, offset); + tcg_gen_addi_tl(s->tmp0, s->A0, 16); + tcg_gen_qemu_ld_i64(s->tmp1_i64, s->tmp0, mem_index, MO_LEUQ); + tcg_gen_st_i64(s->tmp1_i64, cpu_env, offset + offsetof(ZMMReg, ZMM_Q(2= ))); + tcg_gen_addi_tl(s->tmp0, s->A0, 24); + tcg_gen_qemu_ld_i64(s->tmp1_i64, s->tmp0, mem_index, MO_LEUQ); + tcg_gen_st_i64(s->tmp1_i64, cpu_env, offset + offsetof(ZMMReg, ZMM_Q(3= ))); +} + static inline void gen_sto_env_A0(DisasContext *s, int offset) { int mem_index =3D s->mem_index; @@ -2749,6 +2773,29 @@ static inline void gen_sto_env_A0(DisasContext *s, i= nt offset) 
tcg_gen_qemu_st_i64(s->tmp1_i64, s->tmp0, mem_index, MO_LEUQ); } =20 +static inline void gen_sto_env_A0_ymmh(DisasContext *s, int offset) +{ + int mem_index =3D s->mem_index; + tcg_gen_ld_i64(s->tmp1_i64, cpu_env, offset + offsetof(ZMMReg, ZMM_Q(2= ))); + tcg_gen_qemu_st_i64(s->tmp1_i64, s->A0, mem_index, MO_LEUQ); + tcg_gen_addi_tl(s->tmp0, s->A0, 8); + tcg_gen_ld_i64(s->tmp1_i64, cpu_env, offset + offsetof(ZMMReg, ZMM_Q(3= ))); + tcg_gen_qemu_st_i64(s->tmp1_i64, s->tmp0, mem_index, MO_LEUQ); +} + +/* Store 256-bit ymm register value */ +static inline void gen_sty_env_A0(DisasContext *s, int offset) +{ + int mem_index =3D s->mem_index; + gen_sto_env_A0(s, offset); + tcg_gen_addi_tl(s->tmp0, s->A0, 16); + tcg_gen_ld_i64(s->tmp1_i64, cpu_env, offset + offsetof(ZMMReg, ZMM_Q(2= ))); + tcg_gen_qemu_st_i64(s->tmp1_i64, s->tmp0, mem_index, MO_LEUQ); + tcg_gen_addi_tl(s->tmp0, s->A0, 24); + tcg_gen_ld_i64(s->tmp1_i64, cpu_env, offset + offsetof(ZMMReg, ZMM_Q(3= ))); + tcg_gen_qemu_st_i64(s->tmp1_i64, s->tmp0, mem_index, MO_LEUQ); +} + static inline void gen_op_movo(DisasContext *s, int d_offset, int s_offset) { tcg_gen_ld_i64(s->tmp1_i64, cpu_env, s_offset + offsetof(ZMMReg, ZMM_Q= (0))); @@ -2757,6 +2804,32 @@ static inline void gen_op_movo(DisasContext *s, int = d_offset, int s_offset) tcg_gen_st_i64(s->tmp1_i64, cpu_env, d_offset + offsetof(ZMMReg, ZMM_Q= (1))); } =20 +static inline void gen_op_movo_ymmh(DisasContext *s, int d_offset, int s_o= ffset) +{ + tcg_gen_ld_i64(s->tmp1_i64, cpu_env, s_offset + offsetof(ZMMReg, ZMM_Q= (2))); + tcg_gen_st_i64(s->tmp1_i64, cpu_env, d_offset + offsetof(ZMMReg, ZMM_Q= (2))); + tcg_gen_ld_i64(s->tmp1_i64, cpu_env, s_offset + offsetof(ZMMReg, ZMM_Q= (3))); + tcg_gen_st_i64(s->tmp1_i64, cpu_env, d_offset + offsetof(ZMMReg, ZMM_Q= (3))); +} + +static inline void gen_op_movo_ymm_l2h(DisasContext *s, + int d_offset, int s_offset) +{ + tcg_gen_ld_i64(s->tmp1_i64, cpu_env, s_offset + offsetof(ZMMReg, ZMM_Q= (0))); + 
tcg_gen_st_i64(s->tmp1_i64, cpu_env, d_offset + offsetof(ZMMReg, ZMM_Q= (2))); + tcg_gen_ld_i64(s->tmp1_i64, cpu_env, s_offset + offsetof(ZMMReg, ZMM_Q= (1))); + tcg_gen_st_i64(s->tmp1_i64, cpu_env, d_offset + offsetof(ZMMReg, ZMM_Q= (3))); +} + +static inline void gen_op_movo_ymm_h2l(DisasContext *s, + int d_offset, int s_offset) +{ + tcg_gen_ld_i64(s->tmp1_i64, cpu_env, s_offset + offsetof(ZMMReg, ZMM_Q= (2))); + tcg_gen_st_i64(s->tmp1_i64, cpu_env, d_offset + offsetof(ZMMReg, ZMM_Q= (0))); + tcg_gen_ld_i64(s->tmp1_i64, cpu_env, s_offset + offsetof(ZMMReg, ZMM_Q= (3))); + tcg_gen_st_i64(s->tmp1_i64, cpu_env, d_offset + offsetof(ZMMReg, ZMM_Q= (1))); +} + static inline void gen_op_movq(DisasContext *s, int d_offset, int s_offset) { tcg_gen_ld_i64(s->tmp1_i64, cpu_env, s_offset); @@ -2775,170 +2848,270 @@ static inline void gen_op_movq_env_0(DisasContext= *s, int d_offset) tcg_gen_st_i64(s->tmp1_i64, cpu_env, d_offset); } =20 +#define XMM_OFFSET(reg) offsetof(CPUX86State, xmm_regs[reg]) + +/* + * Clear the top half of the ymm register after a VEX.128 instruction + * This could be optimized by tracking this in env->hflags + */ +static void gen_clear_ymmh(DisasContext *s, int reg) +{ + if (s->prefix & PREFIX_VEX) { + gen_op_movq_env_0(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_Q(2))= ); + gen_op_movq_env_0(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_Q(3))= ); + } +} + +typedef void (*SSEFunc_0_pp)(TCGv_ptr reg_a, TCGv_ptr reg_b); typedef void (*SSEFunc_i_ep)(TCGv_i32 val, TCGv_ptr env, TCGv_ptr reg); typedef void (*SSEFunc_l_ep)(TCGv_i64 val, TCGv_ptr env, TCGv_ptr reg); typedef void (*SSEFunc_0_epi)(TCGv_ptr env, TCGv_ptr reg, TCGv_i32 val); typedef void (*SSEFunc_0_epl)(TCGv_ptr env, TCGv_ptr reg, TCGv_i64 val); typedef void (*SSEFunc_0_epp)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b= ); +typedef void (*SSEFunc_0_eppp)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_= b, + TCGv_ptr reg_c); +typedef void (*SSEFunc_0_epppp)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg= _b, 
+ TCGv_ptr reg_c, TCGv_ptr reg_d); typedef void (*SSEFunc_0_eppi)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_= b, TCGv_i32 val); +typedef void (*SSEFunc_0_epppi)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg= _b, + TCGv_ptr reg_c, TCGv_i32 val); typedef void (*SSEFunc_0_ppi)(TCGv_ptr reg_a, TCGv_ptr reg_b, TCGv_i32 val= ); +typedef void (*SSEFunc_0_pppi)(TCGv_ptr reg_a, TCGv_ptr reg_b, TCGv_ptr re= g_c, + TCGv_i32 val); typedef void (*SSEFunc_0_eppt)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_= b, TCGv val); +typedef void (*SSEFunc_0_epppt)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg= _b, + TCGv_ptr reg_c, TCGv val); + +#define SSE_OPF_V0 (1 << 0) /* vex.v must be 1111b (only 2 operands= ) */ +#define SSE_OPF_CMP (1 << 1) /* does not write for first operand */ +#define SSE_OPF_BLENDV (1 << 2) /* blendv* instruction */ +#define SSE_OPF_SPECIAL (1 << 3) /* magic */ +#define SSE_OPF_3DNOW (1 << 4) /* 3DNow! instruction */ +#define SSE_OPF_MMX (1 << 5) /* MMX/integer/AVX2 instruction */ +#define SSE_OPF_SCALAR (1 << 6) /* Has SSE scalar variants */ +#define SSE_OPF_AVX2 (1 << 7) /* AVX2 instruction */ +#define SSE_OPF_SHUF (1 << 9) /* pshufx/shufpx */ + +#define OP(op, flags, a, b, c, d, e, f, g, h) \ + {flags, {{.op =3D a}, {.op =3D b}, {.op =3D c}, {.op =3D d}, \ + {.op =3D e}, {.op =3D f}, {.op =3D g}, {.op =3D h} } } + +#define MMX_OP(x) OP(op2, SSE_OPF_MMX, \ + gen_helper_ ## x ## _mmx, gen_helper_ ## x ## _xmm, NULL, NULL, \ + NULL, gen_helper_ ## x ## _ymm, NULL, NULL) + +#define SSE_FOP(name) OP(op2, SSE_OPF_SCALAR, \ + gen_helper_##name##ps_xmm, gen_helper_##name##pd_xmm, \ + gen_helper_##name##ss, gen_helper_##name##sd, \ + gen_helper_##name##ps_ymm, gen_helper_##name##pd_ymm, NULL, NULL) +#define SSE_OP(sname, dname, op, flags) OP(op, flags, \ + gen_helper_##sname##_xmm, gen_helper_##dname##_xmm, NULL, NULL, \ + gen_helper_##sname##_ymm, gen_helper_##dname##_ymm, NULL, NULL) + +struct SSEOpHelper_table1 { + int flags; + union { + SSEFunc_0_epp op1; + 
SSEFunc_0_ppi op1i; + SSEFunc_0_eppt op1t; + SSEFunc_0_eppp op2; + SSEFunc_0_pppi op2i; + } fn[8]; +}; =20 -#define SSE_SPECIAL ((void *)1) -#define SSE_DUMMY ((void *)2) - -#define MMX_OP2(x) { gen_helper_ ## x ## _mmx, gen_helper_ ## x ## _xmm } -#define SSE_FOP(x) { gen_helper_ ## x ## ps, gen_helper_ ## x ## pd, \ - gen_helper_ ## x ## ss, gen_helper_ ## x ## sd, } +#define SSE_3DNOW { SSE_OPF_3DNOW } +#define SSE_SPECIAL { SSE_OPF_SPECIAL } =20 -static const SSEFunc_0_epp sse_op_table1[256][4] =3D { +static const struct SSEOpHelper_table1 sse_op_table1[256] =3D { /* 3DNow! extensions */ - [0x0e] =3D { SSE_DUMMY }, /* femms */ - [0x0f] =3D { SSE_DUMMY }, /* pf... */ + [0x0e] =3D SSE_SPECIAL, /* femms */ + [0x0f] =3D SSE_3DNOW, /* pf... (sse_op_table5) */ /* pure SSE operations */ - [0x10] =3D { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* = movups, movupd, movss, movsd */ - [0x11] =3D { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* = movups, movupd, movss, movsd */ - [0x12] =3D { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* = movlps, movlpd, movsldup, movddup */ - [0x13] =3D { SSE_SPECIAL, SSE_SPECIAL }, /* movlps, movlpd */ - [0x14] =3D { gen_helper_punpckldq_xmm, gen_helper_punpcklqdq_xmm }, - [0x15] =3D { gen_helper_punpckhdq_xmm, gen_helper_punpckhqdq_xmm }, - [0x16] =3D { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* movhps, movh= pd, movshdup */ - [0x17] =3D { SSE_SPECIAL, SSE_SPECIAL }, /* movhps, movhpd */ - - [0x28] =3D { SSE_SPECIAL, SSE_SPECIAL }, /* movaps, movapd */ - [0x29] =3D { SSE_SPECIAL, SSE_SPECIAL }, /* movaps, movapd */ - [0x2a] =3D { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* = cvtpi2ps, cvtpi2pd, cvtsi2ss, cvtsi2sd */ - [0x2b] =3D { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* = movntps, movntpd, movntss, movntsd */ - [0x2c] =3D { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* = cvttps2pi, cvttpd2pi, cvttsd2si, cvttss2si */ - [0x2d] =3D { SSE_SPECIAL, SSE_SPECIAL, 
SSE_SPECIAL, SSE_SPECIAL }, /* = cvtps2pi, cvtpd2pi, cvtsd2si, cvtss2si */ - [0x2e] =3D { gen_helper_ucomiss, gen_helper_ucomisd }, - [0x2f] =3D { gen_helper_comiss, gen_helper_comisd }, - [0x50] =3D { SSE_SPECIAL, SSE_SPECIAL }, /* movmskps, movmskpd */ - [0x51] =3D SSE_FOP(sqrt), - [0x52] =3D { gen_helper_rsqrtps, NULL, gen_helper_rsqrtss, NULL }, - [0x53] =3D { gen_helper_rcpps, NULL, gen_helper_rcpss, NULL }, - [0x54] =3D { gen_helper_pand_xmm, gen_helper_pand_xmm }, /* andps, and= pd */ - [0x55] =3D { gen_helper_pandn_xmm, gen_helper_pandn_xmm }, /* andnps, = andnpd */ - [0x56] =3D { gen_helper_por_xmm, gen_helper_por_xmm }, /* orps, orpd */ - [0x57] =3D { gen_helper_pxor_xmm, gen_helper_pxor_xmm }, /* xorps, xor= pd */ + [0x10] =3D SSE_SPECIAL, /* movups, movupd, movss, movsd */ + [0x11] =3D SSE_SPECIAL, /* movups, movupd, movss, movsd */ + [0x12] =3D SSE_SPECIAL, /* movlps, movlpd, movsldup, movddup */ + [0x13] =3D SSE_SPECIAL, /* movlps, movlpd */ + [0x14] =3D SSE_OP(punpckldq, punpcklqdq, op2, 0), /* unpcklps, unpcklp= d */ + [0x15] =3D SSE_OP(punpckhdq, punpckhqdq, op2, 0), /* unpckhps, unpckhp= d */ + [0x16] =3D SSE_SPECIAL, /* movhps, movhpd, movshdup */ + [0x17] =3D SSE_SPECIAL, /* movhps, movhpd */ + + [0x28] =3D SSE_SPECIAL, /* movaps, movapd */ + [0x29] =3D SSE_SPECIAL, /* movaps, movapd */ + [0x2a] =3D SSE_SPECIAL, /* cvtpi2ps, cvtpi2pd, cvtsi2ss, cvtsi2sd */ + [0x2b] =3D SSE_SPECIAL, /* movntps, movntpd, movntss, movntsd */ + [0x2c] =3D SSE_SPECIAL, /* cvttps2pi, cvttpd2pi, cvttsd2si, cvttss2si = */ + [0x2d] =3D SSE_SPECIAL, /* cvtps2pi, cvtpd2pi, cvtsd2si, cvtss2si */ + [0x2e] =3D OP(op1, SSE_OPF_CMP | SSE_OPF_SCALAR | SSE_OPF_V0, + gen_helper_ucomiss, gen_helper_ucomisd, NULL, NULL, + NULL, NULL, NULL, NULL), + [0x2f] =3D OP(op1, SSE_OPF_CMP | SSE_OPF_SCALAR | SSE_OPF_V0, + gen_helper_comiss, gen_helper_comisd, NULL, NULL, + NULL, NULL, NULL, NULL), + [0x50] =3D SSE_SPECIAL, /* movmskps, movmskpd */ + [0x51] =3D OP(op1, SSE_OPF_SCALAR | 
SSE_OPF_V0, + gen_helper_sqrtps_xmm, gen_helper_sqrtpd_xmm, + gen_helper_sqrtss, gen_helper_sqrtsd, + gen_helper_sqrtps_ymm, gen_helper_sqrtpd_ymm, NULL, NULL), + [0x52] =3D OP(op1, SSE_OPF_SCALAR | SSE_OPF_V0, + gen_helper_rsqrtps_xmm, NULL, gen_helper_rsqrtss, NULL, + gen_helper_rsqrtps_ymm, NULL, NULL, NULL), + [0x53] =3D OP(op1, SSE_OPF_SCALAR | SSE_OPF_V0, + gen_helper_rcpps_xmm, NULL, gen_helper_rcpss, NULL, + gen_helper_rcpps_ymm, NULL, NULL, NULL), + [0x54] =3D SSE_OP(pand, pand, op2, 0), /* andps, andpd */ + [0x55] =3D SSE_OP(pandn, pandn, op2, 0), /* andnps, andnpd */ + [0x56] =3D SSE_OP(por, por, op2, 0), /* orps, orpd */ + [0x57] =3D SSE_OP(pxor, pxor, op2, 0), /* xorps, xorpd */ [0x58] =3D SSE_FOP(add), [0x59] =3D SSE_FOP(mul), - [0x5a] =3D { gen_helper_cvtps2pd, gen_helper_cvtpd2ps, - gen_helper_cvtss2sd, gen_helper_cvtsd2ss }, - [0x5b] =3D { gen_helper_cvtdq2ps, gen_helper_cvtps2dq, gen_helper_cvtt= ps2dq }, + [0x5a] =3D OP(op1, SSE_OPF_SCALAR | SSE_OPF_V0, + gen_helper_cvtps2pd_xmm, gen_helper_cvtpd2ps_xmm, + gen_helper_cvtss2sd, gen_helper_cvtsd2ss, + gen_helper_cvtps2pd_ymm, gen_helper_cvtpd2ps_ymm, NULL, NU= LL), + [0x5b] =3D OP(op1, SSE_OPF_V0, + gen_helper_cvtdq2ps_xmm, gen_helper_cvtps2dq_xmm, + gen_helper_cvttps2dq_xmm, NULL, + gen_helper_cvtdq2ps_ymm, gen_helper_cvtps2dq_ymm, + gen_helper_cvttps2dq_ymm, NULL), [0x5c] =3D SSE_FOP(sub), [0x5d] =3D SSE_FOP(min), [0x5e] =3D SSE_FOP(div), [0x5f] =3D SSE_FOP(max), =20 - [0xc2] =3D SSE_FOP(cmpeq), - [0xc6] =3D { (SSEFunc_0_epp)gen_helper_shufps, - (SSEFunc_0_epp)gen_helper_shufpd }, /* XXX: casts */ + [0xc2] =3D SSE_FOP(cmpeq), /* sse_op_table4 */ + [0xc6] =3D SSE_OP(shufps, shufpd, op2i, SSE_OPF_SHUF), =20 /* SSSE3, SSE4, MOVBE, CRC32, BMI1, BMI2, ADX. 
*/ - [0x38] =3D { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, - [0x3a] =3D { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, + [0x38] =3D SSE_SPECIAL, + [0x3a] =3D SSE_SPECIAL, =20 /* MMX ops and their SSE extensions */ - [0x60] =3D MMX_OP2(punpcklbw), - [0x61] =3D MMX_OP2(punpcklwd), - [0x62] =3D MMX_OP2(punpckldq), - [0x63] =3D MMX_OP2(packsswb), - [0x64] =3D MMX_OP2(pcmpgtb), - [0x65] =3D MMX_OP2(pcmpgtw), - [0x66] =3D MMX_OP2(pcmpgtl), - [0x67] =3D MMX_OP2(packuswb), - [0x68] =3D MMX_OP2(punpckhbw), - [0x69] =3D MMX_OP2(punpckhwd), - [0x6a] =3D MMX_OP2(punpckhdq), - [0x6b] =3D MMX_OP2(packssdw), - [0x6c] =3D { NULL, gen_helper_punpcklqdq_xmm }, - [0x6d] =3D { NULL, gen_helper_punpckhqdq_xmm }, - [0x6e] =3D { SSE_SPECIAL, SSE_SPECIAL }, /* movd mm, ea */ - [0x6f] =3D { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* movq, movdqa,= , movqdu */ - [0x70] =3D { (SSEFunc_0_epp)gen_helper_pshufw_mmx, - (SSEFunc_0_epp)gen_helper_pshufd_xmm, - (SSEFunc_0_epp)gen_helper_pshufhw_xmm, - (SSEFunc_0_epp)gen_helper_pshuflw_xmm }, /* XXX: casts */ - [0x71] =3D { SSE_SPECIAL, SSE_SPECIAL }, /* shiftw */ - [0x72] =3D { SSE_SPECIAL, SSE_SPECIAL }, /* shiftd */ - [0x73] =3D { SSE_SPECIAL, SSE_SPECIAL }, /* shiftq */ - [0x74] =3D MMX_OP2(pcmpeqb), - [0x75] =3D MMX_OP2(pcmpeqw), - [0x76] =3D MMX_OP2(pcmpeql), - [0x77] =3D { SSE_DUMMY }, /* emms */ - [0x78] =3D { NULL, SSE_SPECIAL, NULL, SSE_SPECIAL }, /* extrq_i, inser= tq_i */ - [0x79] =3D { NULL, gen_helper_extrq_r, NULL, gen_helper_insertq_r }, - [0x7c] =3D { NULL, gen_helper_haddpd, NULL, gen_helper_haddps }, - [0x7d] =3D { NULL, gen_helper_hsubpd, NULL, gen_helper_hsubps }, - [0x7e] =3D { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* movd, movd, ,= movq */ - [0x7f] =3D { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* movq, movdqa,= movdqu */ - [0xc4] =3D { SSE_SPECIAL, SSE_SPECIAL }, /* pinsrw */ - [0xc5] =3D { SSE_SPECIAL, SSE_SPECIAL }, /* pextrw */ - [0xd0] =3D { NULL, gen_helper_addsubpd, NULL, gen_helper_addsubps }, 
- [0xd1] =3D MMX_OP2(psrlw), - [0xd2] =3D MMX_OP2(psrld), - [0xd3] =3D MMX_OP2(psrlq), - [0xd4] =3D MMX_OP2(paddq), - [0xd5] =3D MMX_OP2(pmullw), - [0xd6] =3D { NULL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, - [0xd7] =3D { SSE_SPECIAL, SSE_SPECIAL }, /* pmovmskb */ - [0xd8] =3D MMX_OP2(psubusb), - [0xd9] =3D MMX_OP2(psubusw), - [0xda] =3D MMX_OP2(pminub), - [0xdb] =3D MMX_OP2(pand), - [0xdc] =3D MMX_OP2(paddusb), - [0xdd] =3D MMX_OP2(paddusw), - [0xde] =3D MMX_OP2(pmaxub), - [0xdf] =3D MMX_OP2(pandn), - [0xe0] =3D MMX_OP2(pavgb), - [0xe1] =3D MMX_OP2(psraw), - [0xe2] =3D MMX_OP2(psrad), - [0xe3] =3D MMX_OP2(pavgw), - [0xe4] =3D MMX_OP2(pmulhuw), - [0xe5] =3D MMX_OP2(pmulhw), - [0xe6] =3D { NULL, gen_helper_cvttpd2dq, gen_helper_cvtdq2pd, gen_help= er_cvtpd2dq }, - [0xe7] =3D { SSE_SPECIAL , SSE_SPECIAL }, /* movntq, movntq */ - [0xe8] =3D MMX_OP2(psubsb), - [0xe9] =3D MMX_OP2(psubsw), - [0xea] =3D MMX_OP2(pminsw), - [0xeb] =3D MMX_OP2(por), - [0xec] =3D MMX_OP2(paddsb), - [0xed] =3D MMX_OP2(paddsw), - [0xee] =3D MMX_OP2(pmaxsw), - [0xef] =3D MMX_OP2(pxor), - [0xf0] =3D { NULL, NULL, NULL, SSE_SPECIAL }, /* lddqu */ - [0xf1] =3D MMX_OP2(psllw), - [0xf2] =3D MMX_OP2(pslld), - [0xf3] =3D MMX_OP2(psllq), - [0xf4] =3D MMX_OP2(pmuludq), - [0xf5] =3D MMX_OP2(pmaddwd), - [0xf6] =3D MMX_OP2(psadbw), - [0xf7] =3D { (SSEFunc_0_epp)gen_helper_maskmov_mmx, - (SSEFunc_0_epp)gen_helper_maskmov_xmm }, /* XXX: casts */ - [0xf8] =3D MMX_OP2(psubb), - [0xf9] =3D MMX_OP2(psubw), - [0xfa] =3D MMX_OP2(psubl), - [0xfb] =3D MMX_OP2(psubq), - [0xfc] =3D MMX_OP2(paddb), - [0xfd] =3D MMX_OP2(paddw), - [0xfe] =3D MMX_OP2(paddl), + [0x60] =3D MMX_OP(punpcklbw), + [0x61] =3D MMX_OP(punpcklwd), + [0x62] =3D MMX_OP(punpckldq), + [0x63] =3D MMX_OP(packsswb), + [0x64] =3D MMX_OP(pcmpgtb), + [0x65] =3D MMX_OP(pcmpgtw), + [0x66] =3D MMX_OP(pcmpgtl), + [0x67] =3D MMX_OP(packuswb), + [0x68] =3D MMX_OP(punpckhbw), + [0x69] =3D MMX_OP(punpckhwd), + [0x6a] =3D MMX_OP(punpckhdq), + [0x6b] =3D 
MMX_OP(packssdw), + [0x6c] =3D OP(op2, SSE_OPF_MMX, + NULL, gen_helper_punpcklqdq_xmm, NULL, NULL, + NULL, gen_helper_punpcklqdq_ymm, NULL, NULL), + [0x6d] =3D OP(op2, SSE_OPF_MMX, + NULL, gen_helper_punpckhqdq_xmm, NULL, NULL, + NULL, gen_helper_punpckhqdq_ymm, NULL, NULL), + [0x6e] =3D SSE_SPECIAL, /* movd mm, ea */ + [0x6f] =3D SSE_SPECIAL, /* movq, movdqa, , movqdu */ + [0x70] =3D OP(op1i, SSE_OPF_SHUF | SSE_OPF_MMX | SSE_OPF_V0, + gen_helper_pshufw_mmx, gen_helper_pshufd_xmm, + gen_helper_pshufhw_xmm, gen_helper_pshuflw_xmm, + NULL, gen_helper_pshufd_ymm, + gen_helper_pshufhw_ymm, gen_helper_pshuflw_ymm), + [0x71] =3D SSE_SPECIAL, /* shiftw */ + [0x72] =3D SSE_SPECIAL, /* shiftd */ + [0x73] =3D SSE_SPECIAL, /* shiftq */ + [0x74] =3D MMX_OP(pcmpeqb), + [0x75] =3D MMX_OP(pcmpeqw), + [0x76] =3D MMX_OP(pcmpeql), + [0x77] =3D SSE_SPECIAL, /* emms */ + [0x78] =3D SSE_SPECIAL, /* extrq_i, insertq_i (sse4a) */ + [0x79] =3D OP(op1, SSE_OPF_V0, + NULL, gen_helper_extrq_r, NULL, gen_helper_insertq_r, + NULL, NULL, NULL, NULL), + [0x7c] =3D OP(op2, 0, + NULL, gen_helper_haddpd_xmm, NULL, gen_helper_haddps_xmm, + NULL, gen_helper_haddpd_ymm, NULL, gen_helper_haddps_ymm), + [0x7d] =3D OP(op2, 0, + NULL, gen_helper_hsubpd_xmm, NULL, gen_helper_hsubps_xmm, + NULL, gen_helper_hsubpd_ymm, NULL, gen_helper_hsubps_ymm), + [0x7e] =3D SSE_SPECIAL, /* movd, movd, , movq */ + [0x7f] =3D SSE_SPECIAL, /* movq, movdqa, movdqu */ + [0xc4] =3D SSE_SPECIAL, /* pinsrw */ + [0xc5] =3D SSE_SPECIAL, /* pextrw */ + [0xd0] =3D OP(op2, 0, + NULL, gen_helper_addsubpd_xmm, NULL, gen_helper_addsubps_x= mm, + NULL, gen_helper_addsubpd_ymm, NULL, gen_helper_addsubps_y= mm), + [0xd1] =3D MMX_OP(psrlw), + [0xd2] =3D MMX_OP(psrld), + [0xd3] =3D MMX_OP(psrlq), + [0xd4] =3D MMX_OP(paddq), + [0xd5] =3D MMX_OP(pmullw), + [0xd6] =3D SSE_SPECIAL, + [0xd7] =3D SSE_SPECIAL, /* pmovmskb */ + [0xd8] =3D MMX_OP(psubusb), + [0xd9] =3D MMX_OP(psubusw), + [0xda] =3D MMX_OP(pminub), + [0xdb] =3D MMX_OP(pand), + [0xdc] 
=3D MMX_OP(paddusb), + [0xdd] =3D MMX_OP(paddusw), + [0xde] =3D MMX_OP(pmaxub), + [0xdf] =3D MMX_OP(pandn), + [0xe0] =3D MMX_OP(pavgb), + [0xe1] =3D MMX_OP(psraw), + [0xe2] =3D MMX_OP(psrad), + [0xe3] =3D MMX_OP(pavgw), + [0xe4] =3D MMX_OP(pmulhuw), + [0xe5] =3D MMX_OP(pmulhw), + [0xe6] =3D OP(op1, SSE_OPF_V0, + NULL, gen_helper_cvttpd2dq_xmm, + gen_helper_cvtdq2pd_xmm, gen_helper_cvtpd2dq_xmm, + NULL, gen_helper_cvttpd2dq_ymm, + gen_helper_cvtdq2pd_ymm, gen_helper_cvtpd2dq_ymm), + [0xe7] =3D SSE_SPECIAL, /* movntq, movntq */ + [0xe8] =3D MMX_OP(psubsb), + [0xe9] =3D MMX_OP(psubsw), + [0xea] =3D MMX_OP(pminsw), + [0xeb] =3D MMX_OP(por), + [0xec] =3D MMX_OP(paddsb), + [0xed] =3D MMX_OP(paddsw), + [0xee] =3D MMX_OP(pmaxsw), + [0xef] =3D MMX_OP(pxor), + [0xf0] =3D SSE_SPECIAL, /* lddqu */ + [0xf1] =3D MMX_OP(psllw), + [0xf2] =3D MMX_OP(pslld), + [0xf3] =3D MMX_OP(psllq), + [0xf4] =3D MMX_OP(pmuludq), + [0xf5] =3D MMX_OP(pmaddwd), + [0xf6] =3D MMX_OP(psadbw), + [0xf7] =3D OP(op1t, SSE_OPF_MMX | SSE_OPF_V0, + gen_helper_maskmov_mmx, gen_helper_maskmov_xmm, NULL, NULL, + NULL, NULL, NULL, NULL), + [0xf8] =3D MMX_OP(psubb), + [0xf9] =3D MMX_OP(psubw), + [0xfa] =3D MMX_OP(psubl), + [0xfb] =3D MMX_OP(psubq), + [0xfc] =3D MMX_OP(paddb), + [0xfd] =3D MMX_OP(paddw), + [0xfe] =3D MMX_OP(paddl), }; - -static const SSEFunc_0_epp sse_op_table2[3 * 8][2] =3D { - [0 + 2] =3D MMX_OP2(psrlw), - [0 + 4] =3D MMX_OP2(psraw), - [0 + 6] =3D MMX_OP2(psllw), - [8 + 2] =3D MMX_OP2(psrld), - [8 + 4] =3D MMX_OP2(psrad), - [8 + 6] =3D MMX_OP2(pslld), - [16 + 2] =3D MMX_OP2(psrlq), - [16 + 3] =3D { NULL, gen_helper_psrldq_xmm }, - [16 + 6] =3D MMX_OP2(psllq), - [16 + 7] =3D { NULL, gen_helper_pslldq_xmm }, +#undef MMX_OP +#undef OP +#undef SSE_FOP +#undef SSE_OP +#undef SSE_SPECIAL + +#define MMX_OP(x) { gen_helper_ ## x ## _mmx, gen_helper_ ## x ## _xmm, \ + gen_helper_ ## x ## _ymm} +static const SSEFunc_0_eppp sse_op_table2[3 * 8][3] =3D { + [0 + 2] =3D MMX_OP(psrlw), + [0 + 4] =3D 
MMX_OP(psraw), + [0 + 6] =3D MMX_OP(psllw), + [8 + 2] =3D MMX_OP(psrld), + [8 + 4] =3D MMX_OP(psrad), + [8 + 6] =3D MMX_OP(pslld), + [16 + 2] =3D MMX_OP(psrlq), + [16 + 3] =3D { NULL, gen_helper_psrldq_xmm, gen_helper_psrldq_ymm}, + [16 + 6] =3D MMX_OP(psllq), + [16 + 7] =3D { NULL, gen_helper_pslldq_xmm, gen_helper_pslldq_ymm}, }; +#undef MMX_OP =20 static const SSEFunc_0_epi sse_op_table3ai[] =3D { gen_helper_cvtsi2ss, @@ -2968,16 +3141,53 @@ static const SSEFunc_l_ep sse_op_table3bq[] =3D { }; #endif =20 -static const SSEFunc_0_epp sse_op_table4[8][4] =3D { - SSE_FOP(cmpeq), - SSE_FOP(cmplt), - SSE_FOP(cmple), - SSE_FOP(cmpunord), - SSE_FOP(cmpneq), - SSE_FOP(cmpnlt), - SSE_FOP(cmpnle), - SSE_FOP(cmpord), +#define SSE_CMP(x) { \ + gen_helper_ ## x ## ps ## _xmm, gen_helper_ ## x ## pd ## _xmm, \ + gen_helper_ ## x ## ss, gen_helper_ ## x ## sd, \ + gen_helper_ ## x ## ps ## _ymm, gen_helper_ ## x ## pd ## _ymm} +static const SSEFunc_0_eppp sse_op_table4[32][6] =3D { + SSE_CMP(cmpeq), + SSE_CMP(cmplt), + SSE_CMP(cmple), + SSE_CMP(cmpunord), + SSE_CMP(cmpneq), + SSE_CMP(cmpnlt), + SSE_CMP(cmpnle), + SSE_CMP(cmpord), + + SSE_CMP(cmpequ), + SSE_CMP(cmpnge), + SSE_CMP(cmpngt), + SSE_CMP(cmpfalse), + SSE_CMP(cmpnequ), + SSE_CMP(cmpge), + SSE_CMP(cmpgt), + SSE_CMP(cmptrue), + + SSE_CMP(cmpeqs), + SSE_CMP(cmpltq), + SSE_CMP(cmpleq), + SSE_CMP(cmpunords), + SSE_CMP(cmpneqq), + SSE_CMP(cmpnltq), + SSE_CMP(cmpnleq), + SSE_CMP(cmpords), + + SSE_CMP(cmpequs), + SSE_CMP(cmpngeq), + SSE_CMP(cmpngtq), + SSE_CMP(cmpfalses), + SSE_CMP(cmpnequs), + SSE_CMP(cmpgeq), + SSE_CMP(cmpgtq), + SSE_CMP(cmptrues), }; +#undef SSE_CMP + +static void gen_helper_pavgusb(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_= b) +{ + gen_helper_pavgb_mmx(env, reg_a, reg_a, reg_b); +} =20 static const SSEFunc_0_epp sse_op_table5[256] =3D { [0x0c] =3D gen_helper_pi2fw, @@ -3003,117 +3213,291 @@ static const SSEFunc_0_epp sse_op_table5[256] =3D= { [0xb6] =3D gen_helper_movq, /* pfrcpit2 */ [0xb7] =3D 
gen_helper_pmulhrw_mmx, [0xbb] =3D gen_helper_pswapd, - [0xbf] =3D gen_helper_pavgb_mmx /* pavgusb */ + [0xbf] =3D gen_helper_pavgusb, }; =20 -struct SSEOpHelper_epp { - SSEFunc_0_epp op[2]; +struct SSEOpHelper_table6 { + union { + SSEFunc_0_epp op1; + SSEFunc_0_eppp op2; + SSEFunc_0_epppp op3; + } fn[3]; /* [0] =3D mmx, [1] =3D xmm, fn[2] =3D ymm */ uint32_t ext_mask; + int flags; }; =20 -struct SSEOpHelper_eppi { - SSEFunc_0_eppi op[2]; +struct SSEOpHelper_table7 { + union { + SSEFunc_0_eppi op1; + SSEFunc_0_epppi op2; + SSEFunc_0_epppp op3; + } fn[3]; uint32_t ext_mask; + int flags; +}; + +#define gen_helper_special_xmm NULL +#define gen_helper_special_ymm NULL + +#define OP(name, op, flags, ext, mmx_name) \ + {{{.op =3D mmx_name}, {.op =3D gen_helper_ ## name ## _xmm}, \ + {.op =3D gen_helper_ ## name ## _ymm} }, CPUID_EXT_ ## ext, flags} +#define BINARY_OP_MMX(name, ext) \ + OP(name, op2, SSE_OPF_MMX, ext, gen_helper_ ## name ## _mmx) +#define BINARY_OP(name, ext, flags) \ + OP(name, op2, flags, ext, NULL) +#define UNARY_OP_MMX(name, ext) \ + OP(name, op1, SSE_OPF_V0 | SSE_OPF_MMX, ext, gen_helper_ ## name ## _m= mx) +#define UNARY_OP(name, ext, flags) \ + OP(name, op1, SSE_OPF_V0 | flags, ext, NULL) +#define BLENDV_OP(name, ext, flags) OP(name, op3, SSE_OPF_BLENDV, ext, NUL= L) +#define CMP_OP(name, ext) OP(name, op1, SSE_OPF_CMP | SSE_OPF_V0, ext, NUL= L) +#define SPECIAL_OP(ext) OP(special, op1, SSE_OPF_SPECIAL, ext, NULL) + +/* prefix [66] 0f 38 */ +static const struct SSEOpHelper_table6 sse_op_table6[256] =3D { + [0x00] =3D BINARY_OP_MMX(pshufb, SSSE3), + [0x01] =3D BINARY_OP_MMX(phaddw, SSSE3), + [0x02] =3D BINARY_OP_MMX(phaddd, SSSE3), + [0x03] =3D BINARY_OP_MMX(phaddsw, SSSE3), + [0x04] =3D BINARY_OP_MMX(pmaddubsw, SSSE3), + [0x05] =3D BINARY_OP_MMX(phsubw, SSSE3), + [0x06] =3D BINARY_OP_MMX(phsubd, SSSE3), + [0x07] =3D BINARY_OP_MMX(phsubsw, SSSE3), + [0x08] =3D BINARY_OP_MMX(psignb, SSSE3), + [0x09] =3D BINARY_OP_MMX(psignw, SSSE3), + [0x0a] =3D 
BINARY_OP_MMX(psignd, SSSE3), + [0x0b] =3D BINARY_OP_MMX(pmulhrsw, SSSE3), + [0x0c] =3D BINARY_OP(vpermilps, AVX, 0), + [0x0d] =3D BINARY_OP(vpermilpd, AVX, 0), + [0x0e] =3D CMP_OP(vtestps, AVX), + [0x0f] =3D CMP_OP(vtestpd, AVX), + [0x10] =3D BLENDV_OP(pblendvb, SSE41, SSE_OPF_MMX), + [0x14] =3D BLENDV_OP(blendvps, SSE41, 0), + [0x15] =3D BLENDV_OP(blendvpd, SSE41, 0), +#define gen_helper_vpermd_xmm NULL + [0x16] =3D BINARY_OP(vpermd, AVX, SSE_OPF_AVX2), /* vpermps */ + [0x17] =3D CMP_OP(ptest, SSE41), + /* TODO:Some vbroadcast variants require AVX2 */ + [0x18] =3D UNARY_OP(vbroadcastl, AVX, SSE_OPF_SCALAR), /* vbroadcastss= */ + [0x19] =3D UNARY_OP(vbroadcastq, AVX, SSE_OPF_SCALAR), /* vbroadcastsd= */ +#define gen_helper_vbroadcastdq_xmm NULL + [0x1a] =3D UNARY_OP(vbroadcastdq, AVX, SSE_OPF_SCALAR), /* vbroadcastf= 128 */ + [0x1c] =3D UNARY_OP_MMX(pabsb, SSSE3), + [0x1d] =3D UNARY_OP_MMX(pabsw, SSSE3), + [0x1e] =3D UNARY_OP_MMX(pabsd, SSSE3), + [0x20] =3D UNARY_OP(pmovsxbw, SSE41, SSE_OPF_MMX), + [0x21] =3D UNARY_OP(pmovsxbd, SSE41, SSE_OPF_MMX), + [0x22] =3D UNARY_OP(pmovsxbq, SSE41, SSE_OPF_MMX), + [0x23] =3D UNARY_OP(pmovsxwd, SSE41, SSE_OPF_MMX), + [0x24] =3D UNARY_OP(pmovsxwq, SSE41, SSE_OPF_MMX), + [0x25] =3D UNARY_OP(pmovsxdq, SSE41, SSE_OPF_MMX), + [0x28] =3D BINARY_OP(pmuldq, SSE41, SSE_OPF_MMX), + [0x29] =3D BINARY_OP(pcmpeqq, SSE41, SSE_OPF_MMX), + [0x2a] =3D SPECIAL_OP(SSE41), /* movntqda */ + [0x2b] =3D BINARY_OP(packusdw, SSE41, SSE_OPF_MMX), + [0x2c] =3D BINARY_OP(vpmaskmovd, AVX, 0), /* vmaskmovps */ + [0x2d] =3D BINARY_OP(vpmaskmovq, AVX, 0), /* vmaskmovpd */ + [0x2e] =3D SPECIAL_OP(AVX), /* vmaskmovps */ + [0x2f] =3D SPECIAL_OP(AVX), /* vmaskmovpd */ + [0x30] =3D UNARY_OP(pmovzxbw, SSE41, SSE_OPF_MMX), + [0x31] =3D UNARY_OP(pmovzxbd, SSE41, SSE_OPF_MMX), + [0x32] =3D UNARY_OP(pmovzxbq, SSE41, SSE_OPF_MMX), + [0x33] =3D UNARY_OP(pmovzxwd, SSE41, SSE_OPF_MMX), + [0x34] =3D UNARY_OP(pmovzxwq, SSE41, SSE_OPF_MMX), + [0x35] =3D UNARY_OP(pmovzxdq, 
SSE41, SSE_OPF_MMX), + [0x36] =3D BINARY_OP(vpermd, AVX, SSE_OPF_AVX2), /* vpermd */ + [0x37] =3D BINARY_OP(pcmpgtq, SSE41, SSE_OPF_MMX), + [0x38] =3D BINARY_OP(pminsb, SSE41, SSE_OPF_MMX), + [0x39] =3D BINARY_OP(pminsd, SSE41, SSE_OPF_MMX), + [0x3a] =3D BINARY_OP(pminuw, SSE41, SSE_OPF_MMX), + [0x3b] =3D BINARY_OP(pminud, SSE41, SSE_OPF_MMX), + [0x3c] =3D BINARY_OP(pmaxsb, SSE41, SSE_OPF_MMX), + [0x3d] =3D BINARY_OP(pmaxsd, SSE41, SSE_OPF_MMX), + [0x3e] =3D BINARY_OP(pmaxuw, SSE41, SSE_OPF_MMX), + [0x3f] =3D BINARY_OP(pmaxud, SSE41, SSE_OPF_MMX), + [0x40] =3D BINARY_OP(pmulld, SSE41, SSE_OPF_MMX), +#define gen_helper_phminposuw_ymm NULL + [0x41] =3D UNARY_OP(phminposuw, SSE41, 0), + [0x45] =3D BINARY_OP(vpsrlvd, AVX, SSE_OPF_AVX2), + [0x46] =3D BINARY_OP(vpsravd, AVX, SSE_OPF_AVX2), + [0x47] =3D BINARY_OP(vpsllvd, AVX, SSE_OPF_AVX2), + /* vpbroadcastd */ + [0x58] =3D UNARY_OP(vbroadcastl, AVX, SSE_OPF_SCALAR | SSE_OPF_MMX), + /* vpbroadcastq */ + [0x59] =3D UNARY_OP(vbroadcastq, AVX, SSE_OPF_SCALAR | SSE_OPF_MMX), + /* vbroadcasti128 */ + [0x5a] =3D UNARY_OP(vbroadcastdq, AVX, SSE_OPF_SCALAR | SSE_OPF_MMX), + /* vpbroadcastb */ + [0x78] =3D UNARY_OP(vbroadcastb, AVX, SSE_OPF_SCALAR | SSE_OPF_MMX), + /* vpbroadcastw */ + [0x79] =3D UNARY_OP(vbroadcastw, AVX, SSE_OPF_SCALAR | SSE_OPF_MMX), + /* vpmaskmovd, vpmaskmovq */ + [0x8c] =3D BINARY_OP(vpmaskmovd, AVX, SSE_OPF_AVX2), + [0x8e] =3D SPECIAL_OP(AVX), /* vpmaskmovd, vpmaskmovq */ + [0x90] =3D SPECIAL_OP(AVX), /* vpgatherdd, vpgatherdq */ + [0x91] =3D SPECIAL_OP(AVX), /* vpgatherqd, vpgatherqq */ + [0x92] =3D SPECIAL_OP(AVX), /* vgatherdpd, vgatherdps */ + [0x93] =3D SPECIAL_OP(AVX), /* vgatherqpd, vgatherqps */ +#define gen_helper_aesimc_ymm NULL + [0xdb] =3D UNARY_OP(aesimc, AES, 0), + [0xdc] =3D BINARY_OP(aesenc, AES, 0), + [0xdd] =3D BINARY_OP(aesenclast, AES, 0), + [0xde] =3D BINARY_OP(aesdec, AES, 0), + [0xdf] =3D BINARY_OP(aesdeclast, AES, 0), +}; + +/* prefix [66] 0f 3a */ +static const struct 
SSEOpHelper_table7 sse_op_table7[256] =3D { +#define gen_helper_vpermq_xmm NULL + [0x00] =3D UNARY_OP(vpermq, AVX, SSE_OPF_AVX2), + [0x01] =3D UNARY_OP(vpermq, AVX, SSE_OPF_AVX2), /* vpermpd */ + [0x02] =3D BINARY_OP(blendps, AVX, SSE_OPF_AVX2), /* vpblendd */ + [0x04] =3D UNARY_OP(vpermilps_imm, AVX, 0), + [0x05] =3D UNARY_OP(vpermilpd_imm, AVX, 0), +#define gen_helper_vpermdq_xmm NULL + [0x06] =3D BINARY_OP(vpermdq, AVX, 0), /* vperm2f128 */ + [0x08] =3D UNARY_OP(roundps, SSE41, 0), + [0x09] =3D UNARY_OP(roundpd, SSE41, 0), +#define gen_helper_roundss_ymm NULL + [0x0a] =3D UNARY_OP(roundss, SSE41, SSE_OPF_SCALAR), +#define gen_helper_roundsd_ymm NULL + [0x0b] =3D UNARY_OP(roundsd, SSE41, SSE_OPF_SCALAR), + [0x0c] =3D BINARY_OP(blendps, SSE41, 0), + [0x0d] =3D BINARY_OP(blendpd, SSE41, 0), + [0x0e] =3D BINARY_OP(pblendw, SSE41, SSE_OPF_MMX), + [0x0f] =3D BINARY_OP_MMX(palignr, SSSE3), + [0x14] =3D SPECIAL_OP(SSE41), /* pextrb */ + [0x15] =3D SPECIAL_OP(SSE41), /* pextrw */ + [0x16] =3D SPECIAL_OP(SSE41), /* pextrd/pextrq */ + [0x17] =3D SPECIAL_OP(SSE41), /* extractps */ + [0x18] =3D SPECIAL_OP(AVX), /* vinsertf128 */ + [0x19] =3D SPECIAL_OP(AVX), /* vextractf128 */ + [0x20] =3D SPECIAL_OP(SSE41), /* pinsrb */ + [0x21] =3D SPECIAL_OP(SSE41), /* insertps */ + [0x22] =3D SPECIAL_OP(SSE41), /* pinsrd/pinsrq */ + [0x38] =3D SPECIAL_OP(AVX), /* vinserti128 */ + [0x39] =3D SPECIAL_OP(AVX), /* vextracti128 */ + [0x40] =3D BINARY_OP(dpps, SSE41, 0), +#define gen_helper_dppd_ymm NULL + [0x41] =3D BINARY_OP(dppd, SSE41, 0), + [0x42] =3D BINARY_OP(mpsadbw, SSE41, SSE_OPF_MMX), + [0x44] =3D BINARY_OP(pclmulqdq, PCLMULQDQ, 0), + [0x46] =3D BINARY_OP(vpermdq, AVX, SSE_OPF_AVX2), /* vperm2i128 */ + [0x4a] =3D BLENDV_OP(blendvps, SSE41, 0), + [0x4b] =3D BLENDV_OP(blendvpd, SSE41, 0), + [0x4c] =3D BLENDV_OP(pblendvb, SSE41, SSE_OPF_MMX), +#define gen_helper_pcmpestrm_ymm NULL + [0x60] =3D CMP_OP(pcmpestrm, SSE42), +#define gen_helper_pcmpestri_ymm NULL + [0x61] =3D 
CMP_OP(pcmpestri, SSE42), +#define gen_helper_pcmpistrm_ymm NULL + [0x62] =3D CMP_OP(pcmpistrm, SSE42), +#define gen_helper_pcmpistri_ymm NULL + [0x63] =3D CMP_OP(pcmpistri, SSE42), +#define gen_helper_aeskeygenassist_ymm NULL + [0xdf] =3D UNARY_OP(aeskeygenassist, AES, 0), }; =20 -#define SSSE3_OP(x) { MMX_OP2(x), CPUID_EXT_SSSE3 } -#define SSE41_OP(x) { { NULL, gen_helper_ ## x ## _xmm }, CPUID_EXT_SSE41 } -#define SSE42_OP(x) { { NULL, gen_helper_ ## x ## _xmm }, CPUID_EXT_SSE42 } -#define SSE41_SPECIAL { { NULL, SSE_SPECIAL }, CPUID_EXT_SSE41 } -#define PCLMULQDQ_OP(x) { { NULL, gen_helper_ ## x ## _xmm }, \ - CPUID_EXT_PCLMULQDQ } -#define AESNI_OP(x) { { NULL, gen_helper_ ## x ## _xmm }, CPUID_EXT_AES } - -static const struct SSEOpHelper_epp sse_op_table6[256] =3D { - [0x00] =3D SSSE3_OP(pshufb), - [0x01] =3D SSSE3_OP(phaddw), - [0x02] =3D SSSE3_OP(phaddd), - [0x03] =3D SSSE3_OP(phaddsw), - [0x04] =3D SSSE3_OP(pmaddubsw), - [0x05] =3D SSSE3_OP(phsubw), - [0x06] =3D SSSE3_OP(phsubd), - [0x07] =3D SSSE3_OP(phsubsw), - [0x08] =3D SSSE3_OP(psignb), - [0x09] =3D SSSE3_OP(psignw), - [0x0a] =3D SSSE3_OP(psignd), - [0x0b] =3D SSSE3_OP(pmulhrsw), - [0x10] =3D SSE41_OP(pblendvb), - [0x14] =3D SSE41_OP(blendvps), - [0x15] =3D SSE41_OP(blendvpd), - [0x17] =3D SSE41_OP(ptest), - [0x1c] =3D SSSE3_OP(pabsb), - [0x1d] =3D SSSE3_OP(pabsw), - [0x1e] =3D SSSE3_OP(pabsd), - [0x20] =3D SSE41_OP(pmovsxbw), - [0x21] =3D SSE41_OP(pmovsxbd), - [0x22] =3D SSE41_OP(pmovsxbq), - [0x23] =3D SSE41_OP(pmovsxwd), - [0x24] =3D SSE41_OP(pmovsxwq), - [0x25] =3D SSE41_OP(pmovsxdq), - [0x28] =3D SSE41_OP(pmuldq), - [0x29] =3D SSE41_OP(pcmpeqq), - [0x2a] =3D SSE41_SPECIAL, /* movntqda */ - [0x2b] =3D SSE41_OP(packusdw), - [0x30] =3D SSE41_OP(pmovzxbw), - [0x31] =3D SSE41_OP(pmovzxbd), - [0x32] =3D SSE41_OP(pmovzxbq), - [0x33] =3D SSE41_OP(pmovzxwd), - [0x34] =3D SSE41_OP(pmovzxwq), - [0x35] =3D SSE41_OP(pmovzxdq), - [0x37] =3D SSE42_OP(pcmpgtq), - [0x38] =3D SSE41_OP(pminsb), - [0x39] =3D 
SSE41_OP(pminsd), - [0x3a] =3D SSE41_OP(pminuw), - [0x3b] =3D SSE41_OP(pminud), - [0x3c] =3D SSE41_OP(pmaxsb), - [0x3d] =3D SSE41_OP(pmaxsd), - [0x3e] =3D SSE41_OP(pmaxuw), - [0x3f] =3D SSE41_OP(pmaxud), - [0x40] =3D SSE41_OP(pmulld), - [0x41] =3D SSE41_OP(phminposuw), - [0xdb] =3D AESNI_OP(aesimc), - [0xdc] =3D AESNI_OP(aesenc), - [0xdd] =3D AESNI_OP(aesenclast), - [0xde] =3D AESNI_OP(aesdec), - [0xdf] =3D AESNI_OP(aesdeclast), +#define SSE_OP(name) \ + {gen_helper_ ## name ##_xmm, gen_helper_ ## name ##_ymm} +static const SSEFunc_0_eppp sse_op_table8[3][2] =3D { + SSE_OP(vpsrlvq), + SSE_OP(vpsravq), + SSE_OP(vpsllvq), }; =20 -static const struct SSEOpHelper_eppi sse_op_table7[256] =3D { - [0x08] =3D SSE41_OP(roundps), - [0x09] =3D SSE41_OP(roundpd), - [0x0a] =3D SSE41_OP(roundss), - [0x0b] =3D SSE41_OP(roundsd), - [0x0c] =3D SSE41_OP(blendps), - [0x0d] =3D SSE41_OP(blendpd), - [0x0e] =3D SSE41_OP(pblendw), - [0x0f] =3D SSSE3_OP(palignr), - [0x14] =3D SSE41_SPECIAL, /* pextrb */ - [0x15] =3D SSE41_SPECIAL, /* pextrw */ - [0x16] =3D SSE41_SPECIAL, /* pextrd/pextrq */ - [0x17] =3D SSE41_SPECIAL, /* extractps */ - [0x20] =3D SSE41_SPECIAL, /* pinsrb */ - [0x21] =3D SSE41_SPECIAL, /* insertps */ - [0x22] =3D SSE41_SPECIAL, /* pinsrd/pinsrq */ - [0x40] =3D SSE41_OP(dpps), - [0x41] =3D SSE41_OP(dppd), - [0x42] =3D SSE41_OP(mpsadbw), - [0x44] =3D PCLMULQDQ_OP(pclmulqdq), - [0x60] =3D SSE42_OP(pcmpestrm), - [0x61] =3D SSE42_OP(pcmpestri), - [0x62] =3D SSE42_OP(pcmpistrm), - [0x63] =3D SSE42_OP(pcmpistri), - [0xdf] =3D AESNI_OP(aeskeygenassist), +static const SSEFunc_0_eppt sse_op_table9[2][2] =3D { + SSE_OP(vpmaskmovd_st), + SSE_OP(vpmaskmovq_st), }; =20 +static const SSEFunc_0_epppt sse_op_table10[16][2] =3D { + SSE_OP(vpgatherdd0), + SSE_OP(vpgatherdq0), + SSE_OP(vpgatherqd0), + SSE_OP(vpgatherqq0), + SSE_OP(vpgatherdd1), + SSE_OP(vpgatherdq1), + SSE_OP(vpgatherqd1), + SSE_OP(vpgatherqq1), + SSE_OP(vpgatherdd2), + SSE_OP(vpgatherdq2), + SSE_OP(vpgatherqd2), + 
SSE_OP(vpgatherqq2), + SSE_OP(vpgatherdd3), + SSE_OP(vpgatherdq3), + SSE_OP(vpgatherqd3), + SSE_OP(vpgatherqq3), +}; +#undef SSE_OP + +#undef OP +#undef BINARY_OP_MMX +#undef BINARY_OP +#undef UNARY_OP_MMX +#undef UNARY_OP +#undef BLENDV_OP +#undef SPECIAL_OP + +/* VEX prefix not allowed */ +#define CHECK_NO_VEX(s) do { \ + if (s->prefix & PREFIX_VEX) \ + goto illegal_op; \ + } while (0) + +/* + * VEX encodings require AVX + * Allow legacy SSE encodings even if AVX not enabled + */ +#define CHECK_AVX(s) do { \ + if ((s->prefix & PREFIX_VEX) \ + && !(env->hflags & HF_AVX_EN_MASK)) \ + goto illegal_op; \ + } while (0) + +/* If a VEX prefix is used then it must have V=3D1111b */ +#define CHECK_AVX_V0(s) do { \ + CHECK_AVX(s); \ + if ((s->prefix & PREFIX_VEX) && (s->vex_v !=3D 0)) \ + goto illegal_op; \ + } while (0) + +/* If a VEX prefix is used then it must have L=3D0 */ +#define CHECK_AVX_128(s) do { \ + CHECK_AVX(s); \ + if ((s->prefix & PREFIX_VEX) && (s->vex_l !=3D 0)) \ + goto illegal_op; \ + } while (0) + +/* If a VEX prefix is used then it must have V=3D1111b and L=3D0 */ +#define CHECK_AVX_V0_128(s) do { \ + CHECK_AVX(s); \ + if ((s->prefix & PREFIX_VEX) && (s->vex_v !=3D 0 || s->vex_l !=3D 0)) \ + goto illegal_op; \ + } while (0) + +/* 256-bit (ymm) variants require AVX2 */ +#define CHECK_AVX2_256(s) do { \ + if (s->vex_l && !(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_AVX2)) \ + goto illegal_op; \ + } while (0) + +/* Requires AVX2 and VEX encoding */ +#define CHECK_AVX2(s) do { \ + if ((s->prefix & PREFIX_VEX) =3D=3D 0 \ + || !(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_AVX2)) \ + goto illegal_op; \ + } while (0) + static void gen_sse(CPUX86State *env, DisasContext *s, int b, target_ulong pc_start) { - int b1, op1_offset, op2_offset, is_xmm, val; - int modrm, mod, rm, reg; - SSEFunc_0_epp sse_fn_epp; - SSEFunc_0_eppi sse_fn_eppi; - SSEFunc_0_ppi sse_fn_ppi; - SSEFunc_0_eppt sse_fn_eppt; + int b1, op1_offset, op2_offset, v_offset, is_xmm, val, scalar_op; + int 
modrm, mod, rm, reg, reg_v; + struct SSEOpHelper_table1 sse_op; + struct SSEOpHelper_table6 op6; + struct SSEOpHelper_table7 op7; MemOp ot; =20 b &=3D 0xff; @@ -3125,10 +3509,7 @@ static void gen_sse(CPUX86State *env, DisasContext *= s, int b, b1 =3D 3; else b1 =3D 0; - sse_fn_epp =3D sse_op_table1[b][b1]; - if (!sse_fn_epp) { - goto unknown_op; - } + sse_op =3D sse_op_table1[b]; if ((b <=3D 0x5f && b >=3D 0x10) || b =3D=3D 0xc6 || b =3D=3D 0xc2) { is_xmm =3D 1; } else { @@ -3139,20 +3520,28 @@ static void gen_sse(CPUX86State *env, DisasContext = *s, int b, is_xmm =3D 1; } } + if (sse_op.flags & SSE_OPF_3DNOW) { + if (!(s->cpuid_ext2_features & CPUID_EXT2_3DNOW)) { + goto illegal_op; + } + } /* simple MMX/SSE operation */ if (s->flags & HF_TS_MASK) { gen_exception(s, EXCP07_PREX, pc_start - s->cs_base); return; } - if (s->flags & HF_EM_MASK) { - illegal_op: - gen_illegal_opcode(s); - return; - } - if (is_xmm - && !(s->flags & HF_OSFXSR_MASK) - && (b !=3D 0x38 && b !=3D 0x3a)) { - goto unknown_op; + /* VEX encoded instructions ignore EM bit.
See also CHECK_AVX */ + if (!(s->prefix & PREFIX_VEX)) { + if (s->flags & HF_EM_MASK) { + illegal_op: + gen_illegal_opcode(s); + return; + } + if (is_xmm + && !(s->flags & HF_OSFXSR_MASK) + && (b !=3D 0x38 && b !=3D 0x3a)) { + goto unknown_op; + } } if (b =3D=3D 0x0e) { if (!(s->cpuid_ext2_features & CPUID_EXT2_3DNOW)) { @@ -3164,9 +3553,29 @@ static void gen_sse(CPUX86State *env, DisasContext *= s, int b, return; } if (b =3D=3D 0x77) { - /* emms */ - gen_helper_emms(cpu_env); - return; + if (s->prefix & PREFIX_VEX) { + CHECK_AVX(s); + if (s->vex_l) { + gen_helper_vzeroall(cpu_env); +#ifdef TARGET_X86_64 + if (CODE64(s)) { + gen_helper_vzeroall_hi8(cpu_env); + } +#endif + } else { + gen_helper_vzeroupper(cpu_env); +#ifdef TARGET_X86_64 + if (CODE64(s)) { + gen_helper_vzeroupper_hi8(cpu_env); + } +#endif + } + return; + } else { + /* emms */ + gen_helper_emms(cpu_env); + return; + } } /* prepare MMX state (XXX: optimize by storing fptt and fptags in the static cpu state) */ @@ -3179,11 +3588,17 @@ static void gen_sse(CPUX86State *env, DisasContext = *s, int b, if (is_xmm) { reg |=3D REX_R(s); } + if (s->prefix & PREFIX_VEX) { + reg_v =3D s->vex_v; + } else { + reg_v =3D reg; + } mod =3D (modrm >> 6) & 3; - if (sse_fn_epp =3D=3D SSE_SPECIAL) { + if (sse_op.flags & SSE_OPF_SPECIAL) { b |=3D (b1 << 8); switch(b) { case 0x0e7: /* movntq */ + CHECK_NO_VEX(s); if (mod =3D=3D 3) { goto illegal_op; } @@ -3193,19 +3608,31 @@ static void gen_sse(CPUX86State *env, DisasContext = *s, int b, case 0x1e7: /* movntdq */ case 0x02b: /* movntps */ case 0x12b: /* movntps */ + CHECK_AVX_V0(s); if (mod =3D=3D 3) goto illegal_op; gen_lea_modrm(env, s, modrm); - gen_sto_env_A0(s, offsetof(CPUX86State, xmm_regs[reg])); + if (s->vex_l) { + gen_sty_env_A0(s, XMM_OFFSET(reg)); + } else { + gen_sto_env_A0(s, XMM_OFFSET(reg)); + } break; case 0x3f0: /* lddqu */ + CHECK_AVX_V0(s); if (mod =3D=3D 3) goto illegal_op; gen_lea_modrm(env, s, modrm); - gen_ldo_env_A0(s, offsetof(CPUX86State, 
xmm_regs[reg])); + if (s->vex_l) { + gen_ldy_env_A0(s, XMM_OFFSET(reg)); + } else { + gen_ldo_env_A0(s, XMM_OFFSET(reg)); + gen_clear_ymmh(s, reg); + } break; case 0x22b: /* movntss */ case 0x32b: /* movntsd */ + CHECK_AVX_V0_128(s); if (mod =3D=3D 3) goto illegal_op; gen_lea_modrm(env, s, modrm); @@ -3219,6 +3646,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s= , int b, } break; case 0x6e: /* movd mm, ea */ + CHECK_NO_VEX(s); #ifdef TARGET_X86_64 if (s->dflag =3D=3D MO_64) { gen_ldst_modrm(env, s, modrm, MO_64, OR_TMP0, 0); @@ -3235,23 +3663,24 @@ static void gen_sse(CPUX86State *env, DisasContext = *s, int b, } break; case 0x16e: /* movd xmm, ea */ + CHECK_AVX_V0_128(s); #ifdef TARGET_X86_64 if (s->dflag =3D=3D MO_64) { gen_ldst_modrm(env, s, modrm, MO_64, OR_TMP0, 0); - tcg_gen_addi_ptr(s->ptr0, cpu_env, - offsetof(CPUX86State,xmm_regs[reg])); + tcg_gen_addi_ptr(s->ptr0, cpu_env, XMM_OFFSET(reg)); gen_helper_movq_mm_T0_xmm(s->ptr0, s->T0); } else #endif { gen_ldst_modrm(env, s, modrm, MO_32, OR_TMP0, 0); - tcg_gen_addi_ptr(s->ptr0, cpu_env, - offsetof(CPUX86State,xmm_regs[reg])); + tcg_gen_addi_ptr(s->ptr0, cpu_env, XMM_OFFSET(reg)); tcg_gen_trunc_tl_i32(s->tmp2_i32, s->T0); gen_helper_movl_mm_T0_xmm(s->ptr0, s->tmp2_i32); } + gen_clear_ymmh(s, reg); break; case 0x6f: /* movq mm, ea */ + CHECK_NO_VEX(s); if (mod !=3D 3) { gen_lea_modrm(env, s, modrm); gen_ldq_env_A0(s, offsetof(CPUX86State, fpregs[reg].mmx)); @@ -3269,17 +3698,28 @@ static void gen_sse(CPUX86State *env, DisasContext = *s, int b, case 0x128: /* movapd */ case 0x16f: /* movdqa xmm, ea */ case 0x26f: /* movdqu xmm, ea */ + CHECK_AVX_V0(s); if (mod !=3D 3) { gen_lea_modrm(env, s, modrm); - gen_ldo_env_A0(s, offsetof(CPUX86State, xmm_regs[reg])); + if (s->vex_l) { + gen_ldy_env_A0(s, XMM_OFFSET(reg)); + } else { + gen_ldo_env_A0(s, XMM_OFFSET(reg)); + } } else { rm =3D (modrm & 7) | REX_B(s); - gen_op_movo(s, offsetof(CPUX86State, xmm_regs[reg]), - offsetof(CPUX86State,xmm_regs[rm])); + 
gen_op_movo(s, XMM_OFFSET(reg), XMM_OFFSET(rm)); + if (s->vex_l) { + gen_op_movo_ymmh(s, XMM_OFFSET(reg), XMM_OFFSET(rm)); + } + } + if (!s->vex_l) { + gen_clear_ymmh(s, reg); } break; case 0x210: /* movss xmm, ea */ if (mod !=3D 3) { + CHECK_AVX_V0_128(s); gen_lea_modrm(env, s, modrm); gen_op_ld_v(s, MO_32, s->T0, s->A0); tcg_gen_st32_tl(s->T0, cpu_env, @@ -3292,13 +3732,21 @@ static void gen_sse(CPUX86State *env, DisasContext = *s, int b, tcg_gen_st32_tl(s->T0, cpu_env, offsetof(CPUX86State, xmm_regs[reg].ZMM_L(= 3))); } else { + CHECK_AVX_128(s); rm =3D (modrm & 7) | REX_B(s); - gen_op_movl(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_L(0= )), - offsetof(CPUX86State,xmm_regs[rm].ZMM_L(0))); + tcg_gen_ld_i32(s->tmp2_i32, cpu_env, + offsetof(CPUX86State, xmm_regs[rm].ZMM_L(0)= )); + if (reg !=3D reg_v) { + gen_op_movo(s, XMM_OFFSET(reg), XMM_OFFSET(reg_v)); + } + tcg_gen_st_i32(s->tmp2_i32, cpu_env, + offsetof(CPUX86State, xmm_regs[reg].ZMM_L(0= ))); } + gen_clear_ymmh(s, reg); break; case 0x310: /* movsd xmm, ea */ if (mod !=3D 3) { + CHECK_AVX_V0_128(s); gen_lea_modrm(env, s, modrm); gen_ldq_env_A0(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_Q(0))); @@ -3308,13 +3756,21 @@ static void gen_sse(CPUX86State *env, DisasContext = *s, int b, tcg_gen_st32_tl(s->T0, cpu_env, offsetof(CPUX86State, xmm_regs[reg].ZMM_L(= 3))); } else { + CHECK_AVX_128(s); rm =3D (modrm & 7) | REX_B(s); + if (reg !=3D reg_v) { + gen_op_movq(s, + offsetof(CPUX86State, xmm_regs[reg].ZMM_Q(1)), + offsetof(CPUX86State, xmm_regs[reg_v].ZMM_Q(1)= )); + } gen_op_movq(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_Q(0= )), - offsetof(CPUX86State,xmm_regs[rm].ZMM_Q(0))); + offsetof(CPUX86State, xmm_regs[rm].ZMM_Q(0))); } + gen_clear_ymmh(s, reg); break; case 0x012: /* movlps */ case 0x112: /* movlpd */ + CHECK_AVX_128(s); if (mod !=3D 3) { gen_lea_modrm(env, s, modrm); gen_ldq_env_A0(s, offsetof(CPUX86State, @@ -3323,40 +3779,84 @@ static void gen_sse(CPUX86State *env, DisasContext = *s, int b, /* movhlps 
*/ rm =3D (modrm & 7) | REX_B(s); gen_op_movq(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_Q(0= )), - offsetof(CPUX86State,xmm_regs[rm].ZMM_Q(1))); + offsetof(CPUX86State, xmm_regs[rm].ZMM_Q(1))); + } + if (reg !=3D reg_v) { + gen_op_movq(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_Q(1= )), + offsetof(CPUX86State, xmm_regs[reg_v].ZMM_Q(1)= )); } + gen_clear_ymmh(s, reg); break; case 0x212: /* movsldup */ + CHECK_AVX_V0(s); if (mod !=3D 3) { gen_lea_modrm(env, s, modrm); - gen_ldo_env_A0(s, offsetof(CPUX86State, xmm_regs[reg])); + if (s->vex_l) { + gen_ldy_env_A0(s, XMM_OFFSET(reg)); + } else { + gen_ldo_env_A0(s, XMM_OFFSET(reg)); + } } else { rm =3D (modrm & 7) | REX_B(s); gen_op_movl(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_L(0= )), - offsetof(CPUX86State,xmm_regs[rm].ZMM_L(0))); + offsetof(CPUX86State, xmm_regs[rm].ZMM_L(0))); gen_op_movl(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_L(2= )), - offsetof(CPUX86State,xmm_regs[rm].ZMM_L(2))); + offsetof(CPUX86State, xmm_regs[rm].ZMM_L(2))); + if (s->vex_l) { + gen_op_movl(s, + offsetof(CPUX86State, xmm_regs[reg].ZMM_L(= 4)), + offsetof(CPUX86State, xmm_regs[rm].ZMM_L(4= ))); + gen_op_movl(s, + offsetof(CPUX86State, xmm_regs[reg].ZMM_L(= 6)), + offsetof(CPUX86State, xmm_regs[rm].ZMM_L(6= ))); + } } gen_op_movl(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_L(1)), - offsetof(CPUX86State,xmm_regs[reg].ZMM_L(0))); + offsetof(CPUX86State, xmm_regs[reg].ZMM_L(0))); gen_op_movl(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_L(3)), - offsetof(CPUX86State,xmm_regs[reg].ZMM_L(2))); + offsetof(CPUX86State, xmm_regs[reg].ZMM_L(2))); + if (s->vex_l) { + gen_op_movl(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_L(5= )), + offsetof(CPUX86State, xmm_regs[reg].ZMM_L(4))); + gen_op_movl(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_L(7= )), + offsetof(CPUX86State, xmm_regs[reg].ZMM_L(6))); + } else { + gen_clear_ymmh(s, reg); + } break; case 0x312: /* movddup */ + CHECK_AVX_V0(s); if (mod !=3D 3) { gen_lea_modrm(env, s, modrm); gen_ldq_env_A0(s, 
offsetof(CPUX86State, xmm_regs[reg].ZMM_Q(0))); + if (s->vex_l) { + tcg_gen_addi_tl(s->A0, s->A0, 16); + gen_ldq_env_A0(s, offsetof(CPUX86State, + xmm_regs[reg].ZMM_Q(2))); + } } else { rm =3D (modrm & 7) | REX_B(s); gen_op_movq(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_Q(0= )), - offsetof(CPUX86State,xmm_regs[rm].ZMM_Q(0))); + offsetof(CPUX86State, xmm_regs[rm].ZMM_Q(0))); + if (s->vex_l) { + gen_op_movq(s, + offsetof(CPUX86State, xmm_regs[reg].ZMM_Q(= 2)), + offsetof(CPUX86State, xmm_regs[rm].ZMM_Q(2= ))); + } } gen_op_movq(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_Q(1)), - offsetof(CPUX86State,xmm_regs[reg].ZMM_Q(0))); + offsetof(CPUX86State, xmm_regs[reg].ZMM_Q(0))); + if (s->vex_l) { + gen_op_movq(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_Q(3= )), + offsetof(CPUX86State, xmm_regs[reg].ZMM_Q(2))); + } else { + gen_clear_ymmh(s, reg); + } break; case 0x016: /* movhps */ case 0x116: /* movhpd */ + CHECK_AVX_128(s); if (mod !=3D 3) { gen_lea_modrm(env, s, modrm); gen_ldq_env_A0(s, offsetof(CPUX86State, @@ -3365,27 +3865,54 @@ static void gen_sse(CPUX86State *env, DisasContext = *s, int b, /* movlhps */ rm =3D (modrm & 7) | REX_B(s); gen_op_movq(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_Q(1= )), - offsetof(CPUX86State,xmm_regs[rm].ZMM_Q(0))); + offsetof(CPUX86State, xmm_regs[rm].ZMM_Q(0))); + } + if (reg !=3D reg_v) { + gen_op_movq(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_Q(0= )), + offsetof(CPUX86State, xmm_regs[reg_v].ZMM_Q(0)= )); } + gen_clear_ymmh(s, reg); break; case 0x216: /* movshdup */ + CHECK_AVX_V0(s); if (mod !=3D 3) { gen_lea_modrm(env, s, modrm); - gen_ldo_env_A0(s, offsetof(CPUX86State, xmm_regs[reg])); + if (s->vex_l) { + gen_ldy_env_A0(s, XMM_OFFSET(reg)); + } else { + gen_ldo_env_A0(s, XMM_OFFSET(reg)); + } } else { rm =3D (modrm & 7) | REX_B(s); gen_op_movl(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_L(1= )), - offsetof(CPUX86State,xmm_regs[rm].ZMM_L(1))); + offsetof(CPUX86State, xmm_regs[rm].ZMM_L(1))); gen_op_movl(s, offsetof(CPUX86State, 
xmm_regs[reg].ZMM_L(3= )), - offsetof(CPUX86State,xmm_regs[rm].ZMM_L(3))); + offsetof(CPUX86State, xmm_regs[rm].ZMM_L(3))); + if (s->vex_l) { + gen_op_movl(s, + offsetof(CPUX86State, xmm_regs[reg].ZMM_L(= 5)), + offsetof(CPUX86State, xmm_regs[rm].ZMM_L(5= ))); + gen_op_movl(s, + offsetof(CPUX86State, xmm_regs[reg].ZMM_L(= 7)), + offsetof(CPUX86State, xmm_regs[rm].ZMM_L(7= ))); + } } gen_op_movl(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_L(0)), - offsetof(CPUX86State,xmm_regs[reg].ZMM_L(1))); + offsetof(CPUX86State, xmm_regs[reg].ZMM_L(1))); gen_op_movl(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_L(2)), - offsetof(CPUX86State,xmm_regs[reg].ZMM_L(3))); + offsetof(CPUX86State, xmm_regs[reg].ZMM_L(3))); + if (s->vex_l) { + gen_op_movl(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_L(4= )), + offsetof(CPUX86State, xmm_regs[reg].ZMM_L(5))); + gen_op_movl(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_L(6= )), + offsetof(CPUX86State, xmm_regs[reg].ZMM_L(7))); + } else { + gen_clear_ymmh(s, reg); + } break; case 0x178: case 0x378: + CHECK_NO_VEX(s); { int bit_index, field_length; =20 @@ -3393,8 +3920,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s= , int b, goto illegal_op; field_length =3D x86_ldub_code(env, s) & 0x3F; bit_index =3D x86_ldub_code(env, s) & 0x3F; - tcg_gen_addi_ptr(s->ptr0, cpu_env, - offsetof(CPUX86State,xmm_regs[reg])); + tcg_gen_addi_ptr(s->ptr0, cpu_env, XMM_OFFSET(reg)); if (b1 =3D=3D 1) gen_helper_extrq_i(cpu_env, s->ptr0, tcg_const_i32(bit_index), @@ -3406,6 +3932,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s= , int b, } break; case 0x7e: /* movd ea, mm */ + CHECK_NO_VEX(s); #ifdef TARGET_X86_64 if (s->dflag =3D=3D MO_64) { tcg_gen_ld_i64(s->T0, cpu_env, @@ -3420,20 +3947,22 @@ static void gen_sse(CPUX86State *env, DisasContext = *s, int b, } break; case 0x17e: /* movd ea, xmm */ + CHECK_AVX_V0_128(s); #ifdef TARGET_X86_64 if (s->dflag =3D=3D MO_64) { tcg_gen_ld_i64(s->T0, cpu_env, - offsetof(CPUX86State,xmm_regs[reg].ZMM_Q(0)= )); + 
offsetof(CPUX86State, xmm_regs[reg].ZMM_Q(0= ))); gen_ldst_modrm(env, s, modrm, MO_64, OR_TMP0, 1); } else #endif { tcg_gen_ld32u_tl(s->T0, cpu_env, - offsetof(CPUX86State,xmm_regs[reg].ZMM_L(= 0))); + offsetof(CPUX86State, xmm_regs[reg].ZMM_L= (0))); gen_ldst_modrm(env, s, modrm, MO_32, OR_TMP0, 1); } break; case 0x27e: /* movq xmm, ea */ + CHECK_AVX_V0_128(s); if (mod !=3D 3) { gen_lea_modrm(env, s, modrm); gen_ldq_env_A0(s, offsetof(CPUX86State, @@ -3441,11 +3970,13 @@ static void gen_sse(CPUX86State *env, DisasContext = *s, int b, } else { rm =3D (modrm & 7) | REX_B(s); gen_op_movq(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_Q(0= )), - offsetof(CPUX86State,xmm_regs[rm].ZMM_Q(0))); + offsetof(CPUX86State, xmm_regs[rm].ZMM_Q(0))); } gen_op_movq_env_0(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_Q= (1))); + gen_clear_ymmh(s, reg); break; case 0x7f: /* movq ea, mm */ + CHECK_NO_VEX(s); if (mod !=3D 3) { gen_lea_modrm(env, s, modrm); gen_stq_env_A0(s, offsetof(CPUX86State, fpregs[reg].mmx)); @@ -3461,40 +3992,64 @@ static void gen_sse(CPUX86State *env, DisasContext = *s, int b, case 0x129: /* movapd */ case 0x17f: /* movdqa ea, xmm */ case 0x27f: /* movdqu ea, xmm */ + CHECK_AVX_V0(s); if (mod !=3D 3) { gen_lea_modrm(env, s, modrm); - gen_sto_env_A0(s, offsetof(CPUX86State, xmm_regs[reg])); + if (s->vex_l) { + gen_sty_env_A0(s, XMM_OFFSET(reg)); + } else { + gen_sto_env_A0(s, XMM_OFFSET(reg)); + } } else { rm =3D (modrm & 7) | REX_B(s); - gen_op_movo(s, offsetof(CPUX86State, xmm_regs[rm]), - offsetof(CPUX86State,xmm_regs[reg])); + gen_op_movo(s, XMM_OFFSET(rm), XMM_OFFSET(reg)); + if (s->vex_l) { + gen_op_movo_ymmh(s, XMM_OFFSET(rm), XMM_OFFSET(reg)); + } else { + gen_clear_ymmh(s, rm); + } } break; case 0x211: /* movss ea, xmm */ if (mod !=3D 3) { + CHECK_AVX_V0_128(s); gen_lea_modrm(env, s, modrm); tcg_gen_ld32u_tl(s->T0, cpu_env, offsetof(CPUX86State, xmm_regs[reg].ZMM_L= (0))); gen_op_st_v(s, MO_32, s->T0, s->A0); } else { + CHECK_AVX_128(s); rm =3D (modrm & 7) | 
REX_B(s); + if (rm !=3D reg_v) { + gen_op_movo(s, XMM_OFFSET(rm), XMM_OFFSET(reg_v)); + } gen_op_movl(s, offsetof(CPUX86State, xmm_regs[rm].ZMM_L(0)= ), - offsetof(CPUX86State,xmm_regs[reg].ZMM_L(0))); + offsetof(CPUX86State, xmm_regs[reg].ZMM_L(0))); + gen_clear_ymmh(s, rm); } break; case 0x311: /* movsd ea, xmm */ if (mod !=3D 3) { + CHECK_AVX_V0_128(s); gen_lea_modrm(env, s, modrm); gen_stq_env_A0(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_Q(0))); } else { + CHECK_AVX_128(s); rm =3D (modrm & 7) | REX_B(s); + if (rm !=3D reg_v) { + gen_op_movq(s, + offsetof(CPUX86State, xmm_regs[rm].ZMM_Q(1)), + offsetof(CPUX86State, xmm_regs[reg_v].ZMM_Q(1)= )); + } gen_op_movq(s, offsetof(CPUX86State, xmm_regs[rm].ZMM_Q(0)= ), - offsetof(CPUX86State,xmm_regs[reg].ZMM_Q(0))); + offsetof(CPUX86State, xmm_regs[reg].ZMM_Q(0))); + gen_clear_ymmh(s, rm); } break; case 0x013: /* movlps */ case 0x113: /* movlpd */ + CHECK_AVX_V0_128(s); if (mod !=3D 3) { gen_lea_modrm(env, s, modrm); gen_stq_env_A0(s, offsetof(CPUX86State, @@ -3505,6 +4060,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s= , int b, break; case 0x017: /* movhps */ case 0x117: /* movhpd */ + CHECK_AVX_V0_128(s); if (mod !=3D 3) { gen_lea_modrm(env, s, modrm); gen_stq_env_A0(s, offsetof(CPUX86State, @@ -3521,65 +4077,91 @@ static void gen_sse(CPUX86State *env, DisasContext = *s, int b, case 0x173: val =3D x86_ldub_code(env, s); if (is_xmm) { + CHECK_AVX(s); + CHECK_AVX2_256(s); tcg_gen_movi_tl(s->T0, val); tcg_gen_st32_tl(s->T0, cpu_env, offsetof(CPUX86State, xmm_t0.ZMM_L(0))); tcg_gen_movi_tl(s->T0, 0); tcg_gen_st32_tl(s->T0, cpu_env, offsetof(CPUX86State, xmm_t0.ZMM_L(1))); - op1_offset =3D offsetof(CPUX86State,xmm_t0); + op1_offset =3D offsetof(CPUX86State, xmm_t0); } else { + CHECK_NO_VEX(s); tcg_gen_movi_tl(s->T0, val); tcg_gen_st32_tl(s->T0, cpu_env, offsetof(CPUX86State, mmx_t0.MMX_L(0))); tcg_gen_movi_tl(s->T0, 0); tcg_gen_st32_tl(s->T0, cpu_env, offsetof(CPUX86State, mmx_t0.MMX_L(1))); - op1_offset =3D 
offsetof(CPUX86State,mmx_t0); + op1_offset =3D offsetof(CPUX86State, mmx_t0); } assert(b1 < 2); - sse_fn_epp =3D sse_op_table2[((b - 1) & 3) * 8 + + if (s->vex_l) { + b1 =3D 2; + } + SSEFunc_0_eppp fn =3D sse_op_table2[((b - 1) & 3) * 8 + (((modrm >> 3)) & 7)][b1]; - if (!sse_fn_epp) { + if (!fn) { goto unknown_op; } if (is_xmm) { rm =3D (modrm & 7) | REX_B(s); - op2_offset =3D offsetof(CPUX86State,xmm_regs[rm]); + op2_offset =3D XMM_OFFSET(rm); + if (s->prefix & PREFIX_VEX) { + v_offset =3D XMM_OFFSET(reg_v); + } else { + v_offset =3D op2_offset; + } } else { rm =3D (modrm & 7); op2_offset =3D offsetof(CPUX86State,fpregs[rm].mmx); + v_offset =3D op2_offset; + } + tcg_gen_addi_ptr(s->ptr0, cpu_env, v_offset); + tcg_gen_addi_ptr(s->ptr1, cpu_env, op2_offset); + tcg_gen_addi_ptr(s->ptr2, cpu_env, op1_offset); + fn(cpu_env, s->ptr0, s->ptr1, s->ptr2); + if (!s->vex_l) { + gen_clear_ymmh(s, reg_v); } - tcg_gen_addi_ptr(s->ptr0, cpu_env, op2_offset); - tcg_gen_addi_ptr(s->ptr1, cpu_env, op1_offset); - sse_fn_epp(cpu_env, s->ptr0, s->ptr1); break; case 0x050: /* movmskps */ + CHECK_AVX_V0(s); rm =3D (modrm & 7) | REX_B(s); tcg_gen_addi_ptr(s->ptr0, cpu_env, - offsetof(CPUX86State,xmm_regs[rm])); - gen_helper_movmskps(s->tmp2_i32, cpu_env, s->ptr0); + offsetof(CPUX86State, xmm_regs[rm])); + if (s->vex_l) { + gen_helper_movmskps_ymm(s->tmp2_i32, cpu_env, s->ptr0); + } else { + gen_helper_movmskps_xmm(s->tmp2_i32, cpu_env, s->ptr0); + } tcg_gen_extu_i32_tl(cpu_regs[reg], s->tmp2_i32); break; case 0x150: /* movmskpd */ + CHECK_AVX_V0(s); rm =3D (modrm & 7) | REX_B(s); - tcg_gen_addi_ptr(s->ptr0, cpu_env, - offsetof(CPUX86State,xmm_regs[rm])); - gen_helper_movmskpd(s->tmp2_i32, cpu_env, s->ptr0); + tcg_gen_addi_ptr(s->ptr0, cpu_env, XMM_OFFSET(rm)); + if (s->vex_l) { + gen_helper_movmskpd_ymm(s->tmp2_i32, cpu_env, s->ptr0); + } else { + gen_helper_movmskpd_xmm(s->tmp2_i32, cpu_env, s->ptr0); + } tcg_gen_extu_i32_tl(cpu_regs[reg], s->tmp2_i32); break; case 0x02a: /* cvtpi2ps 
*/ case 0x12a: /* cvtpi2pd */ - gen_helper_enter_mmx(cpu_env); + CHECK_NO_VEX(s); if (mod !=3D 3) { gen_lea_modrm(env, s, modrm); op2_offset =3D offsetof(CPUX86State,mmx_t0); gen_ldq_env_A0(s, op2_offset); } else { + gen_helper_enter_mmx(cpu_env); rm =3D (modrm & 7); op2_offset =3D offsetof(CPUX86State,fpregs[rm].mmx); } - op1_offset =3D offsetof(CPUX86State,xmm_regs[reg]); + op1_offset =3D XMM_OFFSET(reg); tcg_gen_addi_ptr(s->ptr0, cpu_env, op1_offset); tcg_gen_addi_ptr(s->ptr1, cpu_env, op2_offset); switch(b >> 8) { @@ -3594,9 +4176,14 @@ static void gen_sse(CPUX86State *env, DisasContext *= s, int b, break; case 0x22a: /* cvtsi2ss */ case 0x32a: /* cvtsi2sd */ + CHECK_AVX(s); ot =3D mo_64_32(s->dflag); gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0); - op1_offset =3D offsetof(CPUX86State,xmm_regs[reg]); + op1_offset =3D XMM_OFFSET(reg); + v_offset =3D XMM_OFFSET(reg_v); + if (op1_offset !=3D v_offset) { + gen_op_movo(s, op1_offset, v_offset); + } tcg_gen_addi_ptr(s->ptr0, cpu_env, op1_offset); if (ot =3D=3D MO_32) { SSEFunc_0_epi sse_fn_epi =3D sse_op_table3ai[(b >> 8) & 1]; @@ -3610,19 +4197,21 @@ static void gen_sse(CPUX86State *env, DisasContext = *s, int b, goto illegal_op; #endif } + gen_clear_ymmh(s, reg); break; case 0x02c: /* cvttps2pi */ case 0x12c: /* cvttpd2pi */ case 0x02d: /* cvtps2pi */ case 0x12d: /* cvtpd2pi */ + CHECK_NO_VEX(s); gen_helper_enter_mmx(cpu_env); if (mod !=3D 3) { gen_lea_modrm(env, s, modrm); - op2_offset =3D offsetof(CPUX86State,xmm_t0); + op2_offset =3D offsetof(CPUX86State, xmm_t0); gen_ldo_env_A0(s, op2_offset); } else { rm =3D (modrm & 7) | REX_B(s); - op2_offset =3D offsetof(CPUX86State,xmm_regs[rm]); + op2_offset =3D XMM_OFFSET(rm); } op1_offset =3D offsetof(CPUX86State,fpregs[reg & 7].mmx); tcg_gen_addi_ptr(s->ptr0, cpu_env, op1_offset); @@ -3646,6 +4235,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s= , int b, case 0x32c: /* cvttsd2si */ case 0x22d: /* cvtss2si */ case 0x32d: /* cvtsd2si */ + CHECK_AVX_V0(s); ot 
=3D mo_64_32(s->dflag); if (mod !=3D 3) { gen_lea_modrm(env, s, modrm); @@ -3656,10 +4246,10 @@ static void gen_sse(CPUX86State *env, DisasContext = *s, int b, tcg_gen_st32_tl(s->T0, cpu_env, offsetof(CPUX86State, xmm_t0.ZMM_L(0))= ); } - op2_offset =3D offsetof(CPUX86State,xmm_t0); + op2_offset =3D offsetof(CPUX86State, xmm_t0); } else { rm =3D (modrm & 7) | REX_B(s); - op2_offset =3D offsetof(CPUX86State,xmm_regs[rm]); + op2_offset =3D XMM_OFFSET(rm); } tcg_gen_addi_ptr(s->ptr0, cpu_env, op2_offset); if (ot =3D=3D MO_32) { @@ -3680,21 +4270,28 @@ static void gen_sse(CPUX86State *env, DisasContext = *s, int b, break; case 0xc4: /* pinsrw */ case 0x1c4: + CHECK_AVX_128(s); s->rip_offset =3D 1; gen_ldst_modrm(env, s, modrm, MO_16, OR_TMP0, 0); val =3D x86_ldub_code(env, s); + if (reg !=3D reg_v) { + gen_op_movo(s, XMM_OFFSET(reg), XMM_OFFSET(reg_v)); + } if (b1) { val &=3D 7; tcg_gen_st16_tl(s->T0, cpu_env, - offsetof(CPUX86State,xmm_regs[reg].ZMM_W(v= al))); + offsetof(CPUX86State, xmm_regs[reg].ZMM_W(val))); } else { + CHECK_NO_VEX(s); val &=3D 3; tcg_gen_st16_tl(s->T0, cpu_env, - offsetof(CPUX86State,fpregs[reg].mmx.MMX_W= (val))); + offsetof(CPUX86State, fpregs[reg].mmx.MMX_W(val))); } + gen_clear_ymmh(s, reg); break; case 0xc5: /* pextrw */ case 0x1c5: + CHECK_AVX_V0_128(s); if (mod !=3D 3) goto illegal_op; ot =3D mo_64_32(s->dflag); @@ -3703,17 +4300,18 @@ static void gen_sse(CPUX86State *env, DisasContext = *s, int b, val &=3D 7; rm =3D (modrm & 7) | REX_B(s); tcg_gen_ld16u_tl(s->T0, cpu_env, - offsetof(CPUX86State,xmm_regs[rm].ZMM_W(v= al))); + offsetof(CPUX86State, xmm_regs[rm].ZMM_W(val))); } else { val &=3D 3; rm =3D (modrm & 7); tcg_gen_ld16u_tl(s->T0, cpu_env, - offsetof(CPUX86State,fpregs[rm].mmx.MMX_W(= val))); + offsetof(CPUX86State, fpregs[rm].mmx.MMX_W(val))); } reg =3D ((modrm >> 3) & 7) | REX_R(s); gen_op_mov_reg_v(s, ot, reg, s->T0); break; case 0x1d6: /* movq ea, xmm */ + CHECK_AVX_V0_128(s); if (mod !=3D 3) { gen_lea_modrm(env, s, modrm); 
gen_stq_env_A0(s, offsetof(CPUX86State, @@ -3721,12 +4319,13 @@ static void gen_sse(CPUX86State *env, DisasContext = *s, int b, } else { rm =3D (modrm & 7) | REX_B(s); gen_op_movq(s, offsetof(CPUX86State, xmm_regs[rm].ZMM_Q(0)= ), - offsetof(CPUX86State,xmm_regs[reg].ZMM_Q(0))); + offsetof(CPUX86State, xmm_regs[reg].ZMM_Q(0))); gen_op_movq_env_0(s, offsetof(CPUX86State, xmm_regs[rm].ZMM_Q= (1))); } break; case 0x2d6: /* movq2dq */ + CHECK_NO_VEX(s); gen_helper_enter_mmx(cpu_env); rm =3D (modrm & 7); gen_op_movq(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_Q(0)), @@ -3734,21 +4333,27 @@ static void gen_sse(CPUX86State *env, DisasContext = *s, int b, gen_op_movq_env_0(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_Q= (1))); break; case 0x3d6: /* movdq2q */ + CHECK_NO_VEX(s); gen_helper_enter_mmx(cpu_env); rm =3D (modrm & 7) | REX_B(s); gen_op_movq(s, offsetof(CPUX86State, fpregs[reg & 7].mmx), - offsetof(CPUX86State,xmm_regs[rm].ZMM_Q(0))); + offsetof(CPUX86State, xmm_regs[rm].ZMM_Q(0))); break; case 0xd7: /* pmovmskb */ case 0x1d7: if (mod !=3D 3) goto illegal_op; if (b1) { + CHECK_AVX_V0(s); rm =3D (modrm & 7) | REX_B(s); - tcg_gen_addi_ptr(s->ptr0, cpu_env, - offsetof(CPUX86State, xmm_regs[rm])); - gen_helper_pmovmskb_xmm(s->tmp2_i32, cpu_env, s->ptr0); + tcg_gen_addi_ptr(s->ptr0, cpu_env, XMM_OFFSET(rm)); + if (s->vex_l) { + gen_helper_pmovmskb_ymm(s->tmp2_i32, cpu_env, s->ptr0); + } else { + gen_helper_pmovmskb_xmm(s->tmp2_i32, cpu_env, s->ptr0); + } } else { + CHECK_NO_VEX(s); rm =3D (modrm & 7); tcg_gen_addi_ptr(s->ptr0, cpu_env, offsetof(CPUX86State, fpregs[rm].mmx)); @@ -3768,50 +4373,241 @@ static void gen_sse(CPUX86State *env, DisasContext= *s, int b, rm =3D modrm & 7; reg =3D ((modrm >> 3) & 7) | REX_R(s); mod =3D (modrm >> 6) & 3; + if (s->prefix & PREFIX_VEX) { + reg_v =3D s->vex_v; + } else { + reg_v =3D reg; + } =20 assert(b1 < 2); - sse_fn_epp =3D sse_op_table6[b].op[b1]; - if (!sse_fn_epp) { + op6 =3D sse_op_table6[b]; + if (op6.ext_mask =3D=3D 0) { goto 
unknown_op; } - if (!(s->cpuid_ext_features & sse_op_table6[b].ext_mask)) + if (!(s->cpuid_ext_features & op6.ext_mask)) { + goto illegal_op; + } + + if (op6.ext_mask =3D=3D CPUID_EXT_AVX + && (s->prefix & PREFIX_VEX) =3D=3D 0) { goto illegal_op; + } + if (op6.flags & SSE_OPF_AVX2) { + CHECK_AVX2(s); + } =20 if (b1) { - op1_offset =3D offsetof(CPUX86State,xmm_regs[reg]); + if (op6.flags & SSE_OPF_V0) { + CHECK_AVX_V0(s); + } else { + CHECK_AVX(s); + } + + op1_offset =3D XMM_OFFSET(reg); + + if ((b & 0xfc) =3D=3D 0x90) { /* vgather */ + int scale, index, base; + target_long disp =3D 0; + CHECK_AVX2(s); + if (mod =3D=3D 3 || rm !=3D 4) { + goto illegal_op; + } + + /* Vector SIB */ + val =3D x86_ldub_code(env, s); + scale =3D (val >> 6) & 3; + index =3D ((val >> 3) & 7) | REX_X(s); + base =3D (val & 7) | REX_B(s); + switch (mod) { + case 0: + if (base =3D=3D 5) { + base =3D -1; + disp =3D (int32_t)x86_ldl_code(env, s); + } + break; + case 1: + disp =3D (int8_t)x86_ldub_code(env, s); + break; + default: + case 2: + disp =3D (int32_t)x86_ldl_code(env, s); + break; + } + + /* destination, index and mask registers must not over= lap */ + if (reg =3D=3D index || reg =3D=3D reg_v) { + goto illegal_op; + } + + tcg_gen_addi_tl(s->A0, cpu_regs[base], disp); + gen_add_A0_ds_seg(s); + op2_offset =3D XMM_OFFSET(index); + v_offset =3D XMM_OFFSET(reg_v); + tcg_gen_addi_ptr(s->ptr0, cpu_env, op1_offset); + tcg_gen_addi_ptr(s->ptr1, cpu_env, op2_offset); + tcg_gen_addi_ptr(s->ptr2, cpu_env, v_offset); + b1 =3D REX_W(s) | ((b & 1) << 1) | (scale << 2); + sse_op_table10[b1][s->vex_l](cpu_env, + s->ptr0, s->ptr2, s->ptr1, s->A0); + if (!s->vex_l) { + gen_clear_ymmh(s, reg); + gen_clear_ymmh(s, reg_v); + } + return; + } + + if (op6.flags & SSE_OPF_MMX) { + CHECK_AVX2_256(s); + } + if (op6.flags & SSE_OPF_BLENDV) { + /* + * VEX encodings of the blendv opcodes are not valid + * they use a different opcode with an 0f 3a prefix + */ + CHECK_NO_VEX(s); + } + if (mod =3D=3D 3) { - op2_offset 
=3D offsetof(CPUX86State,xmm_regs[rm | REX_= B(s)]); + op2_offset =3D XMM_OFFSET(rm | REX_B(s)); } else { - op2_offset =3D offsetof(CPUX86State,xmm_t0); + int size; + op2_offset =3D offsetof(CPUX86State, xmm_t0); gen_lea_modrm(env, s, modrm); switch (b) { + case 0x78: /* vpbroadcastb */ + size =3D 8; + break; + case 0x79: /* vpbroadcastw */ + size =3D 16; + break; + case 0x18: /* vbroadcastss */ + case 0x58: /* vpbroadcastd */ + size =3D 32; + break; + case 0x19: /* vbroadcastsd */ + case 0x59: /* vpbroadcastq */ + size =3D 64; + break; + case 0x1a: /* vbroadcastf128 */ + case 0x5a: /* vbroadcasti128 */ + size =3D 128; + break; case 0x20: case 0x30: /* pmovsxbw, pmovzxbw */ case 0x23: case 0x33: /* pmovsxwd, pmovzxwd */ case 0x25: case 0x35: /* pmovsxdq, pmovzxdq */ - gen_ldq_env_A0(s, op2_offset + - offsetof(ZMMReg, ZMM_Q(0))); + size =3D 64; break; case 0x21: case 0x31: /* pmovsxbd, pmovzxbd */ case 0x24: case 0x34: /* pmovsxwq, pmovzxwq */ - tcg_gen_qemu_ld_i32(s->tmp2_i32, s->A0, - s->mem_index, MO_LEUL); - tcg_gen_st_i32(s->tmp2_i32, cpu_env, op2_offset + - offsetof(ZMMReg, ZMM_L(0))); + size =3D 32; break; case 0x22: case 0x32: /* pmovsxbq, pmovzxbq */ + size =3D 16; + break; + case 0x2a: /* movntdqa */ + if (s->vex_l) { + gen_ldy_env_A0(s, op1_offset); + } else { + gen_ldo_env_A0(s, op1_offset); + gen_clear_ymmh(s, reg); + } + return; + case 0x2e: /* maskmovps */ + b1 =3D 0; + goto vpmaskmov; + case 0x2f: /* maskmovpd */ + b1 =3D 1; + goto vpmaskmov; + case 0x8e: /* vpmaskmovd, vpmaskmovq */ + CHECK_AVX2(s); + b1 =3D REX_W(s); + vpmaskmov: + tcg_gen_addi_ptr(s->ptr0, cpu_env, op1_offset); + v_offset =3D XMM_OFFSET(reg_v); + tcg_gen_addi_ptr(s->ptr2, cpu_env, v_offset); + sse_op_table9[b1][s->vex_l](cpu_env, + s->ptr0, s->ptr2, s->A0); + return; + default: + size =3D 128; + } + if ((op6.flags & SSE_OPF_SCALAR) =3D=3D 0 && s->vex_l)= { + size *=3D 2; + } + switch (size) { + case 8: + tcg_gen_qemu_ld_tl(s->tmp0, s->A0, + s->mem_index, MO_UB); +
tcg_gen_st16_tl(s->tmp0, cpu_env, op2_offset + + offsetof(ZMMReg, ZMM_B(0))); + break; + case 16: tcg_gen_qemu_ld_tl(s->tmp0, s->A0, s->mem_index, MO_LEUW); tcg_gen_st16_tl(s->tmp0, cpu_env, op2_offset + offsetof(ZMMReg, ZMM_W(0))); break; - case 0x2a: /* movntqda */ - gen_ldo_env_A0(s, op1_offset); - return; - default: + case 32: + tcg_gen_qemu_ld_i32(s->tmp2_i32, s->A0, + s->mem_index, MO_LEUL); + tcg_gen_st_i32(s->tmp2_i32, cpu_env, op2_offset + + offsetof(ZMMReg, ZMM_L(0))); + break; + case 64: + gen_ldq_env_A0(s, op2_offset + + offsetof(ZMMReg, ZMM_Q(0))); + break; + case 128: gen_ldo_env_A0(s, op2_offset); + break; + case 256: + gen_ldy_env_A0(s, op2_offset); + break; + } + } + tcg_gen_addi_ptr(s->ptr0, cpu_env, op1_offset); + tcg_gen_addi_ptr(s->ptr1, cpu_env, op2_offset); + if (s->vex_l) { + b1 =3D 2; + } + if (!op6.fn[b1].op1) { + goto illegal_op; + } + if (op6.flags & SSE_OPF_V0) { + op6.fn[b1].op1(cpu_env, s->ptr0, s->ptr1); + } else { + v_offset =3D XMM_OFFSET(reg_v); + tcg_gen_addi_ptr(s->ptr2, cpu_env, v_offset); + if (op6.flags & SSE_OPF_BLENDV) { + TCGv_ptr mask =3D tcg_temp_new_ptr(); + tcg_gen_addi_ptr(mask, cpu_env, XMM_OFFSET(0)); + op6.fn[b1].op3(cpu_env, s->ptr0, s->ptr2, s->ptr1, + mask); + tcg_temp_free_ptr(mask); + } else { + SSEFunc_0_eppp fn =3D op6.fn[b1].op2; + if (REX_W(s)) { + if (b >=3D 0x45 && b <=3D 0x47) { + fn =3D sse_op_table8[b - 0x45][b1 - 1]; + } else if (b =3D=3D 0x8c) { + if (s->vex_l) { + fn =3D gen_helper_vpmaskmovq_ymm; + } else { + fn =3D gen_helper_vpmaskmovq_xmm; + } + } + } + fn(cpu_env, s->ptr0, s->ptr2, s->ptr1); } } + if ((op6.flags & SSE_OPF_CMP) =3D=3D 0 && s->vex_l =3D=3D = 0) { + gen_clear_ymmh(s, reg); + } } else { + CHECK_NO_VEX(s); + if ((op6.flags & SSE_OPF_MMX) =3D=3D 0) { + goto unknown_op; + } op1_offset =3D offsetof(CPUX86State,fpregs[reg].mmx); if (mod =3D=3D 3) { op2_offset =3D offsetof(CPUX86State,fpregs[rm].mmx); @@ -3820,16 +4616,16 @@ static void gen_sse(CPUX86State *env, DisasContext = *s, int 
b, gen_lea_modrm(env, s, modrm); gen_ldq_env_A0(s, op2_offset); } + tcg_gen_addi_ptr(s->ptr0, cpu_env, op1_offset); + tcg_gen_addi_ptr(s->ptr1, cpu_env, op2_offset); + if (op6.flags & SSE_OPF_V0) { + op6.fn[0].op1(cpu_env, s->ptr0, s->ptr1); + } else { + op6.fn[0].op2(cpu_env, s->ptr0, s->ptr0, s->ptr1); + } } - if (sse_fn_epp =3D=3D SSE_SPECIAL) { - goto unknown_op; - } - - tcg_gen_addi_ptr(s->ptr0, cpu_env, op1_offset); - tcg_gen_addi_ptr(s->ptr1, cpu_env, op2_offset); - sse_fn_epp(cpu_env, s->ptr0, s->ptr1); =20 - if (b =3D=3D 0x17) { + if (op6.flags & SSE_OPF_CMP) { set_cc_op(s, CC_OP_EFLAGS); } break; @@ -3846,6 +4642,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s= , int b, case 0x3f0: /* crc32 Gd,Eb */ case 0x3f1: /* crc32 Gd,Ey */ do_crc32: + CHECK_NO_VEX(s); if (!(s->cpuid_ext_features & CPUID_EXT_SSE42)) { goto illegal_op; } @@ -3877,6 +4674,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s= , int b, /* FALLTHRU */ case 0x0f0: /* movbe Gy,My */ case 0x0f1: /* movbe My,Gy */ + CHECK_NO_VEX(s); if (!(s->cpuid_ext_features & CPUID_EXT_MOVBE)) { goto illegal_op; } @@ -4043,6 +4841,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s= , int b, =20 case 0x1f6: /* adcx Gy, Ey */ case 0x2f6: /* adox Gy, Ey */ + CHECK_NO_VEX(s); if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_ADX)) { goto illegal_op; } else { @@ -4196,18 +4995,28 @@ static void gen_sse(CPUX86State *env, DisasContext = *s, int b, rm =3D modrm & 7; reg =3D ((modrm >> 3) & 7) | REX_R(s); mod =3D (modrm >> 6) & 3; + if (s->prefix & PREFIX_VEX) { + reg_v =3D s->vex_v; + } else { + reg_v =3D reg; + } =20 assert(b1 < 2); - sse_fn_eppi =3D sse_op_table7[b].op[b1]; - if (!sse_fn_eppi) { + op7 =3D sse_op_table7[b]; + if (op7.ext_mask =3D=3D 0) { goto unknown_op; } - if (!(s->cpuid_ext_features & sse_op_table7[b].ext_mask)) + if (!(s->cpuid_ext_features & op7.ext_mask)) { goto illegal_op; + } =20 s->rip_offset =3D 1; =20 - if (sse_fn_eppi =3D=3D SSE_SPECIAL) { + if (op7.flags & 
SSE_OPF_SPECIAL) { + /* None of the "special" ops are valid on mmx registers */ + if (b1 =3D=3D 0) { + goto illegal_op; + } ot =3D mo_64_32(s->dflag); rm =3D (modrm & 7) | REX_B(s); if (mod !=3D 3) @@ -4216,6 +5025,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s= , int b, val =3D x86_ldub_code(env, s); switch (b) { case 0x14: /* pextrb */ + CHECK_AVX_V0_128(s); tcg_gen_ld8u_tl(s->T0, cpu_env, offsetof(CPUX86State, xmm_regs[reg].ZMM_B(val & 15))= ); if (mod =3D=3D 3) { @@ -4226,6 +5036,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s= , int b, } break; case 0x15: /* pextrw */ + CHECK_AVX_V0_128(s); tcg_gen_ld16u_tl(s->T0, cpu_env, offsetof(CPUX86State, xmm_regs[reg].ZMM_W(val & 7))); if (mod =3D=3D 3) { @@ -4236,6 +5047,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s= , int b, } break; case 0x16: + CHECK_AVX_V0_128(s); if (ot =3D=3D MO_32) { /* pextrd */ tcg_gen_ld_i32(s->tmp2_i32, cpu_env, offsetof(CPUX86State, @@ -4263,6 +5075,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s= , int b, } break; case 0x17: /* extractps */ + CHECK_AVX_V0_128(s); tcg_gen_ld32u_tl(s->T0, cpu_env, offsetof(CPUX86State, xmm_regs[reg].ZMM_L(val & 3))); if (mod =3D=3D 3) { @@ -4273,6 +5086,10 @@ static void gen_sse(CPUX86State *env, DisasContext *= s, int b, } break; case 0x20: /* pinsrb */ + CHECK_AVX_128(s); + if (reg !=3D reg_v) { + gen_op_movo(s, XMM_OFFSET(reg), XMM_OFFSET(reg_v)); + } if (mod =3D=3D 3) { gen_op_mov_v_reg(s, MO_32, s->T0, rm); } else { @@ -4281,18 +5098,23 @@ static void gen_sse(CPUX86State *env, DisasContext = *s, int b, } tcg_gen_st8_tl(s->T0, cpu_env, offsetof(CPUX86State, xmm_regs[reg].ZMM_B(val & 15))= ); + gen_clear_ymmh(s, reg); break; case 0x21: /* insertps */ + CHECK_AVX_128(s); if (mod =3D=3D 3) { tcg_gen_ld_i32(s->tmp2_i32, cpu_env, - offsetof(CPUX86State,xmm_regs[rm] + offsetof(CPUX86State, xmm_regs[rm] .ZMM_L((val >> 6) & 3))); } else { tcg_gen_qemu_ld_i32(s->tmp2_i32, s->A0, s->mem_index, MO_LEUL); } + if (reg 
!=3D reg_v) { + gen_op_movo(s, XMM_OFFSET(reg), XMM_OFFSET(reg_v)); + } tcg_gen_st_i32(s->tmp2_i32, cpu_env, - offsetof(CPUX86State,xmm_regs[reg] + offsetof(CPUX86State, xmm_regs[reg] .ZMM_L((val >> 4) & 3))); if ((val >> 0) & 1) tcg_gen_st_i32(tcg_const_i32(0 /*float32_zero*/), @@ -4310,8 +5132,13 @@ static void gen_sse(CPUX86State *env, DisasContext *= s, int b, tcg_gen_st_i32(tcg_const_i32(0 /*float32_zero*/), cpu_env, offsetof(CPUX86State, xmm_regs[reg].ZMM_L(3))); + gen_clear_ymmh(s, reg); break; case 0x22: + CHECK_AVX_128(s); + if (reg !=3D reg_v) { + gen_op_movo(s, XMM_OFFSET(reg), XMM_OFFSET(reg_v)); + } if (ot =3D=3D MO_32) { /* pinsrd */ if (mod =3D=3D 3) { tcg_gen_trunc_tl_i32(s->tmp2_i32, cpu_regs[rm]= ); @@ -4337,21 +5164,91 @@ static void gen_sse(CPUX86State *env, DisasContext = *s, int b, goto illegal_op; #endif } + gen_clear_ymmh(s, reg); + break; + case 0x38: /* vinserti128 */ + CHECK_AVX2_256(s); + /* fall through */ + case 0x18: /* vinsertf128 */ + CHECK_AVX(s); + if ((s->prefix & PREFIX_VEX) =3D=3D 0 || s->vex_l =3D= =3D 0) { + goto illegal_op; + } + if (mod =3D=3D 3) { + if (val & 1) { + gen_op_movo_ymm_l2h(s, XMM_OFFSET(reg), + XMM_OFFSET(rm)); + } else { + gen_op_movo(s, XMM_OFFSET(reg), XMM_OFFSET(rm)= ); + } + } else { + if (val & 1) { + gen_ldo_env_A0_ymmh(s, XMM_OFFSET(reg)); + } else { + gen_ldo_env_A0(s, XMM_OFFSET(reg)); + } + } + if (reg !=3D reg_v) { + if (val & 1) { + gen_op_movo(s, XMM_OFFSET(reg), XMM_OFFSET(reg= _v)); + } else { + gen_op_movo_ymmh(s, XMM_OFFSET(reg), + XMM_OFFSET(reg_v)); + } + } + break; + case 0x39: /* vextracti128 */ + CHECK_AVX2_256(s); + /* fall through */ + case 0x19: /* vextractf128 */ + CHECK_AVX_V0(s); + if ((s->prefix & PREFIX_VEX) =3D=3D 0 || s->vex_l =3D= =3D 0) { + goto illegal_op; + } + if (mod =3D=3D 3) { + op1_offset =3D XMM_OFFSET(rm); + if (val & 1) { + gen_op_movo_ymm_h2l(s, XMM_OFFSET(rm), + XMM_OFFSET(reg)); + } else { + gen_op_movo(s, XMM_OFFSET(rm), XMM_OFFSET(reg)= ); + } + 
gen_clear_ymmh(s, rm); + } else{ + if (val & 1) { + gen_sto_env_A0_ymmh(s, XMM_OFFSET(reg)); + } else { + gen_sto_env_A0(s, XMM_OFFSET(reg)); + } + } break; + default: + goto unknown_op; } return; } =20 - if (b1) { - op1_offset =3D offsetof(CPUX86State,xmm_regs[reg]); - if (mod =3D=3D 3) { - op2_offset =3D offsetof(CPUX86State,xmm_regs[rm | REX_= B(s)]); - } else { - op2_offset =3D offsetof(CPUX86State,xmm_t0); - gen_lea_modrm(env, s, modrm); - gen_ldo_env_A0(s, op2_offset); + CHECK_AVX(s); + scalar_op =3D (s->prefix & PREFIX_VEX) + && (op7.flags & SSE_OPF_SCALAR) + && !(op7.flags & SSE_OPF_CMP); + if (is_xmm && (op7.flags & SSE_OPF_MMX)) { + CHECK_AVX2_256(s); + } + if (op7.flags & SSE_OPF_AVX2) { + CHECK_AVX2(s); + } + if ((op7.flags & SSE_OPF_V0) && !scalar_op) { + CHECK_AVX_V0(s); + } + + if (b1 =3D=3D 0) { + CHECK_NO_VEX(s); + /* MMX */ + if ((op7.flags & SSE_OPF_MMX) =3D=3D 0) { + goto illegal_op; } - } else { + op1_offset =3D offsetof(CPUX86State,fpregs[reg].mmx); if (mod =3D=3D 3) { op2_offset =3D offsetof(CPUX86State,fpregs[rm].mmx); @@ -4360,9 +5257,37 @@ static void gen_sse(CPUX86State *env, DisasContext *= s, int b, gen_lea_modrm(env, s, modrm); gen_ldq_env_A0(s, op2_offset); } + val =3D x86_ldub_code(env, s); + tcg_gen_addi_ptr(s->ptr0, cpu_env, op1_offset); + tcg_gen_addi_ptr(s->ptr1, cpu_env, op2_offset); + + /* We only actually have one MMX instruction (palignr) */ + assert(b =3D=3D 0x0f); + + op7.fn[0].op2(cpu_env, s->ptr0, s->ptr0, s->ptr1, + tcg_const_i32(val)); + break; + } + + /* SSE */ + if (op7.flags & SSE_OPF_BLENDV && !(s->prefix & PREFIX_VEX)) { + /* Only VEX encodings are valid for these blendv opcodes */ + goto illegal_op; + } + op1_offset =3D XMM_OFFSET(reg); + if (mod =3D=3D 3) { + op2_offset =3D XMM_OFFSET(rm | REX_B(s)); + } else { + op2_offset =3D offsetof(CPUX86State, xmm_t0); + gen_lea_modrm(env, s, modrm); + if (s->vex_l) { + gen_ldy_env_A0(s, op2_offset); + } else { + gen_ldo_env_A0(s, op2_offset); + } } - val =3D
x86_ldub_code(env, s); =20 + val =3D x86_ldub_code(env, s); if ((b & 0xfc) =3D=3D 0x60) { /* pcmpXstrX */ set_cc_op(s, CC_OP_EFLAGS); =20 @@ -4370,11 +5295,49 @@ static void gen_sse(CPUX86State *env, DisasContext = *s, int b, /* The helper must use entire 64-bit gp registers */ val |=3D 1 << 8; } + if ((b & 1) =3D=3D 0) /* pcmpXstrm */ + gen_clear_ymmh(s, 0); } =20 + if (s->vex_l) { + b1 =3D 2; + } + v_offset =3D XMM_OFFSET(reg_v); + /* + * Populate the top part of the destination register for VEX + * encoded scalar operations + */ + if (scalar_op && op1_offset !=3D v_offset) { + if (b =3D=3D 0x0a) { /* roundss */ + gen_op_movl(s, + offsetof(CPUX86State, xmm_regs[reg].ZMM_L(1)), + offsetof(CPUX86State, xmm_regs[reg_v].ZMM_L(1)= )); + } + gen_op_movq(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_Q(1= )), + offsetof(CPUX86State, xmm_regs[reg_v].ZMM_Q(1)= )); + } tcg_gen_addi_ptr(s->ptr0, cpu_env, op1_offset); tcg_gen_addi_ptr(s->ptr1, cpu_env, op2_offset); - sse_fn_eppi(cpu_env, s->ptr0, s->ptr1, tcg_const_i32(val)); + if (op7.flags & SSE_OPF_V0) { + op7.fn[b1].op1(cpu_env, s->ptr0, s->ptr1, tcg_const_i32(va= l)); + } else { + tcg_gen_addi_ptr(s->ptr2, cpu_env, v_offset); + if (op7.flags & SSE_OPF_BLENDV) { + TCGv_ptr mask =3D tcg_temp_new_ptr(); + tcg_gen_addi_ptr(mask, cpu_env, XMM_OFFSET(val >> 4)); + op7.fn[b1].op3(cpu_env, s->ptr0, s->ptr2, s->ptr1, mas= k); + tcg_temp_free_ptr(mask); + } else { + op7.fn[b1].op2(cpu_env, s->ptr0, s->ptr2, s->ptr1, + tcg_const_i32(val)); + } + } + if ((op7.flags & SSE_OPF_CMP) =3D=3D 0 && s->vex_l =3D=3D 0) { + gen_clear_ymmh(s, reg); + } + if (op7.flags & SSE_OPF_CMP) { + set_cc_op(s, CC_OP_EFLAGS); + } break; =20 case 0x33a: @@ -4424,34 +5387,49 @@ static void gen_sse(CPUX86State *env, DisasContext = *s, int b, default: break; } + if (s->vex_l) { + b1 +=3D 4; + } + if ((sse_op.flags & SSE_OPF_3DNOW) =3D=3D 0 && !sse_op.fn[b1].op1)= { + goto unknown_op; + } if (is_xmm) { - op1_offset =3D offsetof(CPUX86State,xmm_regs[reg]); +
scalar_op =3D (s->prefix & PREFIX_VEX) + && (sse_op.flags & SSE_OPF_SCALAR) + && !(sse_op.flags & SSE_OPF_CMP) + && (b1 =3D=3D 2 || b1 =3D=3D 3); + /* VEX encoded scalar ops always have 3 operands! */ + if ((sse_op.flags & SSE_OPF_V0) && !scalar_op) { + CHECK_AVX_V0(s); + } else { + CHECK_AVX(s); + } + if (sse_op.flags & SSE_OPF_MMX) { + CHECK_AVX2_256(s); + } + op1_offset =3D XMM_OFFSET(reg); if (mod !=3D 3) { - int sz =3D 4; + int sz =3D s->vex_l ? 5 : 4; =20 gen_lea_modrm(env, s, modrm); - op2_offset =3D offsetof(CPUX86State,xmm_t0); - - switch (b) { - case 0x50 ... 0x5a: - case 0x5c ... 0x5f: - case 0xc2: - /* Most sse scalar operations. */ - if (b1 =3D=3D 2) { - sz =3D 2; - } else if (b1 =3D=3D 3) { - sz =3D 3; - } - break; + op2_offset =3D offsetof(CPUX86State, xmm_t0); =20 - case 0x2e: /* ucomis[sd] */ - case 0x2f: /* comis[sd] */ - if (b1 =3D=3D 0) { - sz =3D 2; + if (sse_op.flags & SSE_OPF_SCALAR) { + if (sse_op.flags & SSE_OPF_CMP) { + /* ucomis[sd], comis[sd] */ + if (b1 =3D=3D 0) { + sz =3D 2; + } else { + sz =3D 3; + } } else { - sz =3D 3; + /* Most sse scalar operations. 
*/ + if (b1 =3D=3D 2) { + sz =3D 2; + } else if (b1 =3D=3D 3) { + sz =3D 3; + } } - break; } =20 switch (sz) { @@ -4459,22 +5437,29 @@ static void gen_sse(CPUX86State *env, DisasContext = *s, int b, /* 32 bit access */ gen_op_ld_v(s, MO_32, s->T0, s->A0); tcg_gen_st32_tl(s->T0, cpu_env, - offsetof(CPUX86State,xmm_t0.ZMM_L(0))); + offsetof(CPUX86State, xmm_t0.ZMM_L(0))= ); break; case 3: /* 64 bit access */ gen_ldq_env_A0(s, offsetof(CPUX86State, xmm_t0.ZMM_D(0= ))); break; - default: + case 4: /* 128 bit access */ gen_ldo_env_A0(s, op2_offset); break; + case 5: + /* 256 bit access */ + gen_ldy_env_A0(s, op2_offset); + break; } } else { rm =3D (modrm & 7) | REX_B(s); - op2_offset =3D offsetof(CPUX86State,xmm_regs[rm]); + op2_offset =3D XMM_OFFSET(rm); } + v_offset =3D XMM_OFFSET(reg_v); } else { + CHECK_NO_VEX(s); + scalar_op =3D 0; op1_offset =3D offsetof(CPUX86State,fpregs[reg].mmx); if (mod !=3D 3) { gen_lea_modrm(env, s, modrm); @@ -4484,60 +5469,100 @@ static void gen_sse(CPUX86State *env, DisasContext= *s, int b, rm =3D (modrm & 7); op2_offset =3D offsetof(CPUX86State,fpregs[rm].mmx); } - } - switch(b) { - case 0x0f: /* 3DNow! data insns */ - val =3D x86_ldub_code(env, s); - sse_fn_epp =3D sse_op_table5[val]; - if (!sse_fn_epp) { - goto unknown_op; + if (sse_op.flags & SSE_OPF_3DNOW) { + /* 3DNow! 
data insns */ + val =3D x86_ldub_code(env, s); + SSEFunc_0_epp sse_fn_epp =3D sse_op_table5[val]; + if (!sse_fn_epp) { + goto unknown_op; + } + tcg_gen_addi_ptr(s->ptr0, cpu_env, op1_offset); + tcg_gen_addi_ptr(s->ptr1, cpu_env, op2_offset); + sse_fn_epp(cpu_env, s->ptr0, s->ptr1); + return; } - if (!(s->cpuid_ext2_features & CPUID_EXT2_3DNOW)) { - goto illegal_op; + v_offset =3D op1_offset; + } + + /* + * Populate the top part of the destination register for VEX + * encoded scalar operations + */ + if (scalar_op && op1_offset !=3D v_offset) { + if (b =3D=3D 0x5a) { + /* + * Scalar conversions are tricky because the src and dest + * may be different sizes + */ + if (op1_offset =3D=3D op2_offset) { + /* + * The second source operand overlaps the + * destination, so we need to copy the value + */ + op2_offset =3D offsetof(CPUX86State, xmm_t0); + gen_op_movq(s, op2_offset, op1_offset); + } + gen_op_movo(s, op1_offset, v_offset); + } else { + if (b1 =3D=3D 2) { /* ss */ + gen_op_movl(s, + offsetof(CPUX86State, xmm_regs[reg].ZMM_L(1)), + offsetof(CPUX86State, xmm_regs[reg_v].ZMM_L(1)= )); + } + gen_op_movq(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_Q(1= )), + offsetof(CPUX86State, xmm_regs[reg_v].ZMM_Q(1)= )); } - tcg_gen_addi_ptr(s->ptr0, cpu_env, op1_offset); - tcg_gen_addi_ptr(s->ptr1, cpu_env, op2_offset); - sse_fn_epp(cpu_env, s->ptr0, s->ptr1); - break; - case 0x70: /* pshufx insn */ - case 0xc6: /* pshufx insn */ - val =3D x86_ldub_code(env, s); - tcg_gen_addi_ptr(s->ptr0, cpu_env, op1_offset); - tcg_gen_addi_ptr(s->ptr1, cpu_env, op2_offset); - /* XXX: introduce a new table?
*/ - sse_fn_ppi =3D (SSEFunc_0_ppi)sse_fn_epp; - sse_fn_ppi(s->ptr0, s->ptr1, tcg_const_i32(val)); - break; - case 0xc2: - /* compare insns, bits 7:3 (7:5 for AVX) are ignored */ - val =3D x86_ldub_code(env, s) & 7; - sse_fn_epp =3D sse_op_table4[val][b1]; + } =20 - tcg_gen_addi_ptr(s->ptr0, cpu_env, op1_offset); - tcg_gen_addi_ptr(s->ptr1, cpu_env, op2_offset); - sse_fn_epp(cpu_env, s->ptr0, s->ptr1); - break; - case 0xf7: - /* maskmov : we must prepare A0 */ - if (mod !=3D 3) - goto illegal_op; - tcg_gen_mov_tl(s->A0, cpu_regs[R_EDI]); - gen_extu(s->aflag, s->A0); - gen_add_A0_ds_seg(s); + tcg_gen_addi_ptr(s->ptr0, cpu_env, op1_offset); + tcg_gen_addi_ptr(s->ptr1, cpu_env, op2_offset); + if (sse_op.flags & SSE_OPF_V0) { + if (sse_op.flags & SSE_OPF_SHUF) { + val =3D x86_ldub_code(env, s); + sse_op.fn[b1].op1i(s->ptr0, s->ptr1, tcg_const_i32(val)); + } else if (b =3D=3D 0xf7) { + /* maskmov : we must prepare A0 */ + if (mod !=3D 3) { + goto illegal_op; + } + tcg_gen_mov_tl(s->A0, cpu_regs[R_EDI]); + gen_extu(s->aflag, s->A0); + gen_add_A0_ds_seg(s); + + tcg_gen_addi_ptr(s->ptr0, cpu_env, op1_offset); + tcg_gen_addi_ptr(s->ptr1, cpu_env, op2_offset); + sse_op.fn[b1].op1t(cpu_env, s->ptr0, s->ptr1, s->A0); + /* Does not write to the first operand */ + return; + } else { + sse_op.fn[b1].op1(cpu_env, s->ptr0, s->ptr1); + } + } else { + tcg_gen_addi_ptr(s->ptr2, cpu_env, v_offset); + if (sse_op.flags & SSE_OPF_SHUF) { + val =3D x86_ldub_code(env, s); + sse_op.fn[b1].op2i(s->ptr0, s->ptr2, s->ptr1, + tcg_const_i32(val)); + } else { + SSEFunc_0_eppp fn =3D sse_op.fn[b1].op2; + if (b =3D=3D 0xc2) { + /* compare insns */ + val =3D x86_ldub_code(env, s); + if (s->prefix & PREFIX_VEX) { + val &=3D 0x1f; + } else { + val &=3D 7; + } + fn =3D sse_op_table4[val][b1]; + } + fn(cpu_env, s->ptr0, s->ptr2, s->ptr1); + } + } =20 - tcg_gen_addi_ptr(s->ptr0, cpu_env, op1_offset); - tcg_gen_addi_ptr(s->ptr1, cpu_env, op2_offset); - /* XXX: introduce a new table?
*/ - sse_fn_eppt =3D (SSEFunc_0_eppt)sse_fn_epp; - sse_fn_eppt(cpu_env, s->ptr0, s->ptr1, s->A0); - break; - default: - tcg_gen_addi_ptr(s->ptr0, cpu_env, op1_offset); - tcg_gen_addi_ptr(s->ptr1, cpu_env, op2_offset); - sse_fn_epp(cpu_env, s->ptr0, s->ptr1); - break; + if (s->vex_l =3D=3D 0 && (sse_op.flags & SSE_OPF_CMP) =3D=3D 0) { + gen_clear_ymmh(s, reg); } - if (b =3D=3D 0x2e || b =3D=3D 0x2f) { + if (sse_op.flags & SSE_OPF_CMP) { set_cc_op(s, CC_OP_EFLAGS); } } @@ -8619,6 +9644,7 @@ static void i386_tr_init_disas_context(DisasContextBa= se *dcbase, CPUState *cpu) dc->tmp4 =3D tcg_temp_new(); dc->ptr0 =3D tcg_temp_new_ptr(); dc->ptr1 =3D tcg_temp_new_ptr(); + dc->ptr2 =3D tcg_temp_new_ptr(); dc->cc_srcT =3D tcg_temp_local_new(); } =20 --=20 2.35.2