From nobody Tue Feb 10 00:25:12 2026 Delivered-To: importer@patchew.org Authentication-Results: mx.zohomail.com; spf=pass (zohomail.com: domain of gnu.org designates 209.51.188.17 as permitted sender) smtp.mailfrom=qemu-devel-bounces+importer=patchew.org@nongnu.org Return-Path: Received: from lists.gnu.org (lists.gnu.org [209.51.188.17]) by mx.zohomail.com with SMTPS id 1650838877737440.2849707890525; Sun, 24 Apr 2022 15:21:17 -0700 (PDT) Received: from localhost ([::1]:54108 helo=lists1p.gnu.org) by lists.gnu.org with esmtp (Exim 4.90_1) (envelope-from ) id 1nikbU-0006jR-5y for importer@patchew.org; Sun, 24 Apr 2022 18:21:16 -0400 Received: from eggs.gnu.org ([2001:470:142:3::10]:50478) by lists.gnu.org with esmtps (TLS1.2:ECDHE_RSA_AES_256_GCM_SHA384:256) (Exim 4.90_1) (envelope-from ) id 1nikS6-00016T-3A for qemu-devel@nongnu.org; Sun, 24 Apr 2022 18:11:34 -0400 Received: from nowt.default.pbrook.uk0.bigv.io ([2001:41c8:51:832:fcff:ff:fe00:46dd]:58763) by eggs.gnu.org with esmtps (TLS1.2:ECDHE_RSA_AES_128_GCM_SHA256:128) (Exim 4.90_1) (envelope-from ) id 1nikS4-0002mv-78 for qemu-devel@nongnu.org; Sun, 24 Apr 2022 18:11:33 -0400 Received: from cpc91554-seac25-2-0-cust857.7-2.cable.virginm.net ([82.27.199.90] helo=wren.home) by nowt.default.pbrook.uk0.bigv.io with esmtpsa (TLS1.2:ECDHE_RSA_AES_128_GCM_SHA256:128) (Exim 4.84_2) (envelope-from ) id 1nikJA-0001ea-Of; Sun, 24 Apr 2022 23:02:20 +0100 From: Paul Brook To: Paolo Bonzini , Richard Henderson , Eduardo Habkost Subject: [PATCH v2 29/42] i386: Implement VBROADCAST Date: Sun, 24 Apr 2022 23:01:51 +0100 Message-Id: <20220424220204.2493824-30-paul@nowt.org> X-Mailer: git-send-email 2.36.0 In-Reply-To: <20220418173904.3746036-1-paul@nowt.org> References: <20220418173904.3746036-1-paul@nowt.org> MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Received-SPF: pass (zohomail.com: domain of gnu.org designates 209.51.188.17 as permitted sender) client-ip=209.51.188.17; 
envelope-from=qemu-devel-bounces+importer=patchew.org@nongnu.org; helo=lists.gnu.org; Received-SPF: pass client-ip=2001:41c8:51:832:fcff:ff:fe00:46dd; envelope-from=paul@nowt.org; helo=nowt.default.pbrook.uk0.bigv.io X-Spam_score_int: -18 X-Spam_score: -1.9 X-Spam_bar: - X-Spam_report: (-1.9 / 5.0 requ) BAYES_00=-1.9, SPF_HELO_NONE=0.001, SPF_PASS=-0.001, T_SCC_BODY_TEXT_LINE=-0.01 autolearn=ham autolearn_force=no X-Spam_action: no action X-BeenThere: qemu-devel@nongnu.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Cc: "open list:All patches CC here" , Paul Brook Errors-To: qemu-devel-bounces+importer=patchew.org@nongnu.org Sender: "Qemu-devel" X-ZM-MESSAGEID: 1650838879614100001 Content-Type: text/plain; charset="utf-8" The catch here is that these are whole vector operations (not independent 1= 28 bit lanes). We abuse the SSE_OPF_SCALAR flag to select the memory operand width appropriately. Signed-off-by: Paul Brook --- target/i386/ops_sse.h | 51 ++++++++++++++++++++++++++++++++++++ target/i386/ops_sse_header.h | 8 ++++++ target/i386/tcg/translate.c | 42 ++++++++++++++++++++++++++++- 3 files changed, 100 insertions(+), 1 deletion(-) diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h index a1f50f0c8b..4115c9a257 100644 --- a/target/i386/ops_sse.h +++ b/target/i386/ops_sse.h @@ -3071,7 +3071,57 @@ void glue(helper_aeskeygenassist, SUFFIX)(CPUX86Stat= e *env, Reg *d, Reg *s, #endif #endif =20 +#if SHIFT >=3D 1 +void glue(helper_vbroadcastb, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +{ + uint8_t val =3D s->B(0); + int i; + + for (i =3D 0; i < 16 * SHIFT; i++) { + d->B(i) =3D val; + } +} + +void glue(helper_vbroadcastw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +{ + uint16_t val =3D s->W(0); + int i; + + for (i =3D 0; i < 8 * SHIFT; i++) { + d->W(i) =3D val; + } +} + +void glue(helper_vbroadcastl, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +{ + uint32_t val =3D s->L(0); + int i; 
+ + for (i =3D 0; i < 4 * SHIFT; i++) { + d->L(i) =3D val; + } +} + +void glue(helper_vbroadcastq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +{ + uint64_t val =3D s->Q(0); + d->Q(0) =3D val; + d->Q(1) =3D val; #if SHIFT =3D=3D 2 + d->Q(2) =3D val; + d->Q(3) =3D val; +#endif +} + +#if SHIFT =3D=3D 2 +void glue(helper_vbroadcastdq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +{ + d->Q(0) =3D s->Q(0); + d->Q(1) =3D s->Q(1); + d->Q(2) =3D s->Q(0); + d->Q(3) =3D s->Q(1); +} + void helper_vzeroall(CPUX86State *env) { int i; @@ -3118,6 +3168,7 @@ void helper_vzeroupper_hi8(CPUX86State *env) } #endif #endif +#endif =20 #undef SSE_HELPER_S =20 diff --git a/target/i386/ops_sse_header.h b/target/i386/ops_sse_header.h index 48f0945917..51e02cd4fa 100644 --- a/target/i386/ops_sse_header.h +++ b/target/i386/ops_sse_header.h @@ -411,7 +411,14 @@ DEF_HELPER_4(glue(aeskeygenassist, SUFFIX), void, env,= Reg, Reg, i32) DEF_HELPER_5(glue(pclmulqdq, SUFFIX), void, env, Reg, Reg, Reg, i32) #endif =20 +/* AVX helpers */ +#if SHIFT >=3D 1 +DEF_HELPER_3(glue(vbroadcastb, SUFFIX), void, env, Reg, Reg) +DEF_HELPER_3(glue(vbroadcastw, SUFFIX), void, env, Reg, Reg) +DEF_HELPER_3(glue(vbroadcastl, SUFFIX), void, env, Reg, Reg) +DEF_HELPER_3(glue(vbroadcastq, SUFFIX), void, env, Reg, Reg) #if SHIFT =3D=3D 2 +DEF_HELPER_3(glue(vbroadcastdq, SUFFIX), void, env, Reg, Reg) DEF_HELPER_1(vzeroall, void, env) DEF_HELPER_1(vzeroupper, void, env) #ifdef TARGET_X86_64 @@ -419,6 +426,7 @@ DEF_HELPER_1(vzeroall_hi8, void, env) DEF_HELPER_1(vzeroupper_hi8, void, env) #endif #endif +#endif =20 #undef SHIFT #undef Reg diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c index ba70aeb039..59ab1dc562 100644 --- a/target/i386/tcg/translate.c +++ b/target/i386/tcg/translate.c @@ -3255,6 +3255,11 @@ static const struct SSEOpHelper_table6 sse_op_table6= [256] =3D { [0x14] =3D BLENDV_OP(blendvps, SSE41, 0), [0x15] =3D BLENDV_OP(blendvpd, SSE41, 0), [0x17] =3D CMP_OP(ptest, SSE41), + /* TODO:Some 
vbroadcast variants require AVX2 */ + [0x18] =3D UNARY_OP(vbroadcastl, AVX, SSE_OPF_SCALAR), /* vbroadcastss= */ + [0x19] =3D UNARY_OP(vbroadcastq, AVX, SSE_OPF_SCALAR), /* vbroadcastsd= */ +#define gen_helper_vbroadcastdq_xmm NULL + [0x1a] =3D UNARY_OP(vbroadcastdq, AVX, SSE_OPF_SCALAR), /* vbroadcastf= 128 */ [0x1c] =3D UNARY_OP_MMX(pabsb, SSSE3), [0x1d] =3D UNARY_OP_MMX(pabsw, SSSE3), [0x1e] =3D UNARY_OP_MMX(pabsd, SSSE3), @@ -3286,6 +3291,16 @@ static const struct SSEOpHelper_table6 sse_op_table6= [256] =3D { [0x40] =3D BINARY_OP(pmulld, SSE41, SSE_OPF_MMX), #define gen_helper_phminposuw_ymm NULL [0x41] =3D UNARY_OP(phminposuw, SSE41, 0), + /* vpbroadcastd */ + [0x58] =3D UNARY_OP(vbroadcastl, AVX, SSE_OPF_SCALAR | SSE_OPF_MMX), + /* vpbroadcastq */ + [0x59] =3D UNARY_OP(vbroadcastq, AVX, SSE_OPF_SCALAR | SSE_OPF_MMX), + /* vbroadcasti128 */ + [0x5a] =3D UNARY_OP(vbroadcastdq, AVX, SSE_OPF_SCALAR | SSE_OPF_MMX), + /* vpbroadcastb */ + [0x78] =3D UNARY_OP(vbroadcastb, AVX, SSE_OPF_SCALAR | SSE_OPF_MMX), + /* vpbroadcastw */ + [0x79] =3D UNARY_OP(vbroadcastw, AVX, SSE_OPF_SCALAR | SSE_OPF_MMX), #define gen_helper_aesimc_ymm NULL [0xdb] =3D UNARY_OP(aesimc, AES, 0), [0xdc] =3D BINARY_OP(aesenc, AES, 0), @@ -4323,6 +4338,24 @@ static void gen_sse(CPUX86State *env, DisasContext *= s, int b, op2_offset =3D offsetof(CPUX86State, xmm_t0); gen_lea_modrm(env, s, modrm); switch (b) { + case 0x78: /* vpbroadcastb */ + size =3D 8; + break; + case 0x79: /* vpbroadcastw */ + size =3D 16; + break; + case 0x18: /* vbroadcastss */ + case 0x58: /* vpbroadcastd */ + size =3D 32; + break; + case 0x19: /* vbroadcastsd */ + case 0x59: /* vpbroadcastq */ + size =3D 64; + break; + case 0x1a: /* vbroadcastf128 */ + case 0x5a: /* vbroadcasti128 */ + size =3D 128; + break; case 0x20: case 0x30: /* pmovsxbw, pmovzxbw */ case 0x23: case 0x33: /* pmovsxwd, pmovzxwd */ case 0x25: case 0x35: /* pmovsxdq, pmovzxdq */ @@ -4346,10 +4379,17 @@ static void gen_sse(CPUX86State *env, DisasContext = 
*s, int b, default: size =3D 128; } - if (s->vex_l) { + /* 256 bit vbroadcast only load a single element. */ + if ((op6.flags & SSE_OPF_SCALAR) =3D=3D 0 && s->vex_l)= { size *=3D 2; } switch (size) { + case 8: + tcg_gen_qemu_ld_tl(s->tmp0, s->A0, + s->mem_index, MO_UB); + tcg_gen_st8_tl(s->tmp0, cpu_env, op2_offset + + offsetof(ZMMReg, ZMM_B(0))); + break; case 16: tcg_gen_qemu_ld_tl(s->tmp0, s->A0, s->mem_index, MO_LEUW); --=20 2.36.0