Use the gvec infrastructure to achieve the desired functionality.
Note: This commit adds several bits which will not be part of the
final patch series and which are only present to allow for an
incremental write-and-test development cycle. Notably, the
SSE_TOMBSTONE define will go away entirely along with all of the
tables, and nothing will follow
the new dispatch switch in gen_sse.
Signed-off-by: Jan Bobek <jan.bobek@gmail.com>
---
target/i386/ops_sse.h | 2 --
target/i386/ops_sse_header.h | 1 -
target/i386/translate.c | 49 ++++++++++++++++++++++++++++++++++--
3 files changed, 47 insertions(+), 5 deletions(-)
diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h
index ed05989768..b3ba23287d 100644
--- a/target/i386/ops_sse.h
+++ b/target/i386/ops_sse.h
@@ -353,7 +353,6 @@ static inline int satsw(int x)
#define FMAXUB(a, b) ((a) > (b)) ? (a) : (b)
#define FMAXSW(a, b) ((int16_t)(a) > (int16_t)(b)) ? (a) : (b)
-#define FAND(a, b) ((a) & (b))
#define FANDN(a, b) ((~(a)) & (b))
#define FOR(a, b) ((a) | (b))
#define FXOR(a, b) ((a) ^ (b))
@@ -397,7 +396,6 @@ SSE_HELPER_B(helper_pmaxub, FMAXUB)
SSE_HELPER_W(helper_pminsw, FMINSW)
SSE_HELPER_W(helper_pmaxsw, FMAXSW)
-SSE_HELPER_Q(helper_pand, FAND)
SSE_HELPER_Q(helper_pandn, FANDN)
SSE_HELPER_Q(helper_por, FOR)
SSE_HELPER_Q(helper_pxor, FXOR)
diff --git a/target/i386/ops_sse_header.h b/target/i386/ops_sse_header.h
index 094aafc573..63b4376389 100644
--- a/target/i386/ops_sse_header.h
+++ b/target/i386/ops_sse_header.h
@@ -86,7 +86,6 @@ SSE_HELPER_B(pmaxub, FMAXUB)
SSE_HELPER_W(pminsw, FMINSW)
SSE_HELPER_W(pmaxsw, FMAXSW)
-SSE_HELPER_Q(pand, FAND)
SSE_HELPER_Q(pandn, FANDN)
SSE_HELPER_Q(por, FOR)
SSE_HELPER_Q(pxor, FXOR)
diff --git a/target/i386/translate.c b/target/i386/translate.c
index d576b3345c..3821733a4e 100644
--- a/target/i386/translate.c
+++ b/target/i386/translate.c
@@ -23,6 +23,7 @@
#include "disas/disas.h"
#include "exec/exec-all.h"
#include "tcg-op.h"
+#include "tcg-op-gvec.h"
#include "exec/cpu_ldst.h"
#include "exec/translator.h"
@@ -2723,6 +2724,7 @@ typedef void (*SSEFunc_0_eppt)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b,
#define SSE_SPECIAL ((void *)1)
#define SSE_DUMMY ((void *)2)
+#define SSE_TOMBSTONE ((void *)3)
#define MMX_OP2(x) { gen_helper_ ## x ## _mmx, gen_helper_ ## x ## _xmm }
#define SSE_FOP(x) { gen_helper_ ## x ## ps, gen_helper_ ## x ## pd, \
@@ -2754,7 +2756,7 @@ static const SSEFunc_0_epp sse_op_table1[256][4] = {
[0x51] = SSE_FOP(sqrt),
[0x52] = { gen_helper_rsqrtps, NULL, gen_helper_rsqrtss, NULL },
[0x53] = { gen_helper_rcpps, NULL, gen_helper_rcpss, NULL },
- [0x54] = { gen_helper_pand_xmm, gen_helper_pand_xmm }, /* andps, andpd */
+ [0x54] = { SSE_TOMBSTONE, SSE_TOMBSTONE }, /* andps, andpd */
[0x55] = { gen_helper_pandn_xmm, gen_helper_pandn_xmm }, /* andnps, andnpd */
[0x56] = { gen_helper_por_xmm, gen_helper_por_xmm }, /* orps, orpd */
[0x57] = { gen_helper_pxor_xmm, gen_helper_pxor_xmm }, /* xorps, xorpd */
@@ -2823,7 +2825,7 @@ static const SSEFunc_0_epp sse_op_table1[256][4] = {
[0xd8] = MMX_OP2(psubusb),
[0xd9] = MMX_OP2(psubusw),
[0xda] = MMX_OP2(pminub),
- [0xdb] = MMX_OP2(pand),
+ [0xdb] = { SSE_TOMBSTONE, SSE_TOMBSTONE },
[0xdc] = MMX_OP2(paddusb),
[0xdd] = MMX_OP2(paddusw),
[0xde] = MMX_OP2(pmaxub),
@@ -3164,6 +3166,17 @@ static inline void gen_gvec_ld_modrm_3(CPUX86State *env, DisasContext *s,
gen_ld_modrm_VxHxWx, \
gen_gvec_2_fp, (opctl))
+#define gen_pand_mm(env, s, modrm) gen_gvec_ld_modrm_mm ((env), (s), (modrm), MO_64, tcg_gen_gvec_and, 0112)
+#define gen_pand_xmm(env, s, modrm) gen_gvec_ld_modrm_xmm ((env), (s), (modrm), MO_64, tcg_gen_gvec_and, 0112)
+#define gen_vpand_xmm(env, s, modrm) gen_gvec_ld_modrm_vxmm((env), (s), (modrm), MO_64, tcg_gen_gvec_and, 0123)
+#define gen_vpand_ymm(env, s, modrm) gen_gvec_ld_modrm_vymm((env), (s), (modrm), MO_64, tcg_gen_gvec_and, 0123)
+#define gen_andps_xmm gen_pand_xmm
+#define gen_vandps_xmm gen_vpand_xmm
+#define gen_vandps_ymm gen_vpand_ymm
+#define gen_andpd_xmm gen_pand_xmm
+#define gen_vandpd_xmm gen_vpand_xmm
+#define gen_vandpd_ymm gen_vpand_ymm
+
static void gen_sse(CPUX86State *env, DisasContext *s, int b)
{
int b1, op1_offset, op2_offset, is_xmm, val;
@@ -3238,6 +3251,38 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b)
reg |= REX_R(s);
}
mod = (modrm >> 6) & 3;
+
+ enum {
+ M_0F = 0x01 << 8,
+ M_0F38 = 0x02 << 8,
+ M_0F3A = 0x04 << 8,
+ P_66 = 0x08 << 8,
+ P_F3 = 0x10 << 8,
+ P_F2 = 0x20 << 8,
+ VEX_128 = 0x40 << 8,
+ VEX_256 = 0x80 << 8,
+ };
+
+ switch(b | M_0F
+ | (s->prefix & PREFIX_DATA ? P_66 : 0)
+ | (s->prefix & PREFIX_REPZ ? P_F3 : 0)
+ | (s->prefix & PREFIX_REPNZ ? P_F2 : 0)
+ | (s->prefix & PREFIX_VEX ? (s->vex_l ? VEX_256 : VEX_128) : 0)) {
+ case 0xdb | M_0F: gen_pand_mm(env, s, modrm); return;
+ case 0xdb | M_0F | P_66: gen_pand_xmm(env, s, modrm); return;
+ case 0xdb | M_0F | P_66 | VEX_128: gen_vpand_xmm(env, s, modrm); return;
+ case 0xdb | M_0F | P_66 | VEX_256: gen_vpand_ymm(env, s, modrm); return;
+ case 0x54 | M_0F: gen_andps_xmm(env, s, modrm); return;
+ case 0x54 | M_0F | VEX_128: gen_vandps_xmm(env, s, modrm); return;
+ case 0x54 | M_0F | VEX_256: gen_vandps_ymm(env, s, modrm); return;
+ case 0x54 | M_0F | P_66: gen_andpd_xmm(env, s, modrm); return;
+ case 0x54 | M_0F | P_66 | VEX_128: gen_vandpd_xmm(env, s, modrm); return;
+ case 0x54 | M_0F | P_66 | VEX_256: gen_vandpd_ymm(env, s, modrm); return;
+ default: break;
+ }
+
+ assert(sse_fn_epp != SSE_TOMBSTONE);
+
if (sse_fn_epp == SSE_SPECIAL) {
b |= (b1 << 8);
switch(b) {
--
2.20.1
On 7/31/19 10:56 AM, Jan Bobek wrote:
> +#define gen_pand_mm(env, s, modrm) gen_gvec_ld_modrm_mm ((env), (s), (modrm), MO_64, tcg_gen_gvec_and, 0112)
> +#define gen_pand_xmm(env, s, modrm) gen_gvec_ld_modrm_xmm ((env), (s), (modrm), MO_64, tcg_gen_gvec_and, 0112)
> +#define gen_vpand_xmm(env, s, modrm) gen_gvec_ld_modrm_vxmm((env), (s), (modrm), MO_64, tcg_gen_gvec_and, 0123)
> +#define gen_vpand_ymm(env, s, modrm) gen_gvec_ld_modrm_vymm((env), (s), (modrm), MO_64, tcg_gen_gvec_and, 0123)
> +#define gen_andps_xmm gen_pand_xmm
> +#define gen_vandps_xmm gen_vpand_xmm
> +#define gen_vandps_ymm gen_vpand_ymm
> +#define gen_andpd_xmm gen_pand_xmm
> +#define gen_vandpd_xmm gen_vpand_xmm
> +#define gen_vandpd_ymm gen_vpand_ymm
Why all of these extra defines?
> + enum {
> + M_0F = 0x01 << 8,
> + M_0F38 = 0x02 << 8,
> + M_0F3A = 0x04 << 8,
> + P_66 = 0x08 << 8,
> + P_F3 = 0x10 << 8,
> + P_F2 = 0x20 << 8,
> + VEX_128 = 0x40 << 8,
> + VEX_256 = 0x80 << 8,
> + };
> +
> + switch(b | M_0F
> + | (s->prefix & PREFIX_DATA ? P_66 : 0)
> + | (s->prefix & PREFIX_REPZ ? P_F3 : 0)
> + | (s->prefix & PREFIX_REPNZ ? P_F2 : 0)
> + | (s->prefix & PREFIX_VEX ? (s->vex_l ? VEX_256 : VEX_128) : 0)) {
I think you can move this above almost everything in this function, so that all
of the legacy bits follow this switch.
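
I.e. something like this (rough sketch, untested):

    static void gen_sse(CPUX86State *env, DisasContext *s, int b)
    {
        /* Compute the dispatch key from the opcode byte and the
           already-decoded prefixes, before any legacy table lookup. */
        switch (b | M_0F
                | (s->prefix & PREFIX_DATA ? P_66 : 0)
                | (s->prefix & PREFIX_REPZ ? P_F3 : 0)
                | (s->prefix & PREFIX_REPNZ ? P_F2 : 0)
                | (s->prefix & PREFIX_VEX
                   ? (s->vex_l ? VEX_256 : VEX_128) : 0)) {
        case ...: /* new-style gvec decoders return directly */
            return;
        default:
            break;
        }

        /* All of the legacy bits -- sse_op_table1 lookup, SSE_SPECIAL
           handling, etc. -- follow the switch. */
        ...
    }

That way none of the old decode state needs to be computed at all for
the new-style cases.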
> + case 0xdb | M_0F: gen_pand_mm(env, s, modrm); return;
You'll want to put these on the next lines -- checkpatch.pl again.
> + case 0xdb | M_0F | P_66: gen_pand_xmm(env, s, modrm); return;
> + case 0xdb | M_0F | P_66 | VEX_128: gen_vpand_xmm(env, s, modrm); return;
> + case 0xdb | M_0F | P_66 | VEX_256: gen_vpand_ymm(env, s, modrm); return;
> + case 0x54 | M_0F: gen_andps_xmm(env, s, modrm); return;
> + case 0x54 | M_0F | VEX_128: gen_vandps_xmm(env, s, modrm); return;
> + case 0x54 | M_0F | VEX_256: gen_vandps_ymm(env, s, modrm); return;
> + case 0x54 | M_0F | P_66: gen_andpd_xmm(env, s, modrm); return;
> + case 0x54 | M_0F | P_66 | VEX_128: gen_vandpd_xmm(env, s, modrm); return;
> + case 0x54 | M_0F | P_66 | VEX_256: gen_vandpd_ymm(env, s, modrm); return;
> + default: break;
> + }
Perhaps group cases together?
case 0xdb | M_0F | P_66: /* PAND */
case 0x54 | M_0F: /* ANDPS */
case 0x54 | M_0F | P_66: /* ANDPD */
gen_gvec_ld_modrm_xmm(env, s, modrm, MO_64, tcg_gen_gvec_and, 0112);
return;
How are you planning to handle CPUID checks? I know the current handling is
quite spotty, but with a reorg we might as well fix that too.
r~
On Wed, Jul 31, 2019 at 9:36 PM Richard Henderson <richard.henderson@linaro.org> wrote:
> On 7/31/19 10:56 AM, Jan Bobek wrote:
> > +#define gen_pand_mm(env, s, modrm) gen_gvec_ld_modrm_mm ((env), (s), (modrm), MO_64, tcg_gen_gvec_and, 0112)
> > +#define gen_pand_xmm(env, s, modrm) gen_gvec_ld_modrm_xmm ((env), (s), (modrm), MO_64, tcg_gen_gvec_and, 0112)
> > +#define gen_vpand_xmm(env, s, modrm) gen_gvec_ld_modrm_vxmm((env), (s), (modrm), MO_64, tcg_gen_gvec_and, 0123)
> > +#define gen_vpand_ymm(env, s, modrm) gen_gvec_ld_modrm_vymm((env), (s), (modrm), MO_64, tcg_gen_gvec_and, 0123)
> > +#define gen_andps_xmm gen_pand_xmm
> > +#define gen_vandps_xmm gen_vpand_xmm
> > +#define gen_vandps_ymm gen_vpand_ymm
> > +#define gen_andpd_xmm gen_pand_xmm
> > +#define gen_vandpd_xmm gen_vpand_xmm
> > +#define gen_vandpd_ymm gen_vpand_ymm
>
>
> Why all of these extra defines?
>
Because of code clarity and safety, I would say.
This line:
case 0x54 | M_0F: gen_andps_xmm(env, s, modrm); return;
looks much clearer than this one:
case 0x54 | M_0F: gen_gvec_ld_modrm_xmm(env, s, modrm, MO_64, tcg_gen_gvec_and, 0112); return;
and such organization is also much less prone to copy/paste bugs etc.
Needless to say, there is no performance cost at all, so it is a no-brainer.
Sincerely,
Aleksandar
>
> > + enum {
> > + M_0F = 0x01 << 8,
> > + M_0F38 = 0x02 << 8,
> > + M_0F3A = 0x04 << 8,
> > + P_66 = 0x08 << 8,
> > + P_F3 = 0x10 << 8,
> > + P_F2 = 0x20 << 8,
> > + VEX_128 = 0x40 << 8,
> > + VEX_256 = 0x80 << 8,
> > + };
> > +
> > + switch(b | M_0F
> > + | (s->prefix & PREFIX_DATA ? P_66 : 0)
> > + | (s->prefix & PREFIX_REPZ ? P_F3 : 0)
> > + | (s->prefix & PREFIX_REPNZ ? P_F2 : 0)
> > + | (s->prefix & PREFIX_VEX ? (s->vex_l ? VEX_256 : VEX_128) : 0)) {
>
> I think you can move this above almost everything in this function, so
> that all of the legacy bits follow this switch.
>
> > + case 0xdb | M_0F: gen_pand_mm(env, s, modrm); return;
>
> You'll want to put these on the next lines -- checkpatch.pl again.
>
> > + case 0xdb | M_0F | P_66: gen_pand_xmm(env, s, modrm); return;
> > + case 0xdb | M_0F | P_66 | VEX_128: gen_vpand_xmm(env, s, modrm); return;
> > + case 0xdb | M_0F | P_66 | VEX_256: gen_vpand_ymm(env, s, modrm); return;
> > + case 0x54 | M_0F: gen_andps_xmm(env, s, modrm); return;
> > + case 0x54 | M_0F | VEX_128: gen_vandps_xmm(env, s, modrm); return;
> > + case 0x54 | M_0F | VEX_256: gen_vandps_ymm(env, s, modrm); return;
> > + case 0x54 | M_0F | P_66: gen_andpd_xmm(env, s, modrm); return;
> > + case 0x54 | M_0F | P_66 | VEX_128: gen_vandpd_xmm(env, s, modrm); return;
> > + case 0x54 | M_0F | P_66 | VEX_256: gen_vandpd_ymm(env, s, modrm); return;
> > + default: break;
> > + }
>
> Perhaps group cases together?
>
> case 0xdb | M_0F | P_66: /* PAND */
> case 0x54 | M_0F: /* ANDPS */
> case 0x54 | M_0F | P_66: /* ANDPD */
> gen_gvec_ld_modrm_xmm(env, s, modrm, MO_64, tcg_gen_gvec_and, 0112);
> return;
>
> How are you planning to handle CPUID checks? I know the current handling
> is quite spotty, but with a reorg we might as well fix that too.
>
>
> r~
>
>
On 7/31/19 1:27 PM, Aleksandar Markovic wrote:
> On Wed, Jul 31, 2019 at 9:36 PM Richard Henderson
> <richard.henderson@linaro.org <mailto:richard.henderson@linaro.org>> wrote:
>
>     On 7/31/19 10:56 AM, Jan Bobek wrote:
>     > +#define gen_pand_mm(env, s, modrm) gen_gvec_ld_modrm_mm ((env), (s), (modrm), MO_64, tcg_gen_gvec_and, 0112)
>     > +#define gen_pand_xmm(env, s, modrm) gen_gvec_ld_modrm_xmm ((env), (s), (modrm), MO_64, tcg_gen_gvec_and, 0112)
>     > +#define gen_vpand_xmm(env, s, modrm) gen_gvec_ld_modrm_vxmm((env), (s), (modrm), MO_64, tcg_gen_gvec_and, 0123)
>     > +#define gen_vpand_ymm(env, s, modrm) gen_gvec_ld_modrm_vymm((env), (s), (modrm), MO_64, tcg_gen_gvec_and, 0123)
>     > +#define gen_andps_xmm gen_pand_xmm
>     > +#define gen_vandps_xmm gen_vpand_xmm
>     > +#define gen_vandps_ymm gen_vpand_ymm
>     > +#define gen_andpd_xmm gen_pand_xmm
>     > +#define gen_vandpd_xmm gen_vpand_xmm
>     > +#define gen_vandpd_ymm gen_vpand_ymm
>
>     Why all of these extra defines?
>
> Because of code clarity and safety, I would say.
>
> This line:
>
>     case 0x54 | M_0F: gen_andps_xmm(env, s, modrm); return;
>
> looks much clearer than this one:
>
>     case 0x54 | M_0F: gen_gvec_ld_modrm_xmm(env, s, modrm, MO_64, tcg_gen_gvec_and, 0112); return;
>
> and such organization is also much less prone to copy/paste bugs etc.

I'm not convinced. These macros will be used exactly once. Because there
is no reuse, there is no added safety. There is only the chance of a typo
in a location removed from the actual use within the switch.

I agree that the goal is to minimize any replication per case. But I
don't think this particular arrangement of macros is the way to
accomplish that.

r~
On 7/31/19 3:35 PM, Richard Henderson wrote:
> On 7/31/19 10:56 AM, Jan Bobek wrote:
>> +#define gen_pand_mm(env, s, modrm) gen_gvec_ld_modrm_mm ((env), (s), (modrm), MO_64, tcg_gen_gvec_and, 0112)
>> +#define gen_pand_xmm(env, s, modrm) gen_gvec_ld_modrm_xmm ((env), (s), (modrm), MO_64, tcg_gen_gvec_and, 0112)
>> +#define gen_vpand_xmm(env, s, modrm) gen_gvec_ld_modrm_vxmm((env), (s), (modrm), MO_64, tcg_gen_gvec_and, 0123)
>> +#define gen_vpand_ymm(env, s, modrm) gen_gvec_ld_modrm_vymm((env), (s), (modrm), MO_64, tcg_gen_gvec_and, 0123)
>> +#define gen_andps_xmm gen_pand_xmm
>> +#define gen_vandps_xmm gen_vpand_xmm
>> +#define gen_vandps_ymm gen_vpand_ymm
>> +#define gen_andpd_xmm gen_pand_xmm
>> +#define gen_vandpd_xmm gen_vpand_xmm
>> +#define gen_vandpd_ymm gen_vpand_ymm
>
>
> Why all of these extra defines?
>
>> + enum {
>> + M_0F = 0x01 << 8,
>> + M_0F38 = 0x02 << 8,
>> + M_0F3A = 0x04 << 8,
>> + P_66 = 0x08 << 8,
>> + P_F3 = 0x10 << 8,
>> + P_F2 = 0x20 << 8,
>> + VEX_128 = 0x40 << 8,
>> + VEX_256 = 0x80 << 8,
>> + };
>> +
>> + switch(b | M_0F
>> + | (s->prefix & PREFIX_DATA ? P_66 : 0)
>> + | (s->prefix & PREFIX_REPZ ? P_F3 : 0)
>> + | (s->prefix & PREFIX_REPNZ ? P_F2 : 0)
>> + | (s->prefix & PREFIX_VEX ? (s->vex_l ? VEX_256 : VEX_128) : 0)) {
>
> I think you can move this above almost everything in this function, so that all
> of the legacy bits follow this switch.
>
>> + case 0xdb | M_0F: gen_pand_mm(env, s, modrm); return;
>
> You'll want to put these on the next lines -- checkpatch.pl again.
>
>> + case 0xdb | M_0F | P_66: gen_pand_xmm(env, s, modrm); return;
>> + case 0xdb | M_0F | P_66 | VEX_128: gen_vpand_xmm(env, s, modrm); return;
>> + case 0xdb | M_0F | P_66 | VEX_256: gen_vpand_ymm(env, s, modrm); return;
>> + case 0x54 | M_0F: gen_andps_xmm(env, s, modrm); return;
>> + case 0x54 | M_0F | VEX_128: gen_vandps_xmm(env, s, modrm); return;
>> + case 0x54 | M_0F | VEX_256: gen_vandps_ymm(env, s, modrm); return;
>> + case 0x54 | M_0F | P_66: gen_andpd_xmm(env, s, modrm); return;
>> + case 0x54 | M_0F | P_66 | VEX_128: gen_vandpd_xmm(env, s, modrm); return;
>> + case 0x54 | M_0F | P_66 | VEX_256: gen_vandpd_ymm(env, s, modrm); return;
>> + default: break;
>> + }
>
> Perhaps group cases together?
>
> case 0xdb | M_0F | P_66: /* PAND */
> case 0x54 | M_0F: /* ANDPS */
> case 0x54 | M_0F | P_66: /* ANDPD */
> gen_gvec_ld_modrm_xmm(env, s, modrm, MO_64, tcg_gen_gvec_and, 0112);
> return;
As Aleksandar pointed out in his email, the general intuition was to
have self-documenting code. Seeing
case 0x54 | M_0F | VEX_256: gen_vandps_ymm(env, s, modrm); return;
clearly states that this particular case is a VANDPS, and if one wants
to see what we do with it, they can go look up gen_vandps_ymm.
That being said, I have come to the conclusion in the meantime that
keeping all the extra macros is just too much code and not worth it,
so I'll do it the way you suggest above.
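
Roughly like so, reusing the helper names from this patch (untested):

    case 0xdb | M_0F: /* PAND mm */
        gen_gvec_ld_modrm_mm(env, s, modrm, MO_64, tcg_gen_gvec_and, 0112);
        return;
    case 0xdb | M_0F | P_66: /* PAND */
    case 0x54 | M_0F: /* ANDPS */
    case 0x54 | M_0F | P_66: /* ANDPD */
        gen_gvec_ld_modrm_xmm(env, s, modrm, MO_64, tcg_gen_gvec_and, 0112);
        return;
    case 0xdb | M_0F | P_66 | VEX_128: /* VPAND */
    case 0x54 | M_0F | VEX_128: /* VANDPS */
    case 0x54 | M_0F | P_66 | VEX_128: /* VANDPD */
        gen_gvec_ld_modrm_vxmm(env, s, modrm, MO_64, tcg_gen_gvec_and, 0123);
        return;
    case 0xdb | M_0F | P_66 | VEX_256: /* VPAND */
    case 0x54 | M_0F | VEX_256: /* VANDPS */
    case 0x54 | M_0F | P_66 | VEX_256: /* VANDPD */
        gen_gvec_ld_modrm_vymm(env, s, modrm, MO_64, tcg_gen_gvec_and, 0123);
        return;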
> How are you planning to handle CPUID checks? I know the currently handling is
> quite spotty, but with a reorg we might as well fix that too.
Good question. CPUID checks are not handled in this patch at all; I
will need to come up with a workable approach.
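
One option might be to gate each case on the corresponding feature bit
before emitting anything, e.g. (hypothetical sketch; this reuses the
existing cpuid_features field and the illegal_op path in gen_sse, and
I haven't verified it is the right granularity):

    case 0x54 | M_0F: /* ANDPS */
        /* Require SSE before emitting any code for this insn. */
        if (!(s->cpuid_features & CPUID_SSE)) {
            goto illegal_op;
        }
        gen_gvec_ld_modrm_xmm(env, s, modrm, MO_64, tcg_gen_gvec_and, 0112);
        return;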
-Jan