Use the gvec infrastructure to achieve the desired functionality.
Note: This commit adds several bits which will not be part of the
final patch series and which are only present to allow for an
incremental write-and-test development cycle. Notably, the
SSE_TOMBSTONE define will go away entirely along with all of the
tables, and nothing will follow
the new dispatch switch in gen_sse.
Signed-off-by: Jan Bobek <jan.bobek@gmail.com>
---
target/i386/ops_sse.h | 2 --
target/i386/ops_sse_header.h | 1 -
target/i386/translate.c | 49 ++++++++++++++++++++++++++++++++++--
3 files changed, 47 insertions(+), 5 deletions(-)
diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h
index ed05989768..b3ba23287d 100644
--- a/target/i386/ops_sse.h
+++ b/target/i386/ops_sse.h
@@ -353,7 +353,6 @@ static inline int satsw(int x)
#define FMAXUB(a, b) ((a) > (b)) ? (a) : (b)
#define FMAXSW(a, b) ((int16_t)(a) > (int16_t)(b)) ? (a) : (b)
-#define FAND(a, b) ((a) & (b))
#define FANDN(a, b) ((~(a)) & (b))
#define FOR(a, b) ((a) | (b))
#define FXOR(a, b) ((a) ^ (b))
@@ -397,7 +396,6 @@ SSE_HELPER_B(helper_pmaxub, FMAXUB)
SSE_HELPER_W(helper_pminsw, FMINSW)
SSE_HELPER_W(helper_pmaxsw, FMAXSW)
-SSE_HELPER_Q(helper_pand, FAND)
SSE_HELPER_Q(helper_pandn, FANDN)
SSE_HELPER_Q(helper_por, FOR)
SSE_HELPER_Q(helper_pxor, FXOR)
diff --git a/target/i386/ops_sse_header.h b/target/i386/ops_sse_header.h
index 094aafc573..63b4376389 100644
--- a/target/i386/ops_sse_header.h
+++ b/target/i386/ops_sse_header.h
@@ -86,7 +86,6 @@ SSE_HELPER_B(pmaxub, FMAXUB)
SSE_HELPER_W(pminsw, FMINSW)
SSE_HELPER_W(pmaxsw, FMAXSW)
-SSE_HELPER_Q(pand, FAND)
SSE_HELPER_Q(pandn, FANDN)
SSE_HELPER_Q(por, FOR)
SSE_HELPER_Q(pxor, FXOR)
diff --git a/target/i386/translate.c b/target/i386/translate.c
index d576b3345c..3821733a4e 100644
--- a/target/i386/translate.c
+++ b/target/i386/translate.c
@@ -23,6 +23,7 @@
#include "disas/disas.h"
#include "exec/exec-all.h"
#include "tcg-op.h"
+#include "tcg-op-gvec.h"
#include "exec/cpu_ldst.h"
#include "exec/translator.h"
@@ -2723,6 +2724,7 @@ typedef void (*SSEFunc_0_eppt)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b,
#define SSE_SPECIAL ((void *)1)
#define SSE_DUMMY ((void *)2)
+#define SSE_TOMBSTONE ((void *)3)
#define MMX_OP2(x) { gen_helper_ ## x ## _mmx, gen_helper_ ## x ## _xmm }
#define SSE_FOP(x) { gen_helper_ ## x ## ps, gen_helper_ ## x ## pd, \
@@ -2754,7 +2756,7 @@ static const SSEFunc_0_epp sse_op_table1[256][4] = {
[0x51] = SSE_FOP(sqrt),
[0x52] = { gen_helper_rsqrtps, NULL, gen_helper_rsqrtss, NULL },
[0x53] = { gen_helper_rcpps, NULL, gen_helper_rcpss, NULL },
- [0x54] = { gen_helper_pand_xmm, gen_helper_pand_xmm }, /* andps, andpd */
+ [0x54] = { SSE_TOMBSTONE, SSE_TOMBSTONE }, /* andps, andpd */
[0x55] = { gen_helper_pandn_xmm, gen_helper_pandn_xmm }, /* andnps, andnpd */
[0x56] = { gen_helper_por_xmm, gen_helper_por_xmm }, /* orps, orpd */
[0x57] = { gen_helper_pxor_xmm, gen_helper_pxor_xmm }, /* xorps, xorpd */
@@ -2823,7 +2825,7 @@ static const SSEFunc_0_epp sse_op_table1[256][4] = {
[0xd8] = MMX_OP2(psubusb),
[0xd9] = MMX_OP2(psubusw),
[0xda] = MMX_OP2(pminub),
- [0xdb] = MMX_OP2(pand),
+ [0xdb] = { SSE_TOMBSTONE, SSE_TOMBSTONE },
[0xdc] = MMX_OP2(paddusb),
[0xdd] = MMX_OP2(paddusw),
[0xde] = MMX_OP2(pmaxub),
@@ -3164,6 +3166,17 @@ static inline void gen_gvec_ld_modrm_3(CPUX86State *env, DisasContext *s,
gen_ld_modrm_VxHxWx, \
gen_gvec_2_fp, (opctl))
+#define gen_pand_mm(env, s, modrm) gen_gvec_ld_modrm_mm ((env), (s), (modrm), MO_64, tcg_gen_gvec_and, 0112)
+#define gen_pand_xmm(env, s, modrm) gen_gvec_ld_modrm_xmm ((env), (s), (modrm), MO_64, tcg_gen_gvec_and, 0112)
+#define gen_vpand_xmm(env, s, modrm) gen_gvec_ld_modrm_vxmm((env), (s), (modrm), MO_64, tcg_gen_gvec_and, 0123)
+#define gen_vpand_ymm(env, s, modrm) gen_gvec_ld_modrm_vymm((env), (s), (modrm), MO_64, tcg_gen_gvec_and, 0123)
+#define gen_andps_xmm gen_pand_xmm
+#define gen_vandps_xmm gen_vpand_xmm
+#define gen_vandps_ymm gen_vpand_ymm
+#define gen_andpd_xmm gen_pand_xmm
+#define gen_vandpd_xmm gen_vpand_xmm
+#define gen_vandpd_ymm gen_vpand_ymm
+
static void gen_sse(CPUX86State *env, DisasContext *s, int b)
{
int b1, op1_offset, op2_offset, is_xmm, val;
@@ -3238,6 +3251,38 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b)
reg |= REX_R(s);
}
mod = (modrm >> 6) & 3;
+
+ enum {
+ M_0F = 0x01 << 8,
+ M_0F38 = 0x02 << 8,
+ M_0F3A = 0x04 << 8,
+ P_66 = 0x08 << 8,
+ P_F3 = 0x10 << 8,
+ P_F2 = 0x20 << 8,
+ VEX_128 = 0x40 << 8,
+ VEX_256 = 0x80 << 8,
+ };
+
+ switch(b | M_0F
+ | (s->prefix & PREFIX_DATA ? P_66 : 0)
+ | (s->prefix & PREFIX_REPZ ? P_F3 : 0)
+ | (s->prefix & PREFIX_REPNZ ? P_F2 : 0)
+ | (s->prefix & PREFIX_VEX ? (s->vex_l ? VEX_256 : VEX_128) : 0)) {
+ case 0xdb | M_0F: gen_pand_mm(env, s, modrm); return;
+ case 0xdb | M_0F | P_66: gen_pand_xmm(env, s, modrm); return;
+ case 0xdb | M_0F | P_66 | VEX_128: gen_vpand_xmm(env, s, modrm); return;
+ case 0xdb | M_0F | P_66 | VEX_256: gen_vpand_ymm(env, s, modrm); return;
+ case 0x54 | M_0F: gen_andps_xmm(env, s, modrm); return;
+ case 0x54 | M_0F | VEX_128: gen_vandps_xmm(env, s, modrm); return;
+ case 0x54 | M_0F | VEX_256: gen_vandps_ymm(env, s, modrm); return;
+ case 0x54 | M_0F | P_66: gen_andpd_xmm(env, s, modrm); return;
+ case 0x54 | M_0F | P_66 | VEX_128: gen_vandpd_xmm(env, s, modrm); return;
+ case 0x54 | M_0F | P_66 | VEX_256: gen_vandpd_ymm(env, s, modrm); return;
+ default: break;
+ }
+
+ assert(sse_fn_epp != SSE_TOMBSTONE);
+
if (sse_fn_epp == SSE_SPECIAL) {
b |= (b1 << 8);
switch(b) {
--
2.20.1
On 7/31/19 10:56 AM, Jan Bobek wrote:
> +#define gen_pand_mm(env, s, modrm) gen_gvec_ld_modrm_mm ((env), (s), (modrm), MO_64, tcg_gen_gvec_and, 0112)
> +#define gen_pand_xmm(env, s, modrm) gen_gvec_ld_modrm_xmm ((env), (s), (modrm), MO_64, tcg_gen_gvec_and, 0112)
> +#define gen_vpand_xmm(env, s, modrm) gen_gvec_ld_modrm_vxmm((env), (s), (modrm), MO_64, tcg_gen_gvec_and, 0123)
> +#define gen_vpand_ymm(env, s, modrm) gen_gvec_ld_modrm_vymm((env), (s), (modrm), MO_64, tcg_gen_gvec_and, 0123)
> +#define gen_andps_xmm gen_pand_xmm
> +#define gen_vandps_xmm gen_vpand_xmm
> +#define gen_vandps_ymm gen_vpand_ymm
> +#define gen_andpd_xmm gen_pand_xmm
> +#define gen_vandpd_xmm gen_vpand_xmm
> +#define gen_vandpd_ymm gen_vpand_ymm
Why all of these extra defines?
> + enum {
> + M_0F = 0x01 << 8,
> + M_0F38 = 0x02 << 8,
> + M_0F3A = 0x04 << 8,
> + P_66 = 0x08 << 8,
> + P_F3 = 0x10 << 8,
> + P_F2 = 0x20 << 8,
> + VEX_128 = 0x40 << 8,
> + VEX_256 = 0x80 << 8,
> + };
> +
> + switch(b | M_0F
> + | (s->prefix & PREFIX_DATA ? P_66 : 0)
> + | (s->prefix & PREFIX_REPZ ? P_F3 : 0)
> + | (s->prefix & PREFIX_REPNZ ? P_F2 : 0)
> + | (s->prefix & PREFIX_VEX ? (s->vex_l ? VEX_256 : VEX_128) : 0)) {
I think you can move this above almost everything in this function, so that all
of the legacy bits follow this switch.
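
I.e. something like this (rough sketch, untested):

    static void gen_sse(CPUX86State *env, DisasContext *s, int b)
    {
        /* Compute the dispatch key from the opcode byte and the
           already-decoded prefixes, before any legacy table lookup. */
        switch (b | M_0F
                | (s->prefix & PREFIX_DATA ? P_66 : 0)
                | (s->prefix & PREFIX_REPZ ? P_F3 : 0)
                | (s->prefix & PREFIX_REPNZ ? P_F2 : 0)
                | (s->prefix & PREFIX_VEX
                   ? (s->vex_l ? VEX_256 : VEX_128) : 0)) {
        case ...: /* new-style gvec decoders return directly */
            return;
        default:
            break;
        }

        /* All of the legacy bits -- sse_op_table1 lookup, SSE_SPECIAL
           handling, etc. -- follow the switch. */
        ...
    }

That way none of the old decode state needs to be computed at all for
the new-style cases.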
> + case 0xdb | M_0F: gen_pand_mm(env, s, modrm); return;
You'll want to put these on the next lines -- checkpatch.pl again.
> + case 0xdb | M_0F | P_66: gen_pand_xmm(env, s, modrm); return;
> + case 0xdb | M_0F | P_66 | VEX_128: gen_vpand_xmm(env, s, modrm); return;
> + case 0xdb | M_0F | P_66 | VEX_256: gen_vpand_ymm(env, s, modrm); return;
> + case 0x54 | M_0F: gen_andps_xmm(env, s, modrm); return;
> + case 0x54 | M_0F | VEX_128: gen_vandps_xmm(env, s, modrm); return;
> + case 0x54 | M_0F | VEX_256: gen_vandps_ymm(env, s, modrm); return;
> + case 0x54 | M_0F | P_66: gen_andpd_xmm(env, s, modrm); return;
> + case 0x54 | M_0F | P_66 | VEX_128: gen_vandpd_xmm(env, s, modrm); return;
> + case 0x54 | M_0F | P_66 | VEX_256: gen_vandpd_ymm(env, s, modrm); return;
> + default: break;
> + }
Perhaps group cases together?
case 0xdb | M_0F | P_66: /* PAND */
case 0x54 | M_0F: /* ANDPS */
case 0x54 | M_0F | P_66: /* ANDPD */
gen_gvec_ld_modrm_xmm(env, s, modrm, MO_64, tcg_gen_gvec_and, 0112);
return;
How are you planning to handle CPUID checks? I know the current handling is
quite spotty, but with a reorg we might as well fix that too.
r~
On Wed, Jul 31, 2019 at 9:36 PM Richard Henderson <richard.henderson@linaro.org> wrote:
> On 7/31/19 10:56 AM, Jan Bobek wrote:
> > +#define gen_pand_mm(env, s, modrm) gen_gvec_ld_modrm_mm ((env), (s), (modrm), MO_64, tcg_gen_gvec_and, 0112)
> > +#define gen_pand_xmm(env, s, modrm) gen_gvec_ld_modrm_xmm ((env), (s), (modrm), MO_64, tcg_gen_gvec_and, 0112)
> > +#define gen_vpand_xmm(env, s, modrm) gen_gvec_ld_modrm_vxmm((env), (s), (modrm), MO_64, tcg_gen_gvec_and, 0123)
> > +#define gen_vpand_ymm(env, s, modrm) gen_gvec_ld_modrm_vymm((env), (s), (modrm), MO_64, tcg_gen_gvec_and, 0123)
> > +#define gen_andps_xmm gen_pand_xmm
> > +#define gen_vandps_xmm gen_vpand_xmm
> > +#define gen_vandps_ymm gen_vpand_ymm
> > +#define gen_andpd_xmm gen_pand_xmm
> > +#define gen_vandpd_xmm gen_vpand_xmm
> > +#define gen_vandpd_ymm gen_vpand_ymm
>
>
> Why all of these extra defines?
>
Because of code clarity and safety, I would say.
This line:
case 0x54 | M_0F: gen_andps_xmm(env, s, modrm); return;
looks much clearer than this one:
case 0x54 | M_0F: gen_gvec_ld_modrm_xmm(env, s, modrm, MO_64, tcg_gen_gvec_and, 0112); return;
and such organization is also much less prone to copy/paste bugs etc.
Needless to say, there is no performance cost at all, so it is a no-brainer.
Sincerely,
Aleksandar
>
> > + enum {
> > + M_0F = 0x01 << 8,
> > + M_0F38 = 0x02 << 8,
> > + M_0F3A = 0x04 << 8,
> > + P_66 = 0x08 << 8,
> > + P_F3 = 0x10 << 8,
> > + P_F2 = 0x20 << 8,
> > + VEX_128 = 0x40 << 8,
> > + VEX_256 = 0x80 << 8,
> > + };
> > +
> > + switch(b | M_0F
> > + | (s->prefix & PREFIX_DATA ? P_66 : 0)
> > + | (s->prefix & PREFIX_REPZ ? P_F3 : 0)
> > + | (s->prefix & PREFIX_REPNZ ? P_F2 : 0)
> > + | (s->prefix & PREFIX_VEX ? (s->vex_l ? VEX_256 : VEX_128) : 0)) {
>
> I think you can move this above almost everything in this function, so
> that all of the legacy bits follow this switch.
>
> > + case 0xdb | M_0F: gen_pand_mm(env, s, modrm); return;
>
> You'll want to put these on the next lines -- checkpatch.pl again.
>
> > + case 0xdb | M_0F | P_66: gen_pand_xmm(env, s, modrm); return;
> > + case 0xdb | M_0F | P_66 | VEX_128: gen_vpand_xmm(env, s, modrm); return;
> > + case 0xdb | M_0F | P_66 | VEX_256: gen_vpand_ymm(env, s, modrm); return;
> > + case 0x54 | M_0F: gen_andps_xmm(env, s, modrm); return;
> > + case 0x54 | M_0F | VEX_128: gen_vandps_xmm(env, s, modrm); return;
> > + case 0x54 | M_0F | VEX_256: gen_vandps_ymm(env, s, modrm); return;
> > + case 0x54 | M_0F | P_66: gen_andpd_xmm(env, s, modrm); return;
> > + case 0x54 | M_0F | P_66 | VEX_128: gen_vandpd_xmm(env, s, modrm); return;
> > + case 0x54 | M_0F | P_66 | VEX_256: gen_vandpd_ymm(env, s, modrm); return;
> > + default: break;
> > + }
>
> Perhaps group cases together?
>
> case 0xdb | M_0F | P_66: /* PAND */
> case 0x54 | M_0F: /* ANDPS */
> case 0x54 | M_0F | P_66: /* ANDPD */
> gen_gvec_ld_modrm_xmm(env, s, modrm, MO_64, tcg_gen_gvec_and, 0112);
> return;
>
> How are you planning to handle CPUID checks? I know the current handling
> is quite spotty, but with a reorg we might as well fix that too.
>
>
> r~
>
>
On 7/31/19 1:27 PM, Aleksandar Markovic wrote:
> On Wed, Jul 31, 2019 at 9:36 PM Richard Henderson
> <richard.henderson@linaro.org <mailto:richard.henderson@linaro.org>> wrote:
>
>     On 7/31/19 10:56 AM, Jan Bobek wrote:
>     > +#define gen_pand_mm(env, s, modrm) gen_gvec_ld_modrm_mm ((env), (s), (modrm), MO_64, tcg_gen_gvec_and, 0112)
>     > +#define gen_pand_xmm(env, s, modrm) gen_gvec_ld_modrm_xmm ((env), (s), (modrm), MO_64, tcg_gen_gvec_and, 0112)
>     > +#define gen_vpand_xmm(env, s, modrm) gen_gvec_ld_modrm_vxmm((env), (s), (modrm), MO_64, tcg_gen_gvec_and, 0123)
>     > +#define gen_vpand_ymm(env, s, modrm) gen_gvec_ld_modrm_vymm((env), (s), (modrm), MO_64, tcg_gen_gvec_and, 0123)
>     > +#define gen_andps_xmm gen_pand_xmm
>     > +#define gen_vandps_xmm gen_vpand_xmm
>     > +#define gen_vandps_ymm gen_vpand_ymm
>     > +#define gen_andpd_xmm gen_pand_xmm
>     > +#define gen_vandpd_xmm gen_vpand_xmm
>     > +#define gen_vandpd_ymm gen_vpand_ymm
>
>     Why all of these extra defines?
>
> Because of code clarity and safety, I would say.
>
> This line:
>
>     case 0x54 | M_0F: gen_andps_xmm(env, s, modrm); return;
>
> looks much clearer than this one:
>
>     case 0x54 | M_0F: gen_gvec_ld_modrm_xmm(env, s, modrm, MO_64, tcg_gen_gvec_and, 0112); return;
>
> and such organization is also much less prone to copy/paste bugs etc.

I'm not convinced. These macros will be used exactly once. Because there
is no reuse, there is no added safety. There is only the chance of a typo
in a location removed from the actual use within the switch.

I agree that the goal is to minimize any replication per case. But I
don't think this particular arrangement of macros is the way to
accomplish that.

r~
On 7/31/19 3:35 PM, Richard Henderson wrote:
> On 7/31/19 10:56 AM, Jan Bobek wrote:
>> +#define gen_pand_mm(env, s, modrm) gen_gvec_ld_modrm_mm ((env), (s), (modrm), MO_64, tcg_gen_gvec_and, 0112)
>> +#define gen_pand_xmm(env, s, modrm) gen_gvec_ld_modrm_xmm ((env), (s), (modrm), MO_64, tcg_gen_gvec_and, 0112)
>> +#define gen_vpand_xmm(env, s, modrm) gen_gvec_ld_modrm_vxmm((env), (s), (modrm), MO_64, tcg_gen_gvec_and, 0123)
>> +#define gen_vpand_ymm(env, s, modrm) gen_gvec_ld_modrm_vymm((env), (s), (modrm), MO_64, tcg_gen_gvec_and, 0123)
>> +#define gen_andps_xmm gen_pand_xmm
>> +#define gen_vandps_xmm gen_vpand_xmm
>> +#define gen_vandps_ymm gen_vpand_ymm
>> +#define gen_andpd_xmm gen_pand_xmm
>> +#define gen_vandpd_xmm gen_vpand_xmm
>> +#define gen_vandpd_ymm gen_vpand_ymm
>
>
> Why all of these extra defines?
>
>> + enum {
>> + M_0F = 0x01 << 8,
>> + M_0F38 = 0x02 << 8,
>> + M_0F3A = 0x04 << 8,
>> + P_66 = 0x08 << 8,
>> + P_F3 = 0x10 << 8,
>> + P_F2 = 0x20 << 8,
>> + VEX_128 = 0x40 << 8,
>> + VEX_256 = 0x80 << 8,
>> + };
>> +
>> + switch(b | M_0F
>> + | (s->prefix & PREFIX_DATA ? P_66 : 0)
>> + | (s->prefix & PREFIX_REPZ ? P_F3 : 0)
>> + | (s->prefix & PREFIX_REPNZ ? P_F2 : 0)
>> + | (s->prefix & PREFIX_VEX ? (s->vex_l ? VEX_256 : VEX_128) : 0)) {
>
> I think you can move this above almost everything in this function, so that all
> of the legacy bits follow this switch.
>
>> + case 0xdb | M_0F: gen_pand_mm(env, s, modrm); return;
>
> You'll want to put these on the next lines -- checkpatch.pl again.
>
>> + case 0xdb | M_0F | P_66: gen_pand_xmm(env, s, modrm); return;
>> + case 0xdb | M_0F | P_66 | VEX_128: gen_vpand_xmm(env, s, modrm); return;
>> + case 0xdb | M_0F | P_66 | VEX_256: gen_vpand_ymm(env, s, modrm); return;
>> + case 0x54 | M_0F: gen_andps_xmm(env, s, modrm); return;
>> + case 0x54 | M_0F | VEX_128: gen_vandps_xmm(env, s, modrm); return;
>> + case 0x54 | M_0F | VEX_256: gen_vandps_ymm(env, s, modrm); return;
>> + case 0x54 | M_0F | P_66: gen_andpd_xmm(env, s, modrm); return;
>> + case 0x54 | M_0F | P_66 | VEX_128: gen_vandpd_xmm(env, s, modrm); return;
>> + case 0x54 | M_0F | P_66 | VEX_256: gen_vandpd_ymm(env, s, modrm); return;
>> + default: break;
>> + }
>
> Perhaps group cases together?
>
> case 0xdb | M_0F | P_66: /* PAND */
> case 0x54 | M_0F: /* ANDPS */
> case 0x54 | M_0F | P_66: /* ANDPD */
> gen_gvec_ld_modrm_xmm(env, s, modrm, MO_64, tcg_gen_gvec_and, 0112);
> return;
As Aleksandar pointed out in his email, the general intuition was to
have self-documenting code. Seeing
case 0x54 | M_0F | VEX_256: gen_vandps_ymm(env, s, modrm); return;
clearly states that this particular case is a VANDPS, and if one wants
to see what we do with it, they can go look up gen_vandps_ymm.
That being said, I have come to the conclusion in the meantime that
keeping all the extra macros is just too much code and not worth it,
so I'll do it the way you suggest above.
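
Roughly like so, reusing the helper names from this patch (untested):

    case 0xdb | M_0F: /* PAND mm */
        gen_gvec_ld_modrm_mm(env, s, modrm, MO_64, tcg_gen_gvec_and, 0112);
        return;
    case 0xdb | M_0F | P_66: /* PAND */
    case 0x54 | M_0F: /* ANDPS */
    case 0x54 | M_0F | P_66: /* ANDPD */
        gen_gvec_ld_modrm_xmm(env, s, modrm, MO_64, tcg_gen_gvec_and, 0112);
        return;
    case 0xdb | M_0F | P_66 | VEX_128: /* VPAND */
    case 0x54 | M_0F | VEX_128: /* VANDPS */
    case 0x54 | M_0F | P_66 | VEX_128: /* VANDPD */
        gen_gvec_ld_modrm_vxmm(env, s, modrm, MO_64, tcg_gen_gvec_and, 0123);
        return;
    case 0xdb | M_0F | P_66 | VEX_256: /* VPAND */
    case 0x54 | M_0F | VEX_256: /* VANDPS */
    case 0x54 | M_0F | P_66 | VEX_256: /* VANDPD */
        gen_gvec_ld_modrm_vymm(env, s, modrm, MO_64, tcg_gen_gvec_and, 0123);
        return;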
> How are you planning to handle CPUID checks? I know the currently handling is
> quite spotty, but with a reorg we might as well fix that too.
Good question. CPUID checks are not handled in this patch at all; I
will need to come up with a workable approach.
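
One option might be to gate each case on the corresponding feature bit
before emitting anything, e.g. (hypothetical sketch; this reuses the
existing cpuid_features field and the illegal_op path in gen_sse, and
I haven't verified it is the right granularity):

    case 0x54 | M_0F: /* ANDPS */
        /* Require SSE before emitting any code for this insn. */
        if (!(s->cpuid_features & CPUID_SSE)) {
            goto illegal_op;
        }
        gen_gvec_ld_modrm_xmm(env, s, modrm, MO_64, tcg_gen_gvec_and, 0112);
        return;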
-Jan