[Xen-devel] [PATCH v9 00/23] x86emul: remaining AVX512 support

Jan Beulich posted 23 patches 2 weeks ago
Only 0 patches received!

[Xen-devel] [PATCH v9 00/23] x86emul: remaining AVX512 support

Posted by Jan Beulich 2 weeks ago
This goes on top of "x86emul: avoid speculative out of bounds
accesses", or else there's a conflict in at least the "gather" patch
here.

01: support AVX512{F,_VBMI2} compress/expand insns
02: support remaining misc AVX512{F,BW} insns
03: prepare for AVX512F S/G insns
04: test harness adjustments for AVX512F S/G insns
05: support AVX512F gather insns
06: add high register S/G test cases
07: support AVX512F scatter insns
08: support AVX512PF insns
09: support AVX512CD insns
10: complete support of AVX512_VBMI insns
11: support of AVX512* population count insns
12: support of AVX512_IFMA insns
13: support remaining AVX512_VBMI2 insns
14: support AVX512_4FMAPS insns
15: support AVX512_4VNNIW insns
16: support AVX512_VNNI insns
17: support VPCLMULQDQ insns
18: support VAES insns
19: support GFNI insns
20: restore ordering within main switch statement
21: add an AES/VAES test case to the harness
22: add a SHA test case to the harness
23: add a PCLMUL/VPCLMUL test case to the harness

Jan
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH v9 00/23] x86emul: remaining AVX512 support

Posted by Jan Beulich 2 weeks ago
On 01.07.2019 13:13, Jan Beulich wrote:
> This goes on top of "x86emul: avoid speculative out of bounds
> accesses", or else there's a conflict in at least the "gather" patch
> here.
> 
> 01: support AVX512{F,_VBMI2} compress/expand insns
> 02: support remaining misc AVX512{F,BW} insns
> 03: prepare for AVX512F S/G insns
> 04: test harness adjustments for AVX512F S/G insns
> 05: support AVX512F gather insns
> 06: add high register S/G test cases
> 07: support AVX512F scatter insns
> 08: support AVX512PF insns
> 09: support AVX512CD insns
> 10: complete support of AVX512_VBMI insns
> 11: support of AVX512* population count insns
> 12: support of AVX512_IFMA insns
> 13: support remaining AVX512_VBMI2 insns
> 14: support AVX512_4FMAPS insns
> 15: support AVX512_4VNNIW insns
> 16: support AVX512_VNNI insns
> 17: support VPCLMULQDQ insns
> 18: support VAES insns
> 19: support GFNI insns
> 20: restore ordering within main switch statement
> 21: add an AES/VAES test case to the harness
> 22: add a SHA test case to the harness
> 23: add a PCLMUL/VPCLMUL test case to the harness

I realize I've sent patch 17 a second time instead of the correct
patch 23. I'll send the correct one in a minute.

Jan
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

[Xen-devel] [PATCH v9 01/23] x86emul: support AVX512{F,_VBMI2} compress/expand insns

Posted by Jan Beulich 2 weeks ago
Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v9: Re-base. Drop "(solely)" from a comment.
v7: Re-base.
v6: Re-base. Add tests for the byte/word forms.
v5: New.

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -109,6 +109,7 @@ static const struct test avx512f_all[] =
      INSN_FP(cmp,             0f, c2),
      INSN(comisd,       66,   0f, 2f,    el,      q, el),
      INSN(comiss,         ,   0f, 2f,    el,      d, el),
+    INSN(compress,     66, 0f38, 8a,    vl,     sd, el),
      INSN(cvtdq2pd,     f3,   0f, e6,    vl_2,    d, vl),
      INSN(cvtdq2ps,       ,   0f, 5b,    vl,      d, vl),
      INSN(cvtpd2dq,     f2,   0f, e6,    vl,      q, vl),
@@ -140,6 +141,7 @@ static const struct test avx512f_all[] =
      INSN(cvtusi2sd,    f2,   0f, 7b,    el,   dq64, el),
      INSN(cvtusi2ss,    f3,   0f, 7b,    el,   dq64, el),
      INSN_FP(div,             0f, 5e),
+    INSN(expand,       66, 0f38, 88,    vl,     sd, el),
      INSN(fixupimm,     66, 0f3a, 54,    vl,     sd, vl),
      INSN(fixupimm,     66, 0f3a, 55,    el,     sd, el),
      INSN(fmadd132,     66, 0f38, 98,    vl,     sd, vl),
@@ -214,6 +216,7 @@ static const struct test avx512f_all[] =
      INSN(pcmpgtd,      66,   0f, 66,    vl,      d, vl),
      INSN(pcmpgtq,      66, 0f38, 37,    vl,      q, vl),
      INSN(pcmpu,        66, 0f3a, 1e,    vl,     dq, vl),
+    INSN(pcompress,    66, 0f38, 8b,    vl,     dq, el),
      INSN(permi2,       66, 0f38, 76,    vl,     dq, vl),
      INSN(permi2,       66, 0f38, 77,    vl,     sd, vl),
      INSN(permilpd,     66, 0f38, 0d,    vl,      q, vl),
@@ -222,6 +225,7 @@ static const struct test avx512f_all[] =
      INSN(permilps,     66, 0f3a, 04,    vl,      d, vl),
      INSN(permt2,       66, 0f38, 7e,    vl,     dq, vl),
      INSN(permt2,       66, 0f38, 7f,    vl,     sd, vl),
+    INSN(pexpand,      66, 0f38, 89,    vl,     dq, el),
      INSN(pmaxs,        66, 0f38, 3d,    vl,     dq, vl),
      INSN(pmaxu,        66, 0f38, 3f,    vl,     dq, vl),
      INSN(pmins,        66, 0f38, 39,    vl,     dq, vl),
@@ -509,6 +513,11 @@ static const struct test avx512_vbmi_all
      INSN(permt2b,       66, 0f38, 7d, vl, b, vl),
  };
  
+static const struct test avx512_vbmi2_all[] = {
+    INSN(pcompress, 66, 0f38, 63, vl, bw, el),
+    INSN(pexpand,   66, 0f38, 62, vl, bw, el),
+};
+
  static const unsigned char vl_all[] = { VL_512, VL_128, VL_256 };
  static const unsigned char vl_128[] = { VL_128 };
  static const unsigned char vl_no128[] = { VL_512, VL_256 };
@@ -865,4 +874,5 @@ void evex_disp8_test(void *instr, struct
      RUN(avx512dq, 512);
      RUN(avx512er, 512);
      RUN(avx512_vbmi, all);
+    RUN(avx512_vbmi2, all);
  }
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -3995,6 +3995,227 @@ int main(int argc, char **argv)
      else
          printf("skipped\n");
  
+    /*
+     * The following compress/expand tests are not only making sure the
+     * accessed data is correct, but they also verify (by placing operands
+     * on the mapping boundaries) that elements controlled by clear mask
+     * bits don't get accessed.
+     */
+    if ( stack_exec && cpu_has_avx512f )
+    {
+        decl_insn(vpcompressd);
+        decl_insn(vpcompressq);
+        decl_insn(vpexpandd);
+        decl_insn(vpexpandq);
+        static const struct {
+            unsigned int d[16];
+        } dsrc = { { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } };
+        static const struct {
+            unsigned long long q[8];
+        } qsrc = { { 0, 1, 2, 3, 4, 5, 6, 7 } };
+        unsigned int *ptr = res + MMAP_SZ / sizeof(*res) - 32;
+
+        printf("%-40s", "Testing vpcompressd %zmm1,24*4(%ecx){%k2}...");
+        asm volatile ( "kmovw %1, %%k2\n\t"
+                       "vmovdqu32 %2, %%zmm1\n"
+                       put_insn(vpcompressd,
+                                "vpcompressd %%zmm1, 24*4(%0)%{%%k2%}")
+                       :: "c" (NULL), "r" (0x55aa), "m" (dsrc) );
+
+        memset(ptr, 0xdb, 32 * 4);
+        set_insn(vpcompressd);
+        regs.ecx = (unsigned long)ptr;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(vpcompressd) ||
+             memcmp(ptr, ptr + 8, 16 * 4) )
+            goto fail;
+        for ( i = 0; i < 4; ++i )
+            if ( ptr[24 + i] != 2 * i + 1 )
+                goto fail;
+        for ( ; i < 8; ++i )
+            if ( ptr[24 + i] != 2 * i )
+                goto fail;
+        printf("okay\n");
+
+        printf("%-40s", "Testing vpexpandd 8*4(%edx),%zmm3{%k2}{z}...");
+        asm volatile ( "vpternlogd $0x81, %%zmm3, %%zmm3, %%zmm3\n"
+                       put_insn(vpexpandd,
+                                "vpexpandd 8*4(%0), %%zmm3%{%%k2%}%{z%}")
+                       :: "d" (NULL) );
+        set_insn(vpexpandd);
+        regs.edx = (unsigned long)(ptr + 16);
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(vpexpandd) )
+            goto fail;
+        asm ( "vmovdqa32 %%zmm1, %%zmm2%{%%k2%}%{z%}\n\t"
+              "vpcmpeqd %%zmm2, %%zmm3, %%k0\n\t"
+              "kmovw %%k0, %0"
+              : "=r" (rc) );
+        if ( rc != 0xffff )
+            goto fail;
+        printf("okay\n");
+
+        printf("%-40s", "Testing vpcompressq %zmm4,12*8(%edx){%k3}...");
+        asm volatile ( "kmovw %1, %%k3\n\t"
+                       "vmovdqu64 %2, %%zmm4\n"
+                       put_insn(vpcompressq,
+                                "vpcompressq %%zmm4, 12*8(%0)%{%%k3%}")
+                       :: "d" (NULL), "r" (0x5a), "m" (qsrc) );
+
+        memset(ptr, 0xdb, 16 * 8);
+        set_insn(vpcompressq);
+        regs.edx = (unsigned long)ptr;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(vpcompressq) ||
+             memcmp(ptr, ptr + 8, 8 * 8) )
+            goto fail;
+        for ( i = 0; i < 2; ++i )
+        {
+            if ( ptr[(12 + i) * 2] != 2 * i + 1 ||
+                 ptr[(12 + i) * 2 + 1] )
+                goto fail;
+        }
+        for ( ; i < 4; ++i )
+        {
+            if ( ptr[(12 + i) * 2] != 2 * i ||
+                 ptr[(12 + i) * 2 + 1] )
+                goto fail;
+        }
+        printf("okay\n");
+
+        printf("%-40s", "Testing vpexpandq 4*8(%ecx),%zmm5{%k3}{z}...");
+        asm volatile ( "vpternlogq $0x81, %%zmm5, %%zmm5, %%zmm5\n"
+                       put_insn(vpexpandq,
+                                "vpexpandq 4*8(%0), %%zmm5%{%%k3%}%{z%}")
+                       :: "c" (NULL) );
+        set_insn(vpexpandq);
+        regs.ecx = (unsigned long)(ptr + 16);
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(vpexpandq) )
+            goto fail;
+        asm ( "vmovdqa64 %%zmm4, %%zmm6%{%%k3%}%{z%}\n\t"
+              "vpcmpeqq %%zmm5, %%zmm6, %%k0\n\t"
+              "kmovw %%k0, %0"
+              : "=r" (rc) );
+        if ( rc != 0xff )
+            goto fail;
+        printf("okay\n");
+    }
+
+#if __GNUC__ > 7 /* can't check for __AVX512VBMI2__ here */
+    if ( stack_exec && cpu_has_avx512_vbmi2 )
+    {
+        decl_insn(vpcompressb);
+        decl_insn(vpcompressw);
+        decl_insn(vpexpandb);
+        decl_insn(vpexpandw);
+        static const struct {
+            unsigned char b[64];
+        } bsrc = { { 0,  1,  2,  3,  4,  5,  6,  7,
+                     8,  9, 10, 11, 12, 13, 14, 15,
+                    16, 17, 18, 19, 20, 21, 22, 23,
+                    24, 25, 26, 27, 28, 29, 30, 31,
+                    32, 33, 34, 35, 36, 37, 38, 39,
+                    40, 41, 42, 43, 44, 45, 46, 47,
+                    48, 49, 50, 51, 52, 53, 54, 55,
+                    56, 57, 58, 59, 60, 61, 62, 63 } };
+        static const struct {
+            unsigned short w[32];
+        } wsrc = { { 0,  1,  2,  3,  4,  5,  6,  7,
+                     8,  9, 10, 11, 12, 13, 14, 15,
+                    16, 17, 18, 19, 20, 21, 22, 23,
+                    24, 25, 26, 27, 28, 29, 30, 31 } };
+        unsigned char *ptr = (void *)res + MMAP_SZ - 128;
+        unsigned long long w = 0x55555555aaaaaaaaULL;
+
+        printf("%-40s", "Testing vpcompressb %zmm1,96*1(%ecx){%k2}...");
+        asm volatile ( "kmovq %1, %%k2\n\t"
+                       "vmovdqu8 %2, %%zmm1\n"
+                       put_insn(vpcompressb,
+                                "vpcompressb %%zmm1, 96*1(%0)%{%%k2%}")
+                       :: "c" (NULL), "m" (w), "m" (bsrc) );
+
+        memset(ptr, 0xdb, 128 * 1);
+        set_insn(vpcompressb);
+        regs.ecx = (unsigned long)ptr;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(vpcompressb) ||
+             memcmp(ptr, ptr + 32, 64 * 1) )
+            goto fail;
+        for ( i = 0; i < 16; ++i )
+            if ( ptr[96 + i] != 2 * i + 1 )
+                goto fail;
+        for ( ; i < 32; ++i )
+            if ( ptr[96 + i] != 2 * i )
+                goto fail;
+        printf("okay\n");
+
+        printf("%-40s", "Testing vpexpandb 32*1(%edx),%zmm3{%k2}{z}...");
+        asm volatile ( "vpternlogd $0x81, %%zmm3, %%zmm3, %%zmm3\n"
+                       put_insn(vpexpandb,
+                                "vpexpandb 32*1(%0), %%zmm3%{%%k2%}%{z%}")
+                       :: "d" (NULL) );
+        set_insn(vpexpandb);
+        regs.edx = (unsigned long)(ptr + 64);
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(vpexpandb) )
+            goto fail;
+        asm ( "vmovdqu8 %%zmm1, %%zmm2%{%%k2%}%{z%}\n\t"
+              "vpcmpeqb %%zmm2, %%zmm3, %%k0\n\t"
+              "kmovq %%k0, %0"
+              : "=m" (w) );
+        if ( w != 0xffffffffffffffffULL )
+            goto fail;
+        printf("okay\n");
+
+        printf("%-40s", "Testing vpcompressw %zmm4,48*2(%edx){%k3}...");
+        asm volatile ( "kmovd %1, %%k3\n\t"
+                       "vmovdqu16 %2, %%zmm4\n"
+                       put_insn(vpcompressw,
+                                "vpcompressw %%zmm4, 48*2(%0)%{%%k3%}")
+                       :: "d" (NULL), "r" (0x5555aaaa), "m" (wsrc) );
+
+        memset(ptr, 0xdb, 64 * 2);
+        set_insn(vpcompressw);
+        regs.edx = (unsigned long)ptr;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(vpcompressw) ||
+             memcmp(ptr, ptr + 32, 32 * 2) )
+            goto fail;
+        for ( i = 0; i < 8; ++i )
+        {
+            if ( ptr[(48 + i) * 2] != 2 * i + 1 ||
+                 ptr[(48 + i) * 2 + 1] )
+                goto fail;
+        }
+        for ( ; i < 16; ++i )
+        {
+            if ( ptr[(48 + i) * 2] != 2 * i ||
+                 ptr[(48 + i) * 2 + 1] )
+                goto fail;
+        }
+        printf("okay\n");
+
+        printf("%-40s", "Testing vpexpandw 16*2(%ecx),%zmm5{%k3}{z}...");
+        asm volatile ( "vpternlogd $0x81, %%zmm5, %%zmm5, %%zmm5\n"
+                       put_insn(vpexpandw,
+                                "vpexpandw 16*2(%0), %%zmm5%{%%k3%}%{z%}")
+                       :: "c" (NULL) );
+        set_insn(vpexpandw);
+        regs.ecx = (unsigned long)(ptr + 64);
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(vpexpandw) )
+            goto fail;
+        asm ( "vmovdqu16 %%zmm4, %%zmm6%{%%k3%}%{z%}\n\t"
+              "vpcmpeqw %%zmm5, %%zmm6, %%k0\n\t"
+              "kmovq %%k0, %0"
+              : "=m" (w) );
+        if ( w != 0xffffffff )
+            goto fail;
+        printf("okay\n");
+    }
+#endif
+
  #undef decl_insn
  #undef put_insn
  #undef set_insn
--- a/tools/tests/x86_emulator/x86-emulate.h
+++ b/tools/tests/x86_emulator/x86-emulate.h
@@ -59,6 +59,9 @@
      (type *)((char *)mptr__ - offsetof(type, member)); \
  })
  
+#define hweight32 __builtin_popcount
+#define hweight64 __builtin_popcountll
+
  #define is_canonical_address(x) (((int64_t)(x) >> 47) == ((int64_t)(x) >> 63))
  
  extern uint32_t mxcsr_mask;
@@ -138,6 +141,7 @@ static inline bool xcr0_mask(uint64_t ma
  #define cpu_has_avx512bw  (cp.feat.avx512bw && xcr0_mask(0xe6))
  #define cpu_has_avx512vl  (cp.feat.avx512vl && xcr0_mask(0xe6))
  #define cpu_has_avx512_vbmi (cp.feat.avx512_vbmi && xcr0_mask(0xe6))
+#define cpu_has_avx512_vbmi2 (cp.feat.avx512_vbmi2 && xcr0_mask(0xe6))
  
  #define cpu_has_xgetbv1   (cpu_has_xsave && cp.xstate.xgetbv1)
  
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -482,6 +482,8 @@ static const struct ext0f38_table {
      [0x59] = { .simd_size = simd_other, .two_op = 1, .d8s = 3 },
      [0x5a] = { .simd_size = simd_128, .two_op = 1, .d8s = 4 },
      [0x5b] = { .simd_size = simd_256, .two_op = 1, .d8s = d8s_vl_by_2 },
+    [0x62] = { .simd_size = simd_packed_int, .two_op = 1, .d8s = d8s_bw },
+    [0x63] = { .simd_size = simd_packed_int, .to_mem = 1, .two_op = 1, .d8s = d8s_bw },
      [0x75 ... 0x76] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
      [0x77] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
      [0x78] = { .simd_size = simd_other, .two_op = 1 },
@@ -489,6 +491,10 @@ static const struct ext0f38_table {
      [0x7a ... 0x7c] = { .simd_size = simd_none, .two_op = 1 },
      [0x7d ... 0x7e] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
      [0x7f] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
+    [0x88] = { .simd_size = simd_packed_fp, .two_op = 1, .d8s = d8s_dq },
+    [0x89] = { .simd_size = simd_packed_int, .two_op = 1, .d8s = d8s_dq },
+    [0x8a] = { .simd_size = simd_packed_fp, .to_mem = 1, .two_op = 1, .d8s = d8s_dq },
+    [0x8b] = { .simd_size = simd_packed_int, .to_mem = 1, .two_op = 1, .d8s = d8s_dq },
      [0x8c] = { .simd_size = simd_packed_int },
      [0x8d] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
      [0x8e] = { .simd_size = simd_packed_int, .to_mem = 1 },
@@ -1868,6 +1874,7 @@ in_protmode(
  #define vcpu_has_avx512bw()    (ctxt->cpuid->feat.avx512bw)
  #define vcpu_has_avx512vl()    (ctxt->cpuid->feat.avx512vl)
  #define vcpu_has_avx512_vbmi() (ctxt->cpuid->feat.avx512_vbmi)
+#define vcpu_has_avx512_vbmi2() (ctxt->cpuid->feat.avx512_vbmi2)
  #define vcpu_has_rdpid()       (ctxt->cpuid->feat.rdpid)
  
  #define vcpu_must_have(feat) \
@@ -8881,6 +8888,36 @@ x86_emulate(
          generate_exception_if(ea.type != OP_MEM || !vex.l || vex.w, EXC_UD);
          goto simd_0f_avx2;
  
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x62): /* vpexpand{b,w} [xyz]mm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x63): /* vpcompress{b,w} [xyz]mm,[xyz]mm/mem{k} */
+        host_and_vcpu_must_have(avx512_vbmi2);
+        elem_bytes = 1 << evex.w;
+        /* fall through */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x88): /* vexpandp{s,d} [xyz]mm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x89): /* vpexpand{d,q} [xyz]mm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x8a): /* vcompressp{s,d} [xyz]mm,[xyz]mm/mem{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x8b): /* vpcompress{d,q} [xyz]mm,[xyz]mm/mem{k} */
+        host_and_vcpu_must_have(avx512f);
+        generate_exception_if(evex.brs, EXC_UD);
+        avx512_vlen_check(false);
+        /*
+         * For the respective code below the main switch() to work we need to
+         * compact op_mask here: Memory accesses are non-sparse even if the
+         * mask register has sparsely set bits.
+         */
+        if ( likely(fault_suppression) )
+        {
+            n = 1 << ((b & 8 ? 2 : 4) + evex.lr - evex.w);
+            EXPECT(elem_bytes > 0);
+            ASSERT(op_bytes == n * elem_bytes);
+            op_mask &= ~0ULL >> (64 - n);
+            n = hweight64(op_mask);
+            op_bytes = n * elem_bytes;
+            if ( n )
+                op_mask = ~0ULL >> (64 - n);
+        }
+        goto simd_zmm;
+
      case X86EMUL_OPC_EVEX_66(0x0f38, 0x75): /* vpermi2{b,w} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
      case X86EMUL_OPC_EVEX_66(0x0f38, 0x7d): /* vpermt2{b,w} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
      case X86EMUL_OPC_EVEX_66(0x0f38, 0x8d): /* vperm{b,w} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -108,6 +108,7 @@
  
  /* CPUID level 0x00000007:0.ecx */
  #define cpu_has_avx512_vbmi     boot_cpu_has(X86_FEATURE_AVX512_VBMI)
+#define cpu_has_avx512_vbmi2    boot_cpu_has(X86_FEATURE_AVX512_VBMI2)
  #define cpu_has_rdpid           boot_cpu_has(X86_FEATURE_RDPID)
  
  /* CPUID level 0x80000007.edx */
--- a/xen/include/public/arch-x86/cpufeatureset.h
+++ b/xen/include/public/arch-x86/cpufeatureset.h
@@ -228,6 +228,7 @@ XEN_CPUFEATURE(AVX512_VBMI,   6*32+ 1) /
  XEN_CPUFEATURE(UMIP,          6*32+ 2) /*S  User Mode Instruction Prevention */
  XEN_CPUFEATURE(PKU,           6*32+ 3) /*H  Protection Keys for Userspace */
  XEN_CPUFEATURE(OSPKE,         6*32+ 4) /*!  OS Protection Keys Enable */
+XEN_CPUFEATURE(AVX512_VBMI2,  6*32+ 6) /*A  Additional AVX-512 Vector Byte Manipulation Instrs */
  XEN_CPUFEATURE(AVX512_VPOPCNTDQ, 6*32+14) /*A  POPCNT for vectors of DW/QW */
  XEN_CPUFEATURE(RDPID,         6*32+22) /*A  RDPID instruction */
  
--- a/xen/tools/gen-cpuid.py
+++ b/xen/tools/gen-cpuid.py
@@ -265,10 +265,10 @@ def crunch_numbers(state):
                    AVX512BW, AVX512VL, AVX512_4VNNIW, AVX512_4FMAPS,
                    AVX512_VPOPCNTDQ],
  
-        # AVX512 extensions acting solely on vectors of bytes/words are made
+        # AVX512 extensions acting on vectors of bytes/words are made
          # dependents of AVX512BW (as to requiring wider than 16-bit mask
          # registers), despite the SDM not formally making this connection.
-        AVX512BW: [AVX512_BF16, AVX512_VBMI],
+        AVX512BW: [AVX512_BF16, AVX512_VBMI, AVX512_VBMI2],
  
          # The features:
          #   * Single Thread Indirect Branch Predictors

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH v9 01/23] x86emul: support AVX512{F,_VBMI2} compress/expand insns

Posted by Andrew Cooper 2 weeks ago
On 01/07/2019 12:16, Jan Beulich wrote:
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

[Xen-devel] [PATCH v9 02/23] x86emul: support remaining misc AVX512{F,BW} insns

Posted by Jan Beulich 2 weeks ago
This completes support of AVX512BW in the insn emulator, and leaves just
the scatter/gather ones open in the AVX512F set.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
v5: New.
---
TBD: The *blendm* built-in functions don't reliably produce the intended
      insns, as the respective moves are about as good a fit for the
      compiler when looking for a match for the intended operation. We'd
      need to switch to inline assembly if we wanted to guarantee the
      testing of those insns. Thoughts?

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -105,6 +105,8 @@ enum esz {
  
  static const struct test avx512f_all[] = {
      INSN_FP(add,             0f, 58),
+    INSN(align,        66, 0f3a, 03,    vl,     dq, vl),
+    INSN(blendm,       66, 0f38, 65,    vl,     sd, vl),
      INSN(broadcastss,  66, 0f38, 18,    el,      d, el),
      INSN_FP(cmp,             0f, c2),
      INSN(comisd,       66,   0f, 2f,    el,      q, el),
@@ -207,6 +209,7 @@ static const struct test avx512f_all[] =
      INSN(paddq,        66,   0f, d4,    vl,      q, vl),
      INSN(pand,         66,   0f, db,    vl,     dq, vl),
      INSN(pandn,        66,   0f, df,    vl,     dq, vl),
+    INSN(pblendm,      66, 0f38, 64,    vl,     dq, vl),
  //       pbroadcast,   66, 0f38, 7c,          dq64
      INSN(pbroadcastd,  66, 0f38, 58,    el,      d, el),
      INSN(pbroadcastq,  66, 0f38, 59,    el,      q, el),
@@ -354,6 +357,7 @@ static const struct test avx512f_512[] =
  };
  
  static const struct test avx512bw_all[] = {
+    INSN(dbpsadbw,    66, 0f3a, 42,    vl,    b, vl),
      INSN(movdqu8,     f2,   0f, 6f,    vl,    b, vl),
      INSN(movdqu8,     f2,   0f, 7f,    vl,    b, vl),
      INSN(movdqu16,    f2,   0f, 6f,    vl,    w, vl),
@@ -373,6 +377,7 @@ static const struct test avx512bw_all[]
      INSN(palignr,     66, 0f3a, 0f,    vl,    b, vl),
      INSN(pavgb,       66,   0f, e0,    vl,    b, vl),
      INSN(pavgw,       66,   0f, e3,    vl,    w, vl),
+    INSN(pblendm,     66, 0f38, 66,    vl,   bw, vl),
      INSN(pbroadcastb, 66, 0f38, 78,    el,    b, el),
  //       pbroadcastb, 66, 0f38, 7a,           b
      INSN(pbroadcastw, 66, 0f38, 79,    el_2,  b, vl),
--- a/tools/tests/x86_emulator/simd.c
+++ b/tools/tests/x86_emulator/simd.c
@@ -297,7 +297,7 @@ static inline vec_t movlhps(vec_t x, vec
  #   define max(x, y) BR_(maxps, _mask, x, y, undef(), ~0)
  #   define min(x, y) BR_(minps, _mask, x, y, undef(), ~0)
  #  endif
-#  define mix(x, y) B(movaps, _mask, x, y, (0b0101010101010101 & ALL_TRUE))
+#  define mix(x, y) B(blendmps_, _mask, x, y, (0b1010101010101010 & ALL_TRUE))
  #  define scale(x, y) BR(scalefps, _mask, x, y, undef(), ~0)
  #  if VEC_SIZE == 64 && defined(__AVX512ER__)
  #   define recip(x) BR(rcp28ps, _mask, x, undef(), ~0)
@@ -370,7 +370,7 @@ static inline vec_t movlhps(vec_t x, vec
  #   define max(x, y) BR_(maxpd, _mask, x, y, undef(), ~0)
  #   define min(x, y) BR_(minpd, _mask, x, y, undef(), ~0)
  #  endif
-#  define mix(x, y) B(movapd, _mask, x, y, 0b01010101)
+#  define mix(x, y) B(blendmpd_, _mask, x, y, 0b10101010)
  #  define scale(x, y) BR(scalefpd, _mask, x, y, undef(), ~0)
  #  if VEC_SIZE == 64 && defined(__AVX512ER__)
  #   define recip(x) BR(rcp28pd, _mask, x, undef(), ~0)
@@ -564,8 +564,9 @@ static inline vec_t movlhps(vec_t x, vec
                               0b00011011, (vsi_t)undef(), ~0))
  #   define swap2(x) ((vec_t)B_(permvarsi, _mask, (vsi_t)(x), (vsi_t)(inv - 1), (vsi_t)undef(), ~0))
  #  endif
-#  define mix(x, y) ((vec_t)B(movdqa32_, _mask, (vsi_t)(x), (vsi_t)(y), \
-                              (0b0101010101010101 & ((1 << ELEM_COUNT) - 1))))
+#  define mix(x, y) ((vec_t)B(blendmd_, _mask, (vsi_t)(x), (vsi_t)(y), \
+                              (0b1010101010101010 & ((1 << ELEM_COUNT) - 1))))
+#  define rotr(x, n) ((vec_t)B(alignd, _mask, (vsi_t)(x), (vsi_t)(x), n, (vsi_t)undef(), ~0))
  #  define shrink1(x) ((half_t)B(pmovqd, _mask, (vdi_t)(x), (vsi_half_t){}, ~0))
  # elif INT_SIZE == 8 || UINT_SIZE == 8
  #  define broadcast(x) ({ \
@@ -602,7 +603,8 @@ static inline vec_t movlhps(vec_t x, vec
                               0b01001110, (vsi_t)undef(), ~0))
  #   define swap2(x) ((vec_t)B(permvardi, _mask, (vdi_t)(x), (vdi_t)(inv - 1), (vdi_t)undef(), ~0))
  #  endif
-#  define mix(x, y) ((vec_t)B(movdqa64_, _mask, (vdi_t)(x), (vdi_t)(y), 0b01010101))
+#  define mix(x, y) ((vec_t)B(blendmq_, _mask, (vdi_t)(x), (vdi_t)(y), 0b10101010))
+#  define rotr(x, n) ((vec_t)B(alignq, _mask, (vdi_t)(x), (vdi_t)(x), n, (vdi_t)undef(), ~0))
  #  if VEC_SIZE == 32
  #   define swap3(x) ((vec_t)B_(permdi, _mask, (vdi_t)(x), 0b00011011, (vdi_t)undef(), ~0))
  #  elif VEC_SIZE == 64
@@ -654,8 +656,8 @@ static inline vec_t movlhps(vec_t x, vec
  #   define interleave_hi(x, y) ((vec_t)B(vpermi2varqi, _mask, (vqi_t)(x), interleave_hi, (vqi_t)(y), ~0))
  #   define interleave_lo(x, y) ((vec_t)B(vpermt2varqi, _mask, interleave_lo, (vqi_t)(x), (vqi_t)(y), ~0))
  #  endif
-#  define mix(x, y) ((vec_t)B(movdquqi, _mask, (vqi_t)(x), (vqi_t)(y), \
-                              (0b0101010101010101010101010101010101010101010101010101010101010101LL & ALL_TRUE)))
+#  define mix(x, y) ((vec_t)B(blendmb_, _mask, (vqi_t)(x), (vqi_t)(y), \
+                              (0b1010101010101010101010101010101010101010101010101010101010101010LL & ALL_TRUE)))
  #  define shrink1(x) ((half_t)B(pmovwb, _mask, (vhi_t)(x), (vqi_half_t){}, ~0))
  #  define shrink2(x) ((quarter_t)B(pmovdb, _mask, (vsi_t)(x), (vqi_quarter_t){}, ~0))
  #  define shrink3(x) ((eighth_t)B(pmovqb, _mask, (vdi_t)(x), (vqi_eighth_t){}, ~0))
@@ -687,8 +689,8 @@ static inline vec_t movlhps(vec_t x, vec
  #   define interleave_hi(x, y) ((vec_t)B(vpermi2varhi, _mask, (vhi_t)(x), interleave_hi, (vhi_t)(y), ~0))
  #   define interleave_lo(x, y) ((vec_t)B(vpermt2varhi, _mask, interleave_lo, (vhi_t)(x), (vhi_t)(y), ~0))
  #  endif
-#  define mix(x, y) ((vec_t)B(movdquhi, _mask, (vhi_t)(x), (vhi_t)(y), \
-                              (0b01010101010101010101010101010101 & ALL_TRUE)))
+#  define mix(x, y) ((vec_t)B(blendmw_, _mask, (vhi_t)(x), (vhi_t)(y), \
+                              (0b10101010101010101010101010101010 & ALL_TRUE)))
  #  define shrink1(x) ((half_t)B(pmovdw, _mask, (vsi_t)(x), (vhi_half_t){}, ~0))
  #  define shrink2(x) ((quarter_t)B(pmovqw, _mask, (vdi_t)(x), (vhi_quarter_t){}, ~0))
  #  define swap2(x) ((vec_t)B(permvarhi, _mask, (vhi_t)(x), (vhi_t)(inv - 1), (vhi_t)undef(), ~0))
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -484,6 +484,7 @@ static const struct ext0f38_table {
      [0x5b] = { .simd_size = simd_256, .two_op = 1, .d8s = d8s_vl_by_2 },
      [0x62] = { .simd_size = simd_packed_int, .two_op = 1, .d8s = d8s_bw },
      [0x63] = { .simd_size = simd_packed_int, .to_mem = 1, .two_op = 1, .d8s = d8s_bw },
+    [0x64 ... 0x66] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
      [0x75 ... 0x76] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
      [0x77] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
      [0x78] = { .simd_size = simd_other, .two_op = 1 },
@@ -550,6 +551,7 @@ static const struct ext0f3a_table {
      [0x00] = { .simd_size = simd_packed_int, .two_op = 1, .d8s = d8s_vl },
      [0x01] = { .simd_size = simd_packed_fp, .two_op = 1, .d8s = d8s_vl },
      [0x02] = { .simd_size = simd_packed_int },
+    [0x03] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
      [0x04 ... 0x05] = { .simd_size = simd_packed_fp, .two_op = 1, .d8s = d8s_vl },
      [0x06] = { .simd_size = simd_packed_fp },
      [0x08 ... 0x09] = { .simd_size = simd_packed_fp, .two_op = 1, .d8s = d8s_vl },
@@ -581,8 +583,7 @@ static const struct ext0f3a_table {
      [0x3b] = { .simd_size = simd_256, .to_mem = 1, .two_op = 1, .d8s = d8s_vl_by_2 },
      [0x3e ... 0x3f] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
      [0x40 ... 0x41] = { .simd_size = simd_packed_fp },
-    [0x42] = { .simd_size = simd_packed_int },
-    [0x43] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
+    [0x42 ... 0x43] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
      [0x44] = { .simd_size = simd_packed_int },
      [0x46] = { .simd_size = simd_packed_int },
      [0x48 ... 0x49] = { .simd_size = simd_packed_fp, .four_op = 1 },
@@ -6178,6 +6179,8 @@ x86_emulate(
      case X86EMUL_OPC_EVEX_66(0x0f38, 0x47): /* vpsllv{d,q} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
      case X86EMUL_OPC_EVEX_66(0x0f38, 0x4c): /* vrcp14p{s,d} [xyz]mm/mem,[xyz]mm{k} */
      case X86EMUL_OPC_EVEX_66(0x0f38, 0x4e): /* vrsqrt14p{s,d} [xyz]mm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x64): /* vpblendm{d,q} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x65): /* vblendmp{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
      avx512f_no_sae:
          host_and_vcpu_must_have(avx512f);
          generate_exception_if(ea.type != OP_MEM && evex.brs, EXC_UD);
@@ -6937,6 +6940,7 @@ x86_emulate(
      case X86EMUL_OPC_EVEX_66(0x0f38, 0x0b): /* vpmulhrsw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
      case X86EMUL_OPC_EVEX_66(0x0f38, 0x1c): /* vpabsb [xyz]mm/mem,[xyz]mm{k} */
      case X86EMUL_OPC_EVEX_66(0x0f38, 0x1d): /* vpabsw [xyz]mm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x66): /* vpblendm{b,w} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
          host_and_vcpu_must_have(avx512bw);
          generate_exception_if(evex.brs, EXC_UD);
          elem_bytes = 1 << (b & 1);
@@ -8106,10 +8110,12 @@ x86_emulate(
          goto simd_0f_to_gpr;
  
      CASE_SIMD_PACKED_FP(_EVEX, 0x0f, 0xc6): /* vshufp{s,d} $imm8,[xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
-        fault_suppression = false;
          generate_exception_if(evex.w != (evex.pfx & VEX_PREFIX_DOUBLE_MASK),
                                EXC_UD);
          /* fall through */
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x03): /* valign{d,q} $imm8,[xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+        fault_suppression = false;
+        /* fall through */
      case X86EMUL_OPC_EVEX_66(0x0f3a, 0x25): /* vpternlog{d,q} $imm8,[xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
      avx512f_imm8_no_sae:
          host_and_vcpu_must_have(avx512f);
@@ -9450,6 +9456,9 @@ x86_emulate(
          insn_bytes = PFX_BYTES + 4;
          break;
  
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x42): /* vdbpsadbw $imm8,[xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+        generate_exception_if(evex.w, EXC_UD);
+        /* fall through */
      case X86EMUL_OPC_EVEX_66(0x0f3a, 0x0f): /* vpalignr $imm8,[xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
          fault_suppression = false;
          goto avx512bw_imm;

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

[Xen-devel] [PATCH v9 03/23] x86emul: prepare for AVX512F S/G insns

Posted by Jan Beulich 2 weeks ago
They require getting modrm_reg and sib_index set correctly in the EVEX
case, to account for the high 16 [XYZ]MM registers when used as
addressing index register. Extend the adjustments to modrm_rm as well,
such that x86_insn_modrm() would correctly report register numbers (this
was a latent issue only as we don't currently have callers of that
function which would care about an EVEX case).

The adjustment in turn requires dropping the assertion from decode_gpr()
as well as re-introducing the explicit masking, as we now need to
actively mask off the high bit when a GPR is meant.

_decode_gpr() invocations also need slight adjustments, when invoked in
generic code ahead of the main switch(). All other uses of modrm_reg and
modrm_rm already get suitably masked where necessary.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v9: New, split from main gather patch.

--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -3022,7 +3022,8 @@ x86_decode(
  
          d &= ~ModRM;
  #undef ModRM /* Only its aliases are valid to use from here on. */
-        modrm_reg = ((rex_prefix & 4) << 1) | ((modrm & 0x38) >> 3);
+        modrm_reg = ((rex_prefix & 4) << 1) | ((modrm & 0x38) >> 3) |
+                    ((evex_encoded() && !evex.R) << 4);
          modrm_rm  = modrm & 0x07;
  
          /*
@@ -3192,7 +3193,8 @@ x86_decode(
          if ( modrm_mod == 3 )
          {
              generate_exception_if(d & vSIB, EXC_UD);
-            modrm_rm |= (rex_prefix & 1) << 3;
+            modrm_rm |= ((rex_prefix & 1) << 3) |
+                        (evex_encoded() && !evex.x) << 4;
              ea.type = OP_REG;
          }
          else if ( ad_bytes == 2 )
@@ -3257,7 +3259,10 @@ x86_decode(
  
                  state->sib_index = ((sib >> 3) & 7) | ((rex_prefix << 2) & 8);
                  state->sib_scale = (sib >> 6) & 3;
-                if ( state->sib_index != 4 && !(d & vSIB) )
+                if ( unlikely(d & vSIB) )
+                    state->sib_index |= (mode_64bit() && evex_encoded() &&
+                                         !evex.RX) << 4;
+                else if ( state->sib_index != 4 )
                  {
                      ea.mem.off = *decode_gpr(state->regs, state->sib_index);
                      ea.mem.off <<= state->sib_scale;
@@ -3560,7 +3565,7 @@ x86_emulate(
      generate_exception_if(state->not_64bit && mode_64bit(), EXC_UD);
  
      if ( ea.type == OP_REG )
-        ea.reg = _decode_gpr(&_regs, modrm_rm, (d & ByteOp) && !rex_prefix);
+        ea.reg = _decode_gpr(&_regs, modrm_rm, (d & ByteOp) && !rex_prefix && !vex.opcx);
  
      memset(mmvalp, 0xaa /* arbitrary */, sizeof(*mmvalp));
  
@@ -3574,7 +3579,7 @@ x86_emulate(
          src.type = OP_REG;
          if ( d & ByteOp )
          {
-            src.reg = _decode_gpr(&_regs, modrm_reg, !rex_prefix);
+            src.reg = _decode_gpr(&_regs, modrm_reg, !rex_prefix && !vex.opcx);
              src.val = *(uint8_t *)src.reg;
              src.bytes = 1;
          }
@@ -3681,7 +3686,7 @@ x86_emulate(
          dst.type = OP_REG;
          if ( d & ByteOp )
          {
-            dst.reg = _decode_gpr(&_regs, modrm_reg, !rex_prefix);
+            dst.reg = _decode_gpr(&_regs, modrm_reg, !rex_prefix && !vex.opcx);
              dst.val = *(uint8_t *)dst.reg;
              dst.bytes = 1;
          }
--- a/xen/arch/x86/x86_emulate/x86_emulate.h
+++ b/xen/arch/x86/x86_emulate/x86_emulate.h
@@ -662,8 +662,6 @@ static inline unsigned long *decode_gpr(
      BUILD_BUG_ON(ARRAY_SIZE(cpu_user_regs_gpr_offsets) &
                   (ARRAY_SIZE(cpu_user_regs_gpr_offsets) - 1));
  
-    ASSERT(modrm < ARRAY_SIZE(cpu_user_regs_gpr_offsets));
-
      /* Note that this also acts as array_access_nospec() stand-in. */
      modrm &= ARRAY_SIZE(cpu_user_regs_gpr_offsets) - 1;
  

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH v9 03/23] x86emul: prepare for AVX512F S/G insns

Posted by Andrew Cooper 2 weeks ago
On 01/07/2019 12:17, Jan Beulich wrote:
> --- a/xen/arch/x86/x86_emulate/x86_emulate.h
> +++ b/xen/arch/x86/x86_emulate/x86_emulate.h
> @@ -662,8 +662,6 @@ static inline unsigned long *decode_gpr(
>       BUILD_BUG_ON(ARRAY_SIZE(cpu_user_regs_gpr_offsets) &
>                    (ARRAY_SIZE(cpu_user_regs_gpr_offsets) - 1));
>   
> -    ASSERT(modrm < ARRAY_SIZE(cpu_user_regs_gpr_offsets));
> -
>       /* Note that this also acts as array_access_nospec() stand-in. */

This comment needs adjusting to state that it is sometimes legitimate
for higher modrm bits to be set, and that truncating is the appropriate
action to take, so no one is tempted to put the ASSERT() back in.

With something along these lines, Reviewed-by: Andrew Cooper
<andrew.cooper3@citrix.com>

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

[Xen-devel] [PATCH v9 04/23] x86emul: test harness adjustments for AVX512F S/G insns

Posted by Jan Beulich 2 weeks ago
There was an encoding mistake in the EVEX Disp8 test code, which was
benign (due to %rdx getting set to zero) to all non-vSIB tests as it
mistakenly encoded <disp8>(%rdx,%rdx) instead of <disp8>(%rdx,%riz). In
the vSIB case this meant <disp8>(%rdx,%zmm2) instead of the intended
<disp8>(%rdx,%zmm4).

Likewise the access count check wasn't entirely correct for the S/G
case: In the quad-word-index but dword-data case only half the number
of full vector elements get accessed.

As an unrelated change in the main test harness source file distinguish
the "n/a" messages by bitness.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v9: New, split from main gather patch.

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -698,7 +698,7 @@ static void test_one(const struct test *
      instr[3] = evex.raw[2];
      instr[4] = test->opc;
      instr[5] = 0x44 | (test->ext << 3); /* ModR/M */
-    instr[6] = 0x12; /* SIB: base rDX, index none / xMM4 */
+    instr[6] = 0x22; /* SIB: base rDX, index none / xMM4 */
      instr[7] = 1; /* Disp8 */
      instr[8] = 0; /* immediate, if any */
  
@@ -718,7 +718,8 @@ static void test_one(const struct test *
           if ( accessed[i] )
               goto fail;
      for ( ; i < (test->scale == SC_vl ? vsz : esz) + (sg ? esz : vsz); ++i )
-         if ( accessed[i] != (sg ? vsz / esz : 1) )
+         if ( accessed[i] != (sg ? (vsz / esz) >> (test->opc & 1 & !evex.w)
+                                 : 1) )
               goto fail;
      for ( ; i < ARRAY_SIZE(accessed); ++i )
           if ( accessed[i] )
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -4260,7 +4260,7 @@ int main(int argc, char **argv)
  
          if ( !blobs[j].size )
          {
-            printf("%-39s n/a\n", blobs[j].name);
+            printf("%-39s n/a (%u-bit)\n", blobs[j].name, blobs[j].bitness);
              continue;
          }
  

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH v9 04/23] x86emul: test harness adjustments for AVX512F S/G insns

Posted by Andrew Cooper 2 weeks ago
On 01/07/2019 12:18, Jan Beulich wrote:
> There was an encoding mistake in the EVEX Disp8 test code, which was
> benign (due to %rdx getting set to zero) to all non-vSIB tests as it
> mistakenly encoded <disp8>(%rdx,%rdx) instead of <disp8>(%rdx,%riz). In
> the vSIB case this meant <disp8>(%rdx,%zmm2) instead of the intended
> <disp8>(%rdx,%zmm4).
>
> Likewise the access count check wasn't entirely correct for the S/G
> case: In the quad-word-index but dword-data case only half the number
> of full vector elements get accessed.
>
> As an unrelated change in the main test harness source file distinguish
> the "n/a" messages by bitness.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

[Xen-devel] [PATCH v9 05/23] x86emul: support AVX512F gather insns

Posted by Jan Beulich 2 weeks ago
Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v9: Suppress general register update upon failures. Split out ModR/M
     handling changes as well as independent test harness ones into
     prereq patches. Re-base.
v8: Re-base.
v7: Fix ByteOp register decode. Re-base.
v6: New.

--- a/tools/tests/x86_emulator/Makefile
+++ b/tools/tests/x86_emulator/Makefile
@@ -18,7 +18,7 @@ CFLAGS += $(CFLAGS_xeninclude)
  
  SIMD := 3dnow sse sse2 sse4 avx avx2 xop avx512f avx512bw avx512dq avx512er
  FMA := fma4 fma
-SG := avx2-sg
+SG := avx2-sg avx512f-sg avx512vl-sg
  TESTCASES := blowfish $(SIMD) $(FMA) $(SG)
  
  OPMASK := avx512f avx512dq avx512bw
@@ -66,6 +66,14 @@ xop-flts := $(avx-flts)
  avx512f-vecs := 64 16 32
  avx512f-ints := 4 8
  avx512f-flts := 4 8
+avx512f-sg-vecs := 64
+avx512f-sg-idxs := 4 8
+avx512f-sg-ints := $(avx512f-ints)
+avx512f-sg-flts := $(avx512f-flts)
+avx512vl-sg-vecs := 16 32
+avx512vl-sg-idxs := $(avx512f-sg-idxs)
+avx512vl-sg-ints := $(avx512f-ints)
+avx512vl-sg-flts := $(avx512f-flts)
  avx512bw-vecs := $(avx512f-vecs)
  avx512bw-ints := 1 2
  avx512bw-flts :=
--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -176,6 +176,8 @@ static const struct test avx512f_all[] =
      INSN(fnmsub213,    66, 0f38, af,    el,     sd, el),
      INSN(fnmsub231,    66, 0f38, be,    vl,     sd, vl),
      INSN(fnmsub231,    66, 0f38, bf,    el,     sd, el),
+    INSN(gatherd,      66, 0f38, 92,    vl,     sd, el),
+    INSN(gatherq,      66, 0f38, 93,    vl,     sd, el),
      INSN(getexp,       66, 0f38, 42,    vl,     sd, vl),
      INSN(getexp,       66, 0f38, 43,    el,     sd, el),
      INSN(getmant,      66, 0f3a, 26,    vl,     sd, vl),
@@ -229,6 +231,8 @@ static const struct test avx512f_all[] =
      INSN(permt2,       66, 0f38, 7e,    vl,     dq, vl),
      INSN(permt2,       66, 0f38, 7f,    vl,     sd, vl),
      INSN(pexpand,      66, 0f38, 89,    vl,     dq, el),
+    INSN(pgatherd,     66, 0f38, 90,    vl,     dq, el),
+    INSN(pgatherq,     66, 0f38, 91,    vl,     dq, el),
      INSN(pmaxs,        66, 0f38, 3d,    vl,     dq, vl),
      INSN(pmaxu,        66, 0f38, 3f,    vl,     dq, vl),
      INSN(pmins,        66, 0f38, 39,    vl,     dq, vl),
--- a/tools/tests/x86_emulator/simd-sg.c
+++ b/tools/tests/x86_emulator/simd-sg.c
@@ -35,13 +35,78 @@ typedef long long __attribute__((vector_
  #define ITEM_COUNT (VEC_SIZE / ELEM_SIZE < IVEC_SIZE / IDX_SIZE ? \
                      VEC_SIZE / ELEM_SIZE : IVEC_SIZE / IDX_SIZE)
  
-#if VEC_SIZE == 16
-# define to_bool(cmp) __builtin_ia32_ptestc128(cmp, (vec_t){} == 0)
-#else
-# define to_bool(cmp) __builtin_ia32_ptestc256(cmp, (vec_t){} == 0)
-#endif
+#if defined(__AVX512F__)
+# define ALL_TRUE (~0ULL >> (64 - ELEM_COUNT))
+# if ELEM_SIZE == 4
+#  if IDX_SIZE == 4 || defined(__AVX512VL__)
+#   define to_mask(msk) B(ptestmd, , (vsi_t)(msk), (vsi_t)(msk), ~0)
+#   define eq(x, y) (B(pcmpeqd, _mask, (vsi_t)(x), (vsi_t)(y), -1) == ALL_TRUE)
+#  else
+#   define widen(x) __builtin_ia32_pmovzxdq512_mask((vsi_t)(x), (idi_t){}, ~0)
+#   define to_mask(msk) __builtin_ia32_ptestmq512(widen(msk), widen(msk), ~0)
+#   define eq(x, y) (__builtin_ia32_pcmpeqq512_mask(widen(x), widen(y), ~0) == ALL_TRUE)
+#  endif
+#  define BG_(dt, it, reg, mem, idx, msk, scl) \
+    __builtin_ia32_gather##it##dt(reg, mem, idx, to_mask(msk), scl)
+# else
+#  define eq(x, y) (B(pcmpeqq, _mask, (vdi_t)(x), (vdi_t)(y), -1) == ALL_TRUE)
+#  define BG_(dt, it, reg, mem, idx, msk, scl) \
+    __builtin_ia32_gather##it##dt(reg, mem, idx, B(ptestmq, , (vdi_t)(msk), (vdi_t)(msk), ~0), scl)
+# endif
+/*
+ * Instead of replicating the main IDX_SIZE conditional below three times, use
+ * a double layer of macro invocations, allowing for substitution of the
+ * respective relevant macro argument tokens.
+ */
+# define BG(dt, it, reg, mem, idx, msk, scl) BG_(dt, it, reg, mem, idx, msk, scl)
+# if VEC_MAX < 64
+/*
+ * The sub-512-bit built-ins have an extra "3" infix, presumably because the
+ * 512-bit names were chosen without the AVX512VL extension in mind (and hence
+ * making the latter collide with the AVX2 ones).
+ */
+#  define si 3si
+#  define di 3di
+# endif
+# if VEC_MAX == 16
+#  define v8df v2df
+#  define v8di v2di
+#  define v16sf v4sf
+#  define v16si v4si
+# elif VEC_MAX == 32
+#  define v8df v4df
+#  define v8di v4di
+#  define v16sf v8sf
+#  define v16si v8si
+# endif
+# if IDX_SIZE == 4
+#  if INT_SIZE == 4
+#   define gather(reg, mem, idx, msk, scl) BG(v16si, si, reg, mem, idx, msk, scl)
+#  elif INT_SIZE == 8
+#   define gather(reg, mem, idx, msk, scl) (vec_t)(BG(v8di, si, (vdi_t)(reg), mem, idx, msk, scl))
+#  elif FLOAT_SIZE == 4
+#   define gather(reg, mem, idx, msk, scl) BG(v16sf, si, reg, mem, idx, msk, scl)
+#  elif FLOAT_SIZE == 8
+#   define gather(reg, mem, idx, msk, scl) BG(v8df, si, reg, mem, idx, msk, scl)
+#  endif
+# elif IDX_SIZE == 8
+#  if INT_SIZE == 4
+#   define gather(reg, mem, idx, msk, scl) BG(v16si, di, reg, mem, (idi_t)(idx), msk, scl)
+#  elif INT_SIZE == 8
+#   define gather(reg, mem, idx, msk, scl) (vec_t)(BG(v8di, di, (vdi_t)(reg), mem, (idi_t)(idx), msk, scl))
+#  elif FLOAT_SIZE == 4
+#   define gather(reg, mem, idx, msk, scl) BG(v16sf, di, reg, mem, (idi_t)(idx), msk, scl)
+#  elif FLOAT_SIZE == 8
+#   define gather(reg, mem, idx, msk, scl) BG(v8df, di, reg, mem, (idi_t)(idx), msk, scl)
+#  endif
+# endif
+#elif defined(__AVX2__)
+# if VEC_SIZE == 16
+#  define to_bool(cmp) __builtin_ia32_ptestc128(cmp, (vec_t){} == 0)
+# else
+#  define to_bool(cmp) __builtin_ia32_ptestc256(cmp, (vec_t){} == 0)
+# endif
  
-#if defined(__AVX2__)
  # if VEC_MAX == 16
  #  if IDX_SIZE == 4
  #   if INT_SIZE == 4
@@ -111,6 +176,10 @@ typedef long long __attribute__((vector_
  # endif
  #endif
  
+#ifndef eq
+# define eq(x, y) to_bool((x) == (y))
+#endif
+
  #define GLUE_(x, y) x ## y
  #define GLUE(x, y) GLUE_(x, y)
  
@@ -119,6 +188,7 @@ typedef long long __attribute__((vector_
  #define PUT8(n)  PUT4(n),   PUT4((n) +  4)
  #define PUT16(n) PUT8(n),   PUT8((n) +  8)
  #define PUT32(n) PUT16(n), PUT16((n) + 16)
+#define PUT64(n) PUT32(n), PUT32((n) + 32)
  
  const typeof((vec_t){}[0]) array[] = {
      GLUE(PUT, VEC_MAX)(1),
@@ -174,7 +244,7 @@ int sg_test(void)
  
      y = gather(full, array + ITEM_COUNT, -idx, full, ELEM_SIZE);
  #if ITEM_COUNT == ELEM_COUNT
-    if ( !to_bool(y == x - 1) )
+    if ( !eq(y, x - 1) )
          return __LINE__;
  #else
      for ( i = 0; i < ITEM_COUNT; ++i )
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -22,6 +22,8 @@ asm ( ".pushsection .test, \"ax\", @prog
  #include "avx512dq-opmask.h"
  #include "avx512bw-opmask.h"
  #include "avx512f.h"
+#include "avx512f-sg.h"
+#include "avx512vl-sg.h"
  #include "avx512bw.h"
  #include "avx512dq.h"
  #include "avx512er.h"
@@ -90,11 +92,13 @@ static bool simd_check_avx512f(void)
      return cpu_has_avx512f;
  }
  #define simd_check_avx512f_opmask simd_check_avx512f
+#define simd_check_avx512f_sg simd_check_avx512f
  
  static bool simd_check_avx512f_vl(void)
  {
      return cpu_has_avx512f && cpu_has_avx512vl;
  }
+#define simd_check_avx512vl_sg simd_check_avx512f_vl
  
  static bool simd_check_avx512dq(void)
  {
@@ -291,6 +295,14 @@ static const struct {
      SIMD(AVX512F u32x16,      avx512f,      64u4),
      SIMD(AVX512F s64x8,       avx512f,      64i8),
      SIMD(AVX512F u64x8,       avx512f,      64u8),
+    SIMD(AVX512F S/G f32[16x32], avx512f_sg, 64x4f4),
+    SIMD(AVX512F S/G f64[ 8x32], avx512f_sg, 64x4f8),
+    SIMD(AVX512F S/G f32[ 8x64], avx512f_sg, 64x8f4),
+    SIMD(AVX512F S/G f64[ 8x64], avx512f_sg, 64x8f8),
+    SIMD(AVX512F S/G i32[16x32], avx512f_sg, 64x4i4),
+    SIMD(AVX512F S/G i64[ 8x32], avx512f_sg, 64x4i8),
+    SIMD(AVX512F S/G i32[ 8x64], avx512f_sg, 64x8i4),
+    SIMD(AVX512F S/G i64[ 8x64], avx512f_sg, 64x8i8),
      AVX512VL(VL f32x4,        avx512f,      16f4),
      AVX512VL(VL f64x2,        avx512f,      16f8),
      AVX512VL(VL f32x8,        avx512f,      32f4),
@@ -303,6 +315,22 @@ static const struct {
      AVX512VL(VL u64x2,        avx512f,      16u8),
      AVX512VL(VL s64x4,        avx512f,      32i8),
      AVX512VL(VL u64x4,        avx512f,      32u8),
+    SIMD(AVX512VL S/G f32[4x32], avx512vl_sg, 16x4f4),
+    SIMD(AVX512VL S/G f64[2x32], avx512vl_sg, 16x4f8),
+    SIMD(AVX512VL S/G f32[2x64], avx512vl_sg, 16x8f4),
+    SIMD(AVX512VL S/G f64[2x64], avx512vl_sg, 16x8f8),
+    SIMD(AVX512VL S/G f32[8x32], avx512vl_sg, 32x4f4),
+    SIMD(AVX512VL S/G f64[4x32], avx512vl_sg, 32x4f8),
+    SIMD(AVX512VL S/G f32[4x64], avx512vl_sg, 32x8f4),
+    SIMD(AVX512VL S/G f64[4x64], avx512vl_sg, 32x8f8),
+    SIMD(AVX512VL S/G i32[4x32], avx512vl_sg, 16x4i4),
+    SIMD(AVX512VL S/G i64[2x32], avx512vl_sg, 16x4i8),
+    SIMD(AVX512VL S/G i32[2x64], avx512vl_sg, 16x8i4),
+    SIMD(AVX512VL S/G i64[2x64], avx512vl_sg, 16x8i8),
+    SIMD(AVX512VL S/G i32[8x32], avx512vl_sg, 32x4i4),
+    SIMD(AVX512VL S/G i64[4x32], avx512vl_sg, 32x4i8),
+    SIMD(AVX512VL S/G i32[4x64], avx512vl_sg, 32x8i4),
+    SIMD(AVX512VL S/G i64[4x64], avx512vl_sg, 32x8i8),
      SIMD(AVX512BW s8x64,     avx512bw,      64i1),
      SIMD(AVX512BW u8x64,     avx512bw,      64u1),
      SIMD(AVX512BW s16x32,    avx512bw,      64i2),
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -499,7 +499,7 @@ static const struct ext0f38_table {
      [0x8c] = { .simd_size = simd_packed_int },
      [0x8d] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
      [0x8e] = { .simd_size = simd_packed_int, .to_mem = 1 },
-    [0x90 ... 0x93] = { .simd_size = simd_other, .vsib = 1 },
+    [0x90 ... 0x93] = { .simd_size = simd_other, .vsib = 1, .d8s = d8s_dq },
      [0x96 ... 0x98] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
      [0x99] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
      [0x9a] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
@@ -9100,6 +9100,133 @@ x86_emulate(
          put_stub(stub);
  
          if ( rc != X86EMUL_OKAY )
+            goto done;
+
+        state->simd_size = simd_none;
+        break;
+    }
+
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x90): /* vpgatherd{d,q} mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x91): /* vpgatherq{d,q} mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x92): /* vgatherdp{s,d} mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x93): /* vgatherqp{s,d} mem,[xyz]mm{k} */
+    {
+        typeof(evex) *pevex;
+        union {
+            int32_t dw[16];
+            int64_t qw[8];
+        } index;
+        bool done = false;
+
+        ASSERT(ea.type == OP_MEM);
+        generate_exception_if((!evex.opmsk || evex.brs || evex.z ||
+                               evex.reg != 0xf ||
+                               modrm_reg == state->sib_index),
+                              EXC_UD);
+        avx512_vlen_check(false);
+        host_and_vcpu_must_have(avx512f);
+        get_fpu(X86EMUL_FPU_zmm);
+
+        /* Read destination and index registers. */
+        opc = init_evex(stub);
+        pevex = copy_EVEX(opc, evex);
+        pevex->opcx = vex_0f;
+        opc[0] = 0x7f; /* vmovdqa{32,64} */
+        /*
+         * The register writeback below has to retain masked-off elements, but
+         * needs to clear upper portions in the index-wider-than-data cases.
+         * Therefore read (and write below) the full register. The alternative
+         * would have been to fiddle with the mask register used.
+         */
+        pevex->opmsk = 0;
+        /* Use (%rax) as destination and modrm_reg as source. */
+        pevex->b = 1;
+        opc[1] = (modrm_reg & 7) << 3;
+        pevex->RX = 1;
+        opc[2] = 0xc3;
+
+        invoke_stub("", "", "=m" (*mmvalp) : "a" (mmvalp));
+
+        pevex->pfx = vex_f3; /* vmovdqu{32,64} */
+        pevex->w = b & 1;
+        /* Switch to sib_index as source. */
+        pevex->r = !mode_64bit() || !(state->sib_index & 0x08);
+        pevex->R = !mode_64bit() || !(state->sib_index & 0x10);
+        opc[1] = (state->sib_index & 7) << 3;
+
+        invoke_stub("", "", "=m" (index) : "a" (&index));
+        put_stub(stub);
+
+        /* Clear untouched parts of the destination and mask values. */
+        n = 1 << (2 + evex.lr - ((b & 1) | evex.w));
+        op_bytes = 4 << evex.w;
+        memset((void *)mmvalp + n * op_bytes, 0, 64 - n * op_bytes);
+        op_mask &= (1 << n) - 1;
+
+        for ( i = 0; op_mask; ++i )
+        {
+            signed long idx = b & 1 ? index.qw[i] : index.dw[i];
+
+            if ( !(op_mask & (1 << i)) )
+                continue;
+
+            rc = ops->read(ea.mem.seg,
+                           truncate_ea(ea.mem.off + (idx << state->sib_scale)),
+                           (void *)mmvalp + i * op_bytes, op_bytes, ctxt);
+            if ( rc != X86EMUL_OKAY )
+            {
+                /*
+                 * If we've made some progress and the access did not fault,
+                 * force a retry instead. This is for example necessary to
+                 * cope with the limited capacity of HVM's MMIO cache.
+                 */
+                if ( rc != X86EMUL_EXCEPTION && done )
+                    rc = X86EMUL_RETRY;
+                break;
+            }
+
+            op_mask &= ~(1 << i);
+            done = true;
+
+#ifdef __XEN__
+            if ( op_mask && local_events_need_delivery() )
+            {
+                rc = X86EMUL_RETRY;
+                break;
+            }
+#endif
+        }
+
+        /* Write destination and mask registers. */
+        opc = init_evex(stub);
+        pevex = copy_EVEX(opc, evex);
+        pevex->opcx = vex_0f;
+        opc[0] = 0x6f; /* vmovdqa{32,64} */
+        pevex->opmsk = 0;
+        /* Use modrm_reg as destination and (%rax) as source. */
+        pevex->b = 1;
+        opc[1] = (modrm_reg & 7) << 3;
+        pevex->RX = 1;
+        opc[2] = 0xc3;
+
+        invoke_stub("", "", "+m" (*mmvalp) : "a" (mmvalp));
+
+        /*
+         * kmovw: This is VEX-encoded, so we can't use pevex. Avoid copy_VEX() etc
+         * as well, since we can easily use the 2-byte VEX form here.
+         */
+        opc -= EVEX_PFX_BYTES;
+        opc[0] = 0xc5;
+        opc[1] = 0xf8;
+        opc[2] = 0x90;
+        /* Use (%rax) as source. */
+        opc[3] = evex.opmsk << 3;
+        opc[4] = 0xc3;
+
+        invoke_stub("", "", "+m" (op_mask) : "a" (&op_mask));
+        put_stub(stub);
+
+        if ( rc != X86EMUL_OKAY )
              goto done;
  
          state->simd_size = simd_none;

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH v9 05/23] x86emul: support AVX512F gather insns

Posted by Andrew Cooper 2 weeks ago
On 01/07/2019 12:18, Jan Beulich wrote:
> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
> @@ -9100,6 +9100,133 @@ x86_emulate(
>           put_stub(stub);
>   
>           if ( rc != X86EMUL_OKAY )
> +            goto done;
> +
> +        state->simd_size = simd_none;
> +        break;
> +    }
> +
> +    case X86EMUL_OPC_EVEX_66(0x0f38, 0x90): /* vpgatherd{d,q} mem,[xyz]mm{k} */
> +    case X86EMUL_OPC_EVEX_66(0x0f38, 0x91): /* vpgatherq{d,q} mem,[xyz]mm{k} */
> +    case X86EMUL_OPC_EVEX_66(0x0f38, 0x92): /* vgatherdp{s,d} mem,[xyz]mm{k} */
> +    case X86EMUL_OPC_EVEX_66(0x0f38, 0x93): /* vgatherqp{s,d} mem,[xyz]mm{k} */
> +    {
> +        typeof(evex) *pevex;
> +        union {
> +            int32_t dw[16];
> +            int64_t qw[8];
> +        } index;
> +        bool done = false;
> +
> +        ASSERT(ea.type == OP_MEM);
> +        generate_exception_if((!evex.opmsk || evex.brs || evex.z ||
> +                               evex.reg != 0xf ||
> +                               modrm_reg == state->sib_index),
> +                              EXC_UD);
> +        avx512_vlen_check(false);
> +        host_and_vcpu_must_have(avx512f);
> +        get_fpu(X86EMUL_FPU_zmm);
> +
> +        /* Read destination and index registers. */
> +        opc = init_evex(stub);
> +        pevex = copy_EVEX(opc, evex);
> +        pevex->opcx = vex_0f;
> +        opc[0] = 0x7f; /* vmovdqa{32,64} */
> +        /*
> +         * The register writeback below has to retain masked-off elements, but
> +         * needs to clear upper portions in the index-wider-than-data cases.
> +         * Therefore read (and write below) the full register. The alternative
> +         * would have been to fiddle with the mask register used.
> +         */
> +        pevex->opmsk = 0;
> +        /* Use (%rax) as destination and modrm_reg as source. */
> +        pevex->b = 1;
> +        opc[1] = (modrm_reg & 7) << 3;
> +        pevex->RX = 1;
> +        opc[2] = 0xc3;
> +
> +        invoke_stub("", "", "=m" (*mmvalp) : "a" (mmvalp));
> +
> +        pevex->pfx = vex_f3; /* vmovdqu{32,64} */
> +        pevex->w = b & 1;
> +        /* Switch to sib_index as source. */
> +        pevex->r = !mode_64bit() || !(state->sib_index & 0x08);
> +        pevex->R = !mode_64bit() || !(state->sib_index & 0x10);
> +        opc[1] = (state->sib_index & 7) << 3;
> +
> +        invoke_stub("", "", "=m" (index) : "a" (&index));
> +        put_stub(stub);
> +
> +        /* Clear untouched parts of the destination and mask values. */
> +        n = 1 << (2 + evex.lr - ((b & 1) | evex.w));
> +        op_bytes = 4 << evex.w;
> +        memset((void *)mmvalp + n * op_bytes, 0, 64 - n * op_bytes);
> +        op_mask &= (1 << n) - 1;
> +
> +        for ( i = 0; op_mask; ++i )
> +        {
> +            signed long idx = b & 1 ? index.qw[i] : index.dw[i];

No signed.  However, surely this needs to be int64_t anyway, to function
correctly in a 32bit build of the test harness?

The SDM says VPGATHERQD is encodable in 32bit with quadword indices.

~Andrew

> +
> +            if ( !(op_mask & (1 << i)) )
> +                continue;
> +
> +            rc = ops->read(ea.mem.seg,
> +                           truncate_ea(ea.mem.off + (idx << state->sib_scale)),
> +                           (void *)mmvalp + i * op_bytes, op_bytes, ctxt);
> +            if ( rc != X86EMUL_OKAY )
> +            {
> +                /*
> +                 * If we've made some progress and the access did not fault,
> +                 * force a retry instead. This is for example necessary to
> +                 * cope with the limited capacity of HVM's MMIO cache.
> +                 */
> +                if ( rc != X86EMUL_EXCEPTION && done )
> +                    rc = X86EMUL_RETRY;
> +                break;
> +            }
> +
> +            op_mask &= ~(1 << i);
> +            done = true;
> +
> +#ifdef __XEN__
> +            if ( op_mask && local_events_need_delivery() )
> +            {
> +                rc = X86EMUL_RETRY;
> +                break;
> +            }
> +#endif
> +        }
> +
> +        /* Write destination and mask registers. */
> +        opc = init_evex(stub);
> +        pevex = copy_EVEX(opc, evex);
> +        pevex->opcx = vex_0f;
> +        opc[0] = 0x6f; /* vmovdqa{32,64} */
> +        pevex->opmsk = 0;
> +        /* Use modrm_reg as destination and (%rax) as source. */
> +        pevex->b = 1;
> +        opc[1] = (modrm_reg & 7) << 3;
> +        pevex->RX = 1;
> +        opc[2] = 0xc3;
> +
> +        invoke_stub("", "", "+m" (*mmvalp) : "a" (mmvalp));
> +
> +        /*
> +         * kmovw: This is VEX-encoded, so we can't use pevex. Avoid copy_VEX() etc
> +         * as well, since we can easily use the 2-byte VEX form here.
> +         */
> +        opc -= EVEX_PFX_BYTES;
> +        opc[0] = 0xc5;
> +        opc[1] = 0xf8;
> +        opc[2] = 0x90;
> +        /* Use (%rax) as source. */
> +        opc[3] = evex.opmsk << 3;
> +        opc[4] = 0xc3;
> +
> +        invoke_stub("", "", "+m" (op_mask) : "a" (&op_mask));
> +        put_stub(stub);
> +
> +        if ( rc != X86EMUL_OKAY )
>               goto done;
>   
>           state->simd_size = simd_none;
>


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH v9 05/23] x86emul: support AVX512F gather insns

Posted by Jan Beulich 2 weeks ago
On 04.07.2019 16:10, Andrew Cooper wrote:
> On 01/07/2019 12:18, Jan Beulich wrote:
>> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
>> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
>> @@ -9100,6 +9100,133 @@ x86_emulate(
>>            put_stub(stub);
>>    
>>            if ( rc != X86EMUL_OKAY )
>> +            goto done;
>> +
>> +        state->simd_size = simd_none;
>> +        break;
>> +    }
>> +
>> +    case X86EMUL_OPC_EVEX_66(0x0f38, 0x90): /* vpgatherd{d,q} mem,[xyz]mm{k} */
>> +    case X86EMUL_OPC_EVEX_66(0x0f38, 0x91): /* vpgatherq{d,q} mem,[xyz]mm{k} */
>> +    case X86EMUL_OPC_EVEX_66(0x0f38, 0x92): /* vgatherdp{s,d} mem,[xyz]mm{k} */
>> +    case X86EMUL_OPC_EVEX_66(0x0f38, 0x93): /* vgatherqp{s,d} mem,[xyz]mm{k} */
>> +    {
>> +        typeof(evex) *pevex;
>> +        union {
>> +            int32_t dw[16];
>> +            int64_t qw[8];
>> +        } index;
>> +        bool done = false;
>> +
>> +        ASSERT(ea.type == OP_MEM);
>> +        generate_exception_if((!evex.opmsk || evex.brs || evex.z ||
>> +                               evex.reg != 0xf ||
>> +                               modrm_reg == state->sib_index),
>> +                              EXC_UD);
>> +        avx512_vlen_check(false);
>> +        host_and_vcpu_must_have(avx512f);
>> +        get_fpu(X86EMUL_FPU_zmm);
>> +
>> +        /* Read destination and index registers. */
>> +        opc = init_evex(stub);
>> +        pevex = copy_EVEX(opc, evex);
>> +        pevex->opcx = vex_0f;
>> +        opc[0] = 0x7f; /* vmovdqa{32,64} */
>> +        /*
>> +         * The register writeback below has to retain masked-off elements, but
>> +         * needs to clear upper portions in the index-wider-than-data cases.
>> +         * Therefore read (and write below) the full register. The alternative
>> +         * would have been to fiddle with the mask register used.
>> +         */
>> +        pevex->opmsk = 0;
>> +        /* Use (%rax) as destination and modrm_reg as source. */
>> +        pevex->b = 1;
>> +        opc[1] = (modrm_reg & 7) << 3;
>> +        pevex->RX = 1;
>> +        opc[2] = 0xc3;
>> +
>> +        invoke_stub("", "", "=m" (*mmvalp) : "a" (mmvalp));
>> +
>> +        pevex->pfx = vex_f3; /* vmovdqu{32,64} */
>> +        pevex->w = b & 1;
>> +        /* Switch to sib_index as source. */
>> +        pevex->r = !mode_64bit() || !(state->sib_index & 0x08);
>> +        pevex->R = !mode_64bit() || !(state->sib_index & 0x10);
>> +        opc[1] = (state->sib_index & 7) << 3;
>> +
>> +        invoke_stub("", "", "=m" (index) : "a" (&index));
>> +        put_stub(stub);
>> +
>> +        /* Clear untouched parts of the destination and mask values. */
>> +        n = 1 << (2 + evex.lr - ((b & 1) | evex.w));
>> +        op_bytes = 4 << evex.w;
>> +        memset((void *)mmvalp + n * op_bytes, 0, 64 - n * op_bytes);
>> +        op_mask &= (1 << n) - 1;
>> +
>> +        for ( i = 0; op_mask; ++i )
>> +        {
>> +            signed long idx = b & 1 ? index.qw[i] : index.dw[i];
> 
> No signed.

Hmm - would you mind this remaining consistent with the AVX
counterpart code? (As an aside I continue to think it is a bad
thing to not have explicit "signed" when we actually mean signed
quantities, seeing the still large amount of plain short/int/long
uses that actually should be unsigned.)

>  However, surely this needs to be int64_t anyway, to function
> correctly in a 32bit build of the test harness?

No, only 32 bits (or less, when the scale factor is larger than 1)
will be used for address calculation. And again this is no
different from pre-existing AVX code.

> The SDM says VPGATHERQD is encodable in 32bit with quadword indices.

Sure, truncating to just 32-bit values due to 32-bit addressing.

Jan
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH v9 05/23] x86emul: support AVX512F gather insns

Posted by Andrew Cooper 2 weeks ago
On 04/07/2019 15:22, Jan Beulich wrote:
> On 04.07.2019 16:10, Andrew Cooper wrote:
>> On 01/07/2019 12:18, Jan Beulich wrote:
>>> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
>>> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
>>> @@ -9100,6 +9100,133 @@ x86_emulate(
>>>            put_stub(stub);
>>>    
>>>            if ( rc != X86EMUL_OKAY )
>>> +            goto done;
>>> +
>>> +        state->simd_size = simd_none;
>>> +        break;
>>> +    }
>>> +
>>> +    case X86EMUL_OPC_EVEX_66(0x0f38, 0x90): /* vpgatherd{d,q} mem,[xyz]mm{k} */
>>> +    case X86EMUL_OPC_EVEX_66(0x0f38, 0x91): /* vpgatherq{d,q} mem,[xyz]mm{k} */
>>> +    case X86EMUL_OPC_EVEX_66(0x0f38, 0x92): /* vgatherdp{s,d} mem,[xyz]mm{k} */
>>> +    case X86EMUL_OPC_EVEX_66(0x0f38, 0x93): /* vgatherqp{s,d} mem,[xyz]mm{k} */
>>> +    {
>>> +        typeof(evex) *pevex;
>>> +        union {
>>> +            int32_t dw[16];
>>> +            int64_t qw[8];
>>> +        } index;
>>> +        bool done = false;
>>> +
>>> +        ASSERT(ea.type == OP_MEM);
>>> +        generate_exception_if((!evex.opmsk || evex.brs || evex.z ||
>>> +                               evex.reg != 0xf ||
>>> +                               modrm_reg == state->sib_index),
>>> +                              EXC_UD);
>>> +        avx512_vlen_check(false);
>>> +        host_and_vcpu_must_have(avx512f);
>>> +        get_fpu(X86EMUL_FPU_zmm);
>>> +
>>> +        /* Read destination and index registers. */
>>> +        opc = init_evex(stub);
>>> +        pevex = copy_EVEX(opc, evex);
>>> +        pevex->opcx = vex_0f;
>>> +        opc[0] = 0x7f; /* vmovdqa{32,64} */
>>> +        /*
>>> +         * The register writeback below has to retain masked-off elements, but
>>> +         * needs to clear upper portions in the index-wider-than-data cases.
>>> +         * Therefore read (and write below) the full register. The alternative
>>> +         * would have been to fiddle with the mask register used.
>>> +         */
>>> +        pevex->opmsk = 0;
>>> +        /* Use (%rax) as destination and modrm_reg as source. */
>>> +        pevex->b = 1;
>>> +        opc[1] = (modrm_reg & 7) << 3;
>>> +        pevex->RX = 1;
>>> +        opc[2] = 0xc3;
>>> +
>>> +        invoke_stub("", "", "=m" (*mmvalp) : "a" (mmvalp));
>>> +
>>> +        pevex->pfx = vex_f3; /* vmovdqu{32,64} */
>>> +        pevex->w = b & 1;
>>> +        /* Switch to sib_index as source. */
>>> +        pevex->r = !mode_64bit() || !(state->sib_index & 0x08);
>>> +        pevex->R = !mode_64bit() || !(state->sib_index & 0x10);
>>> +        opc[1] = (state->sib_index & 7) << 3;
>>> +
>>> +        invoke_stub("", "", "=m" (index) : "a" (&index));
>>> +        put_stub(stub);
>>> +
>>> +        /* Clear untouched parts of the destination and mask values. */
>>> +        n = 1 << (2 + evex.lr - ((b & 1) | evex.w));
>>> +        op_bytes = 4 << evex.w;
>>> +        memset((void *)mmvalp + n * op_bytes, 0, 64 - n * op_bytes);
>>> +        op_mask &= (1 << n) - 1;
>>> +
>>> +        for ( i = 0; op_mask; ++i )
>>> +        {
>>> +            signed long idx = b & 1 ? index.qw[i] : index.dw[i];
>> No signed.
> Hmm - would you mind this remaining consistent with the AVX
> counterpart code? (As an aside I continue to think it is a bad
> thing to not have explicit "signed" when we actually mean signed
> quantities, seeing the still large amount of plain short/int/long
> uses that actually should be unsigned.)

That was conclusively objected to by multiple other committers, for a
number of reasons.

It is unfortunate that some examples slipped in, but as the coding style
is not changing, they should be taken out.

~Andrew

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH v9 05/23] x86emul: support AVX512F gather insns

Posted by Andrew Cooper 2 weeks ago
On 04/07/2019 15:10, Andrew Cooper wrote:
> On 01/07/2019 12:18, Jan Beulich wrote:
>> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
>> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
>> @@ -9100,6 +9100,133 @@ x86_emulate(
>>           put_stub(stub);
>>   
>>           if ( rc != X86EMUL_OKAY )
>> +            goto done;
>> +
>> +        state->simd_size = simd_none;
>> +        break;
>> +    }
>> +
>> +    case X86EMUL_OPC_EVEX_66(0x0f38, 0x90): /* vpgatherd{d,q} mem,[xyz]mm{k} */
>> +    case X86EMUL_OPC_EVEX_66(0x0f38, 0x91): /* vpgatherq{d,q} mem,[xyz]mm{k} */
>> +    case X86EMUL_OPC_EVEX_66(0x0f38, 0x92): /* vgatherdp{s,d} mem,[xyz]mm{k} */
>> +    case X86EMUL_OPC_EVEX_66(0x0f38, 0x93): /* vgatherqp{s,d} mem,[xyz]mm{k} */
>> +    {
>> +        typeof(evex) *pevex;
>> +        union {
>> +            int32_t dw[16];
>> +            int64_t qw[8];
>> +        } index;
>> +        bool done = false;
>> +
>> +        ASSERT(ea.type == OP_MEM);
>> +        generate_exception_if((!evex.opmsk || evex.brs || evex.z ||
>> +                               evex.reg != 0xf ||
>> +                               modrm_reg == state->sib_index),
>> +                              EXC_UD);
>> +        avx512_vlen_check(false);
>> +        host_and_vcpu_must_have(avx512f);
>> +        get_fpu(X86EMUL_FPU_zmm);
>> +
>> +        /* Read destination and index registers. */
>> +        opc = init_evex(stub);
>> +        pevex = copy_EVEX(opc, evex);
>> +        pevex->opcx = vex_0f;
>> +        opc[0] = 0x7f; /* vmovdqa{32,64} */
>> +        /*
>> +         * The register writeback below has to retain masked-off elements, but
>> +         * needs to clear upper portions in the index-wider-than-data cases.
>> +         * Therefore read (and write below) the full register. The alternative
>> +         * would have been to fiddle with the mask register used.
>> +         */
>> +        pevex->opmsk = 0;
>> +        /* Use (%rax) as destination and modrm_reg as source. */
>> +        pevex->b = 1;
>> +        opc[1] = (modrm_reg & 7) << 3;
>> +        pevex->RX = 1;
>> +        opc[2] = 0xc3;
>> +
>> +        invoke_stub("", "", "=m" (*mmvalp) : "a" (mmvalp));
>> +
>> +        pevex->pfx = vex_f3; /* vmovdqu{32,64} */
>> +        pevex->w = b & 1;
>> +        /* Switch to sib_index as source. */
>> +        pevex->r = !mode_64bit() || !(state->sib_index & 0x08);
>> +        pevex->R = !mode_64bit() || !(state->sib_index & 0x10);
>> +        opc[1] = (state->sib_index & 7) << 3;
>> +
>> +        invoke_stub("", "", "=m" (index) : "a" (&index));
>> +        put_stub(stub);
>> +
>> +        /* Clear untouched parts of the destination and mask values. */
>> +        n = 1 << (2 + evex.lr - ((b & 1) | evex.w));
>> +        op_bytes = 4 << evex.w;
>> +        memset((void *)mmvalp + n * op_bytes, 0, 64 - n * op_bytes);
>> +        op_mask &= (1 << n) - 1;
>> +
>> +        for ( i = 0; op_mask; ++i )
>> +        {
>> +            signed long idx = b & 1 ? index.qw[i] : index.dw[i];
> No signed.  However, surely this needs to be int64_t anyway, to function
> correctly in a 32bit build of the test harness?
>
> The SDM says VPGATHERQD is encodable in 32bit with quadword indices.
>
> ~Andrew
>
>> +
>> +            if ( !(op_mask & (1 << i)) )
>> +                continue;
>> +
>> +            rc = ops->read(ea.mem.seg,
>> +                           truncate_ea(ea.mem.off + (idx << state->sib_scale)),

Actually, what SDM says is:

"The scaled index may require more bits to represent than the address
bits used by the processor (e.g., in 32-bit mode, if the scale is
greater than one). In this case, the most significant bits beyond the
number of address bits are ignored."

That reads as if it means "ea.mem.off + (u32)(idx <<
state->sib_scale)".

However, given the overall truncation, I'm not sure how to confirm what
the real behaviour is.

~Andrew

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH v9 05/23] x86emul: support AVX512F gather insns

Posted by Jan Beulich 2 weeks ago
On 04.07.2019 16:16, Andrew Cooper wrote:
> On 04/07/2019 15:10, Andrew Cooper wrote:
>> On 01/07/2019 12:18, Jan Beulich wrote:
>>> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
>>> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
>>> @@ -9100,6 +9100,133 @@ x86_emulate(
>>>            put_stub(stub);
>>>    
>>>            if ( rc != X86EMUL_OKAY )
>>> +            goto done;
>>> +
>>> +        state->simd_size = simd_none;
>>> +        break;
>>> +    }
>>> +
>>> +    case X86EMUL_OPC_EVEX_66(0x0f38, 0x90): /* vpgatherd{d,q} mem,[xyz]mm{k} */
>>> +    case X86EMUL_OPC_EVEX_66(0x0f38, 0x91): /* vpgatherq{d,q} mem,[xyz]mm{k} */
>>> +    case X86EMUL_OPC_EVEX_66(0x0f38, 0x92): /* vgatherdp{s,d} mem,[xyz]mm{k} */
>>> +    case X86EMUL_OPC_EVEX_66(0x0f38, 0x93): /* vgatherqp{s,d} mem,[xyz]mm{k} */
>>> +    {
>>> +        typeof(evex) *pevex;
>>> +        union {
>>> +            int32_t dw[16];
>>> +            int64_t qw[8];
>>> +        } index;
>>> +        bool done = false;
>>> +
>>> +        ASSERT(ea.type == OP_MEM);
>>> +        generate_exception_if((!evex.opmsk || evex.brs || evex.z ||
>>> +                               evex.reg != 0xf ||
>>> +                               modrm_reg == state->sib_index),
>>> +                              EXC_UD);
>>> +        avx512_vlen_check(false);
>>> +        host_and_vcpu_must_have(avx512f);
>>> +        get_fpu(X86EMUL_FPU_zmm);
>>> +
>>> +        /* Read destination and index registers. */
>>> +        opc = init_evex(stub);
>>> +        pevex = copy_EVEX(opc, evex);
>>> +        pevex->opcx = vex_0f;
>>> +        opc[0] = 0x7f; /* vmovdqa{32,64} */
>>> +        /*
>>> +         * The register writeback below has to retain masked-off elements, but
>>> +         * needs to clear upper portions in the index-wider-than-data cases.
>>> +         * Therefore read (and write below) the full register. The alternative
>>> +         * would have been to fiddle with the mask register used.
>>> +         */
>>> +        pevex->opmsk = 0;
>>> +        /* Use (%rax) as destination and modrm_reg as source. */
>>> +        pevex->b = 1;
>>> +        opc[1] = (modrm_reg & 7) << 3;
>>> +        pevex->RX = 1;
>>> +        opc[2] = 0xc3;
>>> +
>>> +        invoke_stub("", "", "=m" (*mmvalp) : "a" (mmvalp));
>>> +
>>> +        pevex->pfx = vex_f3; /* vmovdqu{32,64} */
>>> +        pevex->w = b & 1;
>>> +        /* Switch to sib_index as source. */
>>> +        pevex->r = !mode_64bit() || !(state->sib_index & 0x08);
>>> +        pevex->R = !mode_64bit() || !(state->sib_index & 0x10);
>>> +        opc[1] = (state->sib_index & 7) << 3;
>>> +
>>> +        invoke_stub("", "", "=m" (index) : "a" (&index));
>>> +        put_stub(stub);
>>> +
>>> +        /* Clear untouched parts of the destination and mask values. */
>>> +        n = 1 << (2 + evex.lr - ((b & 1) | evex.w));
>>> +        op_bytes = 4 << evex.w;
>>> +        memset((void *)mmvalp + n * op_bytes, 0, 64 - n * op_bytes);
>>> +        op_mask &= (1 << n) - 1;
>>> +
>>> +        for ( i = 0; op_mask; ++i )
>>> +        {
>>> +            signed long idx = b & 1 ? index.qw[i] : index.dw[i];
>> No signed.  However, surely this needs to be int64_t anyway, to function
>> correctly in a 32bit build of the test harness?
>>
>> The SDM says VPGATHERQD is encodable in 32bit with quadword indices.
>>
>> ~Andrew
>>
>>> +
>>> +            if ( !(op_mask & (1 << i)) )
>>> +                continue;
>>> +
>>> +            rc = ops->read(ea.mem.seg,
>>> +                           truncate_ea(ea.mem.off + (idx << state->sib_scale)),
> 
> Actually, what SDM says is:
> 
> "The scaled index may require more bits to represent than the address
> bits used by the processor (e.g., in 32-bit mode, if the scale is
> greater than one). In this case, the most significant bits beyond the
> number of address bits are ignored."
> 
> That reads as if it means "ea.mem.off + (u32)(idx <<
> state->sib_scale)".

Why "reads as if"? What else could a 32-bit address computation look
like? (In essence truncate_ea() will truncate to 32 bits anyway when
32-bit addressing is in use, so the inner truncation is simply
redundant.)

Jan
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH v9 05/23] x86emul: support AVX512F gather insns

Posted by Andrew Cooper 2 weeks ago
On 04/07/2019 15:25, Jan Beulich wrote:
> On 04.07.2019 16:16, Andrew Cooper wrote:
>> On 04/07/2019 15:10, Andrew Cooper wrote:
>>> On 01/07/2019 12:18, Jan Beulich wrote:
>>>> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
>>>> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
>>>> @@ -9100,6 +9100,133 @@ x86_emulate(
>>>>            put_stub(stub);
>>>>    
>>>>            if ( rc != X86EMUL_OKAY )
>>>> +            goto done;
>>>> +
>>>> +        state->simd_size = simd_none;
>>>> +        break;
>>>> +    }
>>>> +
>>>> +    case X86EMUL_OPC_EVEX_66(0x0f38, 0x90): /* vpgatherd{d,q} mem,[xyz]mm{k} */
>>>> +    case X86EMUL_OPC_EVEX_66(0x0f38, 0x91): /* vpgatherq{d,q} mem,[xyz]mm{k} */
>>>> +    case X86EMUL_OPC_EVEX_66(0x0f38, 0x92): /* vgatherdp{s,d} mem,[xyz]mm{k} */
>>>> +    case X86EMUL_OPC_EVEX_66(0x0f38, 0x93): /* vgatherqp{s,d} mem,[xyz]mm{k} */
>>>> +    {
>>>> +        typeof(evex) *pevex;
>>>> +        union {
>>>> +            int32_t dw[16];
>>>> +            int64_t qw[8];
>>>> +        } index;
>>>> +        bool done = false;
>>>> +
>>>> +        ASSERT(ea.type == OP_MEM);
>>>> +        generate_exception_if((!evex.opmsk || evex.brs || evex.z ||
>>>> +                               evex.reg != 0xf ||
>>>> +                               modrm_reg == state->sib_index),
>>>> +                              EXC_UD);
>>>> +        avx512_vlen_check(false);
>>>> +        host_and_vcpu_must_have(avx512f);
>>>> +        get_fpu(X86EMUL_FPU_zmm);
>>>> +
>>>> +        /* Read destination and index registers. */
>>>> +        opc = init_evex(stub);
>>>> +        pevex = copy_EVEX(opc, evex);
>>>> +        pevex->opcx = vex_0f;
>>>> +        opc[0] = 0x7f; /* vmovdqa{32,64} */
>>>> +        /*
>>>> +         * The register writeback below has to retain masked-off elements, but
>>>> +         * needs to clear upper portions in the index-wider-than-data cases.
>>>> +         * Therefore read (and write below) the full register. The alternative
>>>> +         * would have been to fiddle with the mask register used.
>>>> +         */
>>>> +        pevex->opmsk = 0;
>>>> +        /* Use (%rax) as destination and modrm_reg as source. */
>>>> +        pevex->b = 1;
>>>> +        opc[1] = (modrm_reg & 7) << 3;
>>>> +        pevex->RX = 1;
>>>> +        opc[2] = 0xc3;
>>>> +
>>>> +        invoke_stub("", "", "=m" (*mmvalp) : "a" (mmvalp));
>>>> +
>>>> +        pevex->pfx = vex_f3; /* vmovdqu{32,64} */
>>>> +        pevex->w = b & 1;
>>>> +        /* Switch to sib_index as source. */
>>>> +        pevex->r = !mode_64bit() || !(state->sib_index & 0x08);
>>>> +        pevex->R = !mode_64bit() || !(state->sib_index & 0x10);
>>>> +        opc[1] = (state->sib_index & 7) << 3;
>>>> +
>>>> +        invoke_stub("", "", "=m" (index) : "a" (&index));
>>>> +        put_stub(stub);
>>>> +
>>>> +        /* Clear untouched parts of the destination and mask values. */
>>>> +        n = 1 << (2 + evex.lr - ((b & 1) | evex.w));
>>>> +        op_bytes = 4 << evex.w;
>>>> +        memset((void *)mmvalp + n * op_bytes, 0, 64 - n * op_bytes);
>>>> +        op_mask &= (1 << n) - 1;
>>>> +
>>>> +        for ( i = 0; op_mask; ++i )
>>>> +        {
>>>> +            signed long idx = b & 1 ? index.qw[i] : index.dw[i];
>>> No signed.  However, surely this needs to be int64_t anyway, to function
>>> correctly in a 32bit build of the test harness?
>>>
>>> The SDM says VPGATHERQD is encodable in 32bit with quadword indices.
>>>
>>> ~Andrew
>>>
>>>> +
>>>> +            if ( !(op_mask & (1 << i)) )
>>>> +                continue;
>>>> +
>>>> +            rc = ops->read(ea.mem.seg,
>>>> +                           truncate_ea(ea.mem.off + (idx << state->sib_scale)),
>> Actually, what SDM says is:
>>
>> "The scaled index may require more bits to represent than the address
>> bits used by the processor (e.g., in 32-bit mode, if the scale is
>> greater than one). In this case, the most significant bits beyond the
>> number of address bits are ignored."
>>
>> That reads as if it means "ea.mem.off + (u32)(idx <<
>> state->sib_scale)".
> Why "reads as if"? What else could a 32-bit address computation look
> like? (In essence truncate_ea() will truncate to 32 bits anyway when
> 32-bit addressing is in use, so the inner truncation is simply
> redundant.)

Ok - I think it will DTRT.

Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

[Xen-devel] [PATCH v9 06/23] x86emul: add high register S/G test cases

Posted by Jan Beulich 2 weeks ago
In order to verify that in particular the index register decoding works
correctly in the S/G emulation paths, add dedicated (64-bit only) cases
disallowing the compiler to use the lower registers. Other than in the
generic SIMD case, where occasional uses of %xmm or %ymm registers in
generated code cause various internal compiler errors when disallowing
use of all of the lower 16 registers (apparently due to insn templates
trying to use AVX2 encodings), doing so here in the AVX512F case looks
to be fine.

While the main goal here is the AVX512F case, add an AVX2 variant as
well.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
v6: New.

--- a/tools/tests/x86_emulator/Makefile
+++ b/tools/tests/x86_emulator/Makefile
@@ -147,6 +147,12 @@ $(foreach flavor,$(SIMD) $(FMA),$(eval $
  $(foreach flavor,$(SG),$(eval $(call simd-sg-defs,$(flavor))))
  $(foreach flavor,$(OPMASK),$(eval $(call opmask-defs,$(flavor))))
  
+first-string = $(shell for s in $(1); do echo "$$s"; break; done)
+
+avx2-sg-cflags-x86_64    := "-D_high $(foreach n,7 6 5 4 3 2 1,-ffixed-ymm$(n)) $(call first-string,$(avx2-sg-cflags))"
+avx512f-sg-cflags-x86_64 := "-D_higher $(foreach n,7 6 5 4 3 2 1,-ffixed-zmm$(n)) $(call first-string,$(avx512f-sg-cflags))"
+avx512f-sg-cflags-x86_64 += "-D_highest $(foreach n,15 14 13 12 11 10 9 8,-ffixed-zmm$(n)) $(call first-string,$(avx512f-sg-cflags-x86_64))"
+
  $(addsuffix .h,$(TESTCASES)): %.h: %.c testcase.mk Makefile
  	rm -f $@.new $*.bin
  	$(foreach arch,$(filter-out $(XEN_COMPILE_ARCH),x86_32) $(XEN_COMPILE_ARCH), \
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -266,6 +266,9 @@ static const struct {
      SIMD(AVX2 S/G i64[4x32],  avx2_sg,    32x4i8),
      SIMD(AVX2 S/G i32[4x64],  avx2_sg,    32x8i4),
      SIMD(AVX2 S/G i64[4x64],  avx2_sg,    32x8i8),
+#ifdef __x86_64__
+    SIMD_(64, AVX2 S/G %ymm8+, avx2_sg,     high),
+#endif
      SIMD(XOP 128bit single,       xop,      16f4),
      SIMD(XOP 256bit single,       xop,      32f4),
      SIMD(XOP 128bit double,       xop,      16f8),
@@ -303,6 +306,10 @@ static const struct {
      SIMD(AVX512F S/G i64[ 8x32], avx512f_sg, 64x4i8),
      SIMD(AVX512F S/G i32[ 8x64], avx512f_sg, 64x8i4),
      SIMD(AVX512F S/G i64[ 8x64], avx512f_sg, 64x8i8),
+#ifdef __x86_64__
+    SIMD_(64, AVX512F S/G %zmm8+, avx512f_sg, higher),
+    SIMD_(64, AVX512F S/G %zmm16+, avx512f_sg, highest),
+#endif
      AVX512VL(VL f32x4,        avx512f,      16f4),
      AVX512VL(VL f64x2,        avx512f,      16f8),
      AVX512VL(VL f32x8,        avx512f,      32f4),

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

[Xen-devel] [PATCH v9 07/23] x86emul: support AVX512F scatter insns

Posted by Jan Beulich 2 weeks ago
This completes support of AVX512F in the insn emulator.

Note that in the test harness there's a little bit of trickery needed to
get around the not fully consistent naming of AVX512VL gather and
scatter compiler built-ins. To suppress expansion of the "di" and "si"
tokens they get constructed by token concatenation in BS(), which is
different from BG().

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
TBD: I couldn't really decide whether to duplicate code or merge scatter
      into gather emulation.
---
v9: Suppress general register update upon failures.
v7: Re-base.
v6: New.

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -270,6 +270,8 @@ static const struct test avx512f_all[] =
      INSN(prolv,        66, 0f38, 15,    vl,     dq, vl),
      INSNX(pror,        66,   0f, 72, 0, vl,     dq, vl),
      INSN(prorv,        66, 0f38, 14,    vl,     dq, vl),
+    INSN(pscatterd,    66, 0f38, a0,    vl,     dq, el),
+    INSN(pscatterq,    66, 0f38, a1,    vl,     dq, el),
      INSN(pshufd,       66,   0f, 70,    vl,      d, vl),
      INSN(pslld,        66,   0f, f2,    el_4,    d, vl),
      INSNX(pslld,       66,   0f, 72, 6, vl,      d, vl),
@@ -305,6 +307,8 @@ static const struct test avx512f_all[] =
      INSN(rsqrt14,      66, 0f38, 4f,    el,     sd, el),
      INSN(scalef,       66, 0f38, 2c,    vl,     sd, vl),
      INSN(scalef,       66, 0f38, 2d,    el,     sd, el),
+    INSN(scatterd,     66, 0f38, a2,    vl,     sd, el),
+    INSN(scatterq,     66, 0f38, a3,    vl,     sd, el),
      INSN_PFP(shuf,           0f, c6),
      INSN_FP(sqrt,            0f, 51),
      INSN_FP(sub,             0f, 5c),
--- a/tools/tests/x86_emulator/simd-sg.c
+++ b/tools/tests/x86_emulator/simd-sg.c
@@ -48,10 +48,14 @@ typedef long long __attribute__((vector_
  #  endif
  #  define BG_(dt, it, reg, mem, idx, msk, scl) \
      __builtin_ia32_gather##it##dt(reg, mem, idx, to_mask(msk), scl)
+#  define BS_(dt, it, mem, idx, reg, msk, scl) \
+    __builtin_ia32_scatter##it##dt(mem, to_mask(msk), idx, reg, scl)
  # else
  #  define eq(x, y) (B(pcmpeqq, _mask, (vdi_t)(x), (vdi_t)(y), -1) == ALL_TRUE)
  #  define BG_(dt, it, reg, mem, idx, msk, scl) \
      __builtin_ia32_gather##it##dt(reg, mem, idx, B(ptestmq, , (vdi_t)(msk), (vdi_t)(msk), ~0), scl)
+#  define BS_(dt, it, mem, idx, reg, msk, scl) \
+    __builtin_ia32_scatter##it##dt(mem, B(ptestmq, , (vdi_t)(msk), (vdi_t)(msk), ~0), idx, reg, scl)
  # endif
  /*
   * Instead of replicating the main IDX_SIZE conditional below three times, use
@@ -59,6 +63,7 @@ typedef long long __attribute__((vector_
   * respective relevant macro argument tokens.
   */
  # define BG(dt, it, reg, mem, idx, msk, scl) BG_(dt, it, reg, mem, idx, msk, scl)
+# define BS(dt, it, mem, idx, reg, msk, scl) BS_(dt, it##i, mem, idx, reg, msk, scl)
  # if VEC_MAX < 64
  /*
   * The sub-512-bit built-ins have an extra "3" infix, presumably because the
@@ -82,22 +87,30 @@ typedef long long __attribute__((vector_
  # if IDX_SIZE == 4
  #  if INT_SIZE == 4
  #   define gather(reg, mem, idx, msk, scl) BG(v16si, si, reg, mem, idx, msk, scl)
+#   define scatter(mem, idx, reg, msk, scl) BS(v16si, s, mem, idx, reg, msk, scl)
  #  elif INT_SIZE == 8
  #   define gather(reg, mem, idx, msk, scl) (vec_t)(BG(v8di, si, (vdi_t)(reg), mem, idx, msk, scl))
+#   define scatter(mem, idx, reg, msk, scl) BS(v8di, s, mem, idx, (vdi_t)(reg), msk, scl)
  #  elif FLOAT_SIZE == 4
  #   define gather(reg, mem, idx, msk, scl) BG(v16sf, si, reg, mem, idx, msk, scl)
+#   define scatter(mem, idx, reg, msk, scl) BS(v16sf, s, mem, idx, reg, msk, scl)
  #  elif FLOAT_SIZE == 8
  #   define gather(reg, mem, idx, msk, scl) BG(v8df, si, reg, mem, idx, msk, scl)
+#   define scatter(mem, idx, reg, msk, scl) BS(v8df, s, mem, idx, reg, msk, scl)
  #  endif
  # elif IDX_SIZE == 8
  #  if INT_SIZE == 4
  #   define gather(reg, mem, idx, msk, scl) BG(v16si, di, reg, mem, (idi_t)(idx), msk, scl)
+#   define scatter(mem, idx, reg, msk, scl) BS(v16si, d, mem, (idi_t)(idx), reg, msk, scl)
  #  elif INT_SIZE == 8
  #   define gather(reg, mem, idx, msk, scl) (vec_t)(BG(v8di, di, (vdi_t)(reg), mem, (idi_t)(idx), msk, scl))
+#   define scatter(mem, idx, reg, msk, scl) BS(v8di, d, mem, (idi_t)(idx), (vdi_t)(reg), msk, scl)
  #  elif FLOAT_SIZE == 4
  #   define gather(reg, mem, idx, msk, scl) BG(v16sf, di, reg, mem, (idi_t)(idx), msk, scl)
+#   define scatter(mem, idx, reg, msk, scl) BS(v16sf, d, mem, (idi_t)(idx), reg, msk, scl)
  #  elif FLOAT_SIZE == 8
  #   define gather(reg, mem, idx, msk, scl) BG(v8df, di, reg, mem, (idi_t)(idx), msk, scl)
+#   define scatter(mem, idx, reg, msk, scl) BS(v8df, d, mem, (idi_t)(idx), reg, msk, scl)
  #  endif
  # endif
  #elif defined(__AVX2__)
@@ -195,6 +208,8 @@ const typeof((vec_t){}[0]) array[] = {
      GLUE(PUT, VEC_MAX)(VEC_MAX + 1)
  };
  
+typeof((vec_t){}[0]) out[VEC_MAX * 2];
+
  int sg_test(void)
  {
      unsigned int i;
@@ -275,5 +290,41 @@ int sg_test(void)
  # endif
  #endif
  
+#ifdef scatter
+
+    for ( i = 0; i < sizeof(out) / sizeof(*out); ++i )
+        out[i] = 0;
+
+    for ( i = 0; i < ITEM_COUNT; ++i )
+        x[i] = i + 1;
+
+    touch(x);
+
+    scatter(out, (idx_t){}, x, (vec_t){ 1 } != 0, 1);
+    if ( out[0] != 1 )
+        return __LINE__;
+    for ( i = 1; i < ITEM_COUNT; ++i )
+        if ( out[i] )
+            return __LINE__;
+
+    scatter(out, (idx_t){}, x, full, 1);
+    if ( out[0] != ITEM_COUNT )
+        return __LINE__;
+    for ( i = 1; i < ITEM_COUNT; ++i )
+        if ( out[i] )
+            return __LINE__;
+
+    scatter(out, idx, x, full, ELEM_SIZE);
+    for ( i = 1; i <= ITEM_COUNT; ++i )
+        if ( out[i] != i )
+            return __LINE__;
+
+    scatter(out, inv, x, full, ELEM_SIZE);
+    for ( i = 1; i <= ITEM_COUNT; ++i )
+        if ( out[i] != ITEM_COUNT + 1 - i )
+            return __LINE__;
+
+#endif
+
      return 0;
  }
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -508,6 +508,7 @@ static const struct ext0f38_table {
      [0x9d] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
      [0x9e] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
      [0x9f] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
+    [0xa0 ... 0xa3] = { .simd_size = simd_other, .vsib = 1, .d8s = d8s_dq },
      [0xa6 ... 0xa8] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
      [0xa9] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
      [0xaa] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
@@ -9312,6 +9313,105 @@ x86_emulate(
              avx512_vlen_check(true);
          goto simd_zmm;
  
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xa0): /* vpscatterd{d,q} [xyz]mm,mem{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xa1): /* vpscatterq{d,q} [xyz]mm,mem{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xa2): /* vscatterdp{s,d} [xyz]mm,mem{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xa3): /* vscatterqp{s,d} [xyz]mm,mem{k} */
+    {
+        typeof(evex) *pevex;
+        union {
+            int32_t dw[16];
+            int64_t qw[8];
+        } index;
+        bool done = false;
+
+        ASSERT(ea.type == OP_MEM);
+        fail_if(!ops->write);
+        generate_exception_if((!evex.opmsk || evex.brs || evex.z ||
+                               evex.reg != 0xf ||
+                               modrm_reg == state->sib_index),
+                              EXC_UD);
+        avx512_vlen_check(false);
+        host_and_vcpu_must_have(avx512f);
+        get_fpu(X86EMUL_FPU_zmm);
+
+        /* Read source and index registers. */
+        opc = init_evex(stub);
+        pevex = copy_EVEX(opc, evex);
+        pevex->opcx = vex_0f;
+        opc[0] = 0x7f; /* vmovdqa{32,64} */
+        /* Use (%rax) as destination and modrm_reg as source. */
+        pevex->b = 1;
+        opc[1] = (modrm_reg & 7) << 3;
+        pevex->RX = 1;
+        opc[2] = 0xc3;
+
+        invoke_stub("", "", "=m" (*mmvalp) : "a" (mmvalp));
+
+        pevex->pfx = vex_f3; /* vmovdqu{32,64} */
+        pevex->w = b & 1;
+        /* Switch to sib_index as source. */
+        pevex->r = !mode_64bit() || !(state->sib_index & 0x08);
+        pevex->R = !mode_64bit() || !(state->sib_index & 0x10);
+        opc[1] = (state->sib_index & 7) << 3;
+
+        invoke_stub("", "", "=m" (index) : "a" (&index));
+        put_stub(stub);
+
+        /* Clear untouched parts of the mask value. */
+        n = 1 << (2 + evex.lr - ((b & 1) | evex.w));
+        op_bytes = 4 << evex.w;
+        op_mask &= (1 << n) - 1;
+
+        for ( i = 0; op_mask; ++i )
+        {
+            signed long idx = b & 1 ? index.qw[i] : index.dw[i];
+
+            if ( !(op_mask & (1 << i)) )
+                continue;
+
+            rc = ops->write(ea.mem.seg,
+                            truncate_ea(ea.mem.off + (idx << state->sib_scale)),
+                            (void *)mmvalp + i * op_bytes, op_bytes, ctxt);
+            if ( rc != X86EMUL_OKAY )
+            {
+                /* See comment in gather emulation. */
+                if ( rc != X86EMUL_EXCEPTION && done )
+                    rc = X86EMUL_RETRY;
+                break;
+            }
+
+            op_mask &= ~(1 << i);
+            done = true;
+
+#ifdef __XEN__
+            if ( op_mask && local_events_need_delivery() )
+            {
+                rc = X86EMUL_RETRY;
+                break;
+            }
+#endif
+        }
+
+        /* Write mask register. See comment in gather emulation. */
+        opc = get_stub(stub);
+        opc[0] = 0xc5;
+        opc[1] = 0xf8;
+        opc[2] = 0x90;
+        /* Use (%rax) as source. */
+        opc[3] = evex.opmsk << 3;
+        opc[4] = 0xc3;
+
+        invoke_stub("", "", "+m" (op_mask) : "a" (&op_mask));
+        put_stub(stub);
+
+        if ( rc != X86EMUL_OKAY )
+            goto done;
+
+        state->simd_size = simd_none;
+        break;
+    }
+
      case X86EMUL_OPC(0x0f38, 0xc8):     /* sha1nexte xmm/m128,xmm */
      case X86EMUL_OPC(0x0f38, 0xc9):     /* sha1msg1 xmm/m128,xmm */
      case X86EMUL_OPC(0x0f38, 0xca):     /* sha1msg2 xmm/m128,xmm */

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH v9 07/23] x86emul: support AVX512F scatter insns

Posted by Andrew Cooper 2 weeks ago
On 01/07/2019 12:20, Jan Beulich wrote:
> This completes support of AVX512F in the insn emulator.
>
> Note that in the test harness there's a little bit of trickery needed to
> get around the not fully consistent naming of AVX512VL gather and
> scatter compiler built-ins. To suppress expansion of the "di" and "si"
> tokens they get constructed by token concatenation in BS(), which is
> different from BG().
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>, subject to the
resolution of the related comments on patch 5.

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

[Xen-devel] [PATCH v9 08/23] x86emul: support AVX512PF insns

Posted by Jan Beulich 2 weeks ago
Some adjustments are necessary to the EVEX Disp8 scaling test code to
account for the zero byte reads/writes, which get issued for the test
harness only.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v9: Suppress general register update upon failures. Re-base.
v8: #GP/#SS don't arise here. Add previously missed change to
     emul_test_init().
v7: Re-base.
v6: New.

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -520,6 +520,17 @@ static const struct test avx512er_512[]
      INSN(rsqrt28, 66, 0f38, cd, el, sd, el),
  };
  
+static const struct test avx512pf_512[] = {
+    INSNX(gatherpf0d,  66, 0f38, c6, 1, vl, sd, el),
+    INSNX(gatherpf0q,  66, 0f38, c7, 1, vl, sd, el),
+    INSNX(gatherpf1d,  66, 0f38, c6, 2, vl, sd, el),
+    INSNX(gatherpf1q,  66, 0f38, c7, 2, vl, sd, el),
+    INSNX(scatterpf0d, 66, 0f38, c6, 5, vl, sd, el),
+    INSNX(scatterpf0q, 66, 0f38, c7, 5, vl, sd, el),
+    INSNX(scatterpf1d, 66, 0f38, c6, 6, vl, sd, el),
+    INSNX(scatterpf1q, 66, 0f38, c7, 6, vl, sd, el),
+};
+
  static const struct test avx512_vbmi_all[] = {
      INSN(permb,         66, 0f38, 8d, vl, b, vl),
      INSN(permi2b,       66, 0f38, 75, vl, b, vl),
@@ -580,7 +591,7 @@ static bool record_access(enum x86_segme
  static int read(enum x86_segment seg, unsigned long offset, void *p_data,
                  unsigned int bytes, struct x86_emulate_ctxt *ctxt)
  {
-    if ( !record_access(seg, offset, bytes) )
+    if ( !record_access(seg, offset, bytes + !bytes) )
          return X86EMUL_UNHANDLEABLE;
      memset(p_data, 0, bytes);
      return X86EMUL_OKAY;
@@ -589,7 +600,7 @@ static int read(enum x86_segment seg, un
  static int write(enum x86_segment seg, unsigned long offset, void *p_data,
                   unsigned int bytes, struct x86_emulate_ctxt *ctxt)
  {
-    if ( !record_access(seg, offset, bytes) )
+    if ( !record_access(seg, offset, bytes + !bytes) )
          return X86EMUL_UNHANDLEABLE;
      return X86EMUL_OKAY;
  }
@@ -597,7 +608,7 @@ static int write(enum x86_segment seg, u
  static void test_one(const struct test *test, enum vl vl,
                       unsigned char *instr, struct x86_emulate_ctxt *ctxt)
  {
-    unsigned int vsz, esz, i;
+    unsigned int vsz, esz, i, n;
      int rc;
      bool sg = strstr(test->mnemonic, "gather") ||
                strstr(test->mnemonic, "scatter");
@@ -725,10 +736,20 @@ static void test_one(const struct test *
      for ( i = 0; i < (test->scale == SC_vl ? vsz : esz); ++i )
           if ( accessed[i] )
               goto fail;
-    for ( ; i < (test->scale == SC_vl ? vsz : esz) + (sg ? esz : vsz); ++i )
+
+    n = test->scale == SC_vl ? vsz : esz;
+    if ( !sg )
+        n += vsz;
+    else if ( !strstr(test->mnemonic, "pf") )
+        n += esz;
+    else
+        ++n;
+
+    for ( ; i < n; ++i )
           if ( accessed[i] != (sg ? (vsz / esz) >> (test->opc & 1 & !evex.w)
                                   : 1) )
               goto fail;
+
      for ( ; i < ARRAY_SIZE(accessed); ++i )
           if ( accessed[i] )
               goto fail;
@@ -887,6 +908,8 @@ void evex_disp8_test(void *instr, struct
      RUN(avx512dq, no128);
      RUN(avx512dq, 512);
      RUN(avx512er, 512);
+#define cpu_has_avx512pf cpu_has_avx512f
+    RUN(avx512pf, 512);
      RUN(avx512_vbmi, all);
      RUN(avx512_vbmi2, all);
  }
--- a/tools/tests/x86_emulator/x86-emulate.c
+++ b/tools/tests/x86_emulator/x86-emulate.c
@@ -73,6 +73,7 @@ bool emul_test_init(void)
       */
      cp.basic.movbe = true;
      cp.feat.adx = true;
+    cp.feat.avx512pf = cp.feat.avx512f;
      cp.feat.rdpid = true;
      cp.extd.clzero = true;
  
@@ -135,12 +136,14 @@ int emul_test_cpuid(
          res->c |= 1U << 22;
  
      /*
-     * The emulator doesn't itself use ADCX/ADOX/RDPID, so we can always run
-     * the respective tests.
+     * The emulator doesn't itself use ADCX/ADOX/RDPID nor the S/G prefetch
+     * insns, so we can always run the respective tests.
       */
      if ( leaf == 7 && subleaf == 0 )
      {
          res->b |= 1U << 19;
+        if ( res->b & (1U << 16) )
+            res->b |= 1U << 26;
          res->c |= 1U << 22;
      }
  
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -525,6 +525,7 @@ static const struct ext0f38_table {
      [0xbd] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
      [0xbe] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
      [0xbf] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
+    [0xc6 ... 0xc7] = { .simd_size = simd_other, .vsib = 1, .d8s = d8s_dq },
      [0xc8] = { .simd_size = simd_packed_fp, .two_op = 1, .d8s = d8s_vl },
      [0xc9] = { .simd_size = simd_other },
      [0xca] = { .simd_size = simd_packed_fp, .two_op = 1, .d8s = d8s_vl },
@@ -1871,6 +1872,7 @@ in_protmode(
  #define vcpu_has_smap()        (ctxt->cpuid->feat.smap)
  #define vcpu_has_clflushopt()  (ctxt->cpuid->feat.clflushopt)
  #define vcpu_has_clwb()        (ctxt->cpuid->feat.clwb)
+#define vcpu_has_avx512pf()    (ctxt->cpuid->feat.avx512pf)
  #define vcpu_has_avx512er()    (ctxt->cpuid->feat.avx512er)
  #define vcpu_has_sha()         (ctxt->cpuid->feat.sha)
  #define vcpu_has_avx512bw()    (ctxt->cpuid->feat.avx512bw)
@@ -9410,6 +9412,97 @@ x86_emulate(
  
          state->simd_size = simd_none;
          break;
+    }
+
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xc6):
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xc7):
+    {
+#ifndef __XEN__
+        typeof(evex) *pevex;
+        union {
+            int32_t dw[16];
+            int64_t qw[8];
+        } index;
+#endif
+
+        ASSERT(ea.type == OP_MEM);
+        generate_exception_if((!cpu_has_avx512f || !evex.opmsk || evex.brs ||
+                               evex.z || evex.reg != 0xf || evex.lr != 2),
+                              EXC_UD);
+
+        switch ( modrm_reg & 7 )
+        {
+        case 1: /* vgatherpf0{d,q}p{s,d} mem{k} */
+        case 2: /* vgatherpf1{d,q}p{s,d} mem{k} */
+        case 5: /* vscatterpf0{d,q}p{s,d} mem{k} */
+        case 6: /* vscatterpf1{d,q}p{s,d} mem{k} */
+            vcpu_must_have(avx512pf);
+            break;
+        default:
+            generate_exception(EXC_UD);
+        }
+
+        get_fpu(X86EMUL_FPU_zmm);
+
+#ifndef __XEN__
+        /*
+         * For the test harness perform zero byte memory accesses, such that
+         * in particular correct Disp8 scaling can be verified.
+         */
+        fail_if((modrm_reg & 4) && !ops->write);
+
+        /* Read index register. */
+        opc = init_evex(stub);
+        pevex = copy_EVEX(opc, evex);
+        pevex->opcx = vex_0f;
+        /* vmovdqu{32,64} */
+        opc[0] = 0x7f;
+        pevex->pfx = vex_f3;
+        pevex->w = b & 1;
+        /* Use (%rax) as destination and sib_index as source. */
+        pevex->b = 1;
+        opc[1] = (state->sib_index & 7) << 3;
+        pevex->r = !mode_64bit() || !(state->sib_index & 0x08);
+        pevex->R = !mode_64bit() || !(state->sib_index & 0x10);
+        pevex->RX = 1;
+        opc[2] = 0xc3;
+
+        invoke_stub("", "", "=m" (index) : "a" (&index));
+        put_stub(stub);
+
+        /* Clear untouched parts of the mask value. */
+        n = 1 << (4 - ((b & 1) | evex.w));
+        op_mask &= (1 << n) - 1;
+
+        for ( i = 0; rc == X86EMUL_OKAY && op_mask; ++i )
+        {
+            signed long idx = b & 1 ? index.qw[i] : index.dw[i];
+
+            if ( !(op_mask & (1 << i)) )
+                continue;
+
+            rc = (modrm_reg & 4
+                  ? ops->write
+                  : ops->read)(ea.mem.seg,
+                               truncate_ea(ea.mem.off +
+                                           (idx << state->sib_scale)),
+                               NULL, 0, ctxt);
+            if ( rc == X86EMUL_EXCEPTION )
+            {
+                /* Squash memory access related exceptions. */
+                x86_emul_reset_event(ctxt);
+                rc = X86EMUL_OKAY;
+            }
+
+            op_mask &= ~(1 << i);
+        }
+
+        if ( rc != X86EMUL_OKAY )
+            goto done;
+#endif
+
+        state->simd_size = simd_none;
+        break;
      }
  
      case X86EMUL_OPC(0x0f38, 0xc8):     /* sha1nexte xmm/m128,xmm */

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH v9 08/23] x86emul: support AVX512PF insns

Posted by Andrew Cooper 2 weeks ago
On 01/07/2019 12:20, Jan Beulich wrote:
> +        /* Clear untouched parts of the mask value. */
> +        n = 1 << (4 - ((b & 1) | evex.w));
> +        op_mask &= (1 << n) - 1;
> +
> +        for ( i = 0; rc == X86EMUL_OKAY && op_mask; ++i )
> +        {
> +            signed long idx = b & 1 ? index.qw[i] : index.dw[i];
> +
> +            if ( !(op_mask & (1 << i)) )
> +                continue;

It occurs from my recent foray into UBSAN that op_mask is 64 bits wide,
although it looks like n can be at maximum 16 in this specific case.

If nothing else, using (1u << i) would reduce the size of the UBSAN
build, but I expect we're soon going to have subtle bugs when we get to
the int8 instructions.

Are there current S/G instructions which can take 32 iterations?

~Andrew

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH v9 08/23] x86emul: support AVX512PF insns

Posted by Jan Beulich 2 weeks ago
On 04.07.2019 16:44, Andrew Cooper wrote:
> On 01/07/2019 12:20, Jan Beulich wrote:
>> +        /* Clear untouched parts of the mask value. */
>> +        n = 1 << (4 - ((b & 1) | evex.w));
>> +        op_mask &= (1 << n) - 1;
>> +
>> +        for ( i = 0; rc == X86EMUL_OKAY && op_mask; ++i )
>> +        {
>> +            signed long idx = b & 1 ? index.qw[i] : index.dw[i];
>> +
>> +            if ( !(op_mask & (1 << i)) )
>> +                continue;
> 
> It occurs from my recent foray into UBSAN that op_mask is 64 bits wide,
> although it looks like n can be at maximum 16 in this specific case.
> 
> If nothing else, using (1u << i) would reduce the size of the UBSAN
> build, but I expect we're soon going to have subtle bugs when we get to
> the int8 instructions.
> 
> Are there current S/G instructions which can take 32 iterations?

No, S/G insns currently only act on vector elements 32 or 64 bits
in size, which means 16 or 8 elements per vector max.

Jan

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH v9 08/23] x86emul: support AVX512PF insns

Posted by Andrew Cooper 2 weeks ago
On 04/07/2019 15:50, Jan Beulich wrote:
> On 04.07.2019 16:44, Andrew Cooper wrote:
>> On 01/07/2019 12:20, Jan Beulich wrote:
>>> +        /* Clear untouched parts of the mask value. */
>>> +        n = 1 << (4 - ((b & 1) | evex.w));
>>> +        op_mask &= (1 << n) - 1;
>>> +
>>> +        for ( i = 0; rc == X86EMUL_OKAY && op_mask; ++i )
>>> +        {
>>> +            signed long idx = b & 1 ? index.qw[i] : index.dw[i];
>>> +
>>> +            if ( !(op_mask & (1 << i)) )
>>> +                continue;
>> It occurs from my recent foray into UBSAN that op_mask is 64 bits wide,
>> although it looks like n can be at maximum 16 in this specific case.
>>
>> If nothing else, using (1u << i) would reduce the size of the UBSAN
>> build, but I expect we're soon going to have subtle bugs when we get to
>> the int8 instructions.
>>
>> Are there current S/G instructions which can take 32 iterations?
> No, S/G insns currently only act on vector elements 32 or 64 bits
> in size, which means 16 or 8 elements per vector max.

In which case we're fine for now.

Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

[Xen-devel] [PATCH v9 09/23] x86emul: support AVX512CD insns

Posted by Jan Beulich 2 weeks ago
Since the insns here and in particular their memory access patterns
follow the usual scheme I didn't think it was necessary to add
contrived tests specifically for them, beyond the Disp8 scaling ones.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
v9: Re-base.
v6: New.

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -458,6 +458,13 @@ static const struct test avx512bw_128[]
      INSN(pinsrw, 66,   0f, c4, el, w, el),
  };
  
+static const struct test avx512cd_all[] = {
+//       pbroadcastmb2q, f3, 0f38, 2a,      q
+//       pbroadcastmw2d, f3, 0f38, 3a,      d
+    INSN(pconflict,      66, 0f38, c4, vl, dq, vl),
+    INSN(plzcnt,         66, 0f38, 44, vl, dq, vl),
+};
+
  static const struct test avx512dq_all[] = {
      INSN_PFP(and,              0f, 54),
      INSN_PFP(andn,             0f, 55),
@@ -903,6 +910,7 @@ void evex_disp8_test(void *instr, struct
      RUN(avx512f, 512);
      RUN(avx512bw, all);
      RUN(avx512bw, 128);
+    RUN(avx512cd, all);
      RUN(avx512dq, all);
      RUN(avx512dq, 128);
      RUN(avx512dq, no128);
--- a/tools/tests/x86_emulator/x86-emulate.h
+++ b/tools/tests/x86_emulator/x86-emulate.h
@@ -138,6 +138,7 @@ static inline bool xcr0_mask(uint64_t ma
  #define cpu_has_avx512f   (cp.feat.avx512f  && xcr0_mask(0xe6))
  #define cpu_has_avx512dq  (cp.feat.avx512dq && xcr0_mask(0xe6))
  #define cpu_has_avx512er  (cp.feat.avx512er && xcr0_mask(0xe6))
+#define cpu_has_avx512cd  (cp.feat.avx512cd && xcr0_mask(0xe6))
  #define cpu_has_avx512bw  (cp.feat.avx512bw && xcr0_mask(0xe6))
  #define cpu_has_avx512vl  (cp.feat.avx512vl && xcr0_mask(0xe6))
  #define cpu_has_avx512_vbmi (cp.feat.avx512_vbmi && xcr0_mask(0xe6))
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -473,6 +473,7 @@ static const struct ext0f38_table {
      [0x41] = { .simd_size = simd_packed_int, .two_op = 1 },
      [0x42] = { .simd_size = simd_packed_fp, .two_op = 1, .d8s = d8s_vl },
      [0x43] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
+    [0x44] = { .simd_size = simd_packed_int, .two_op = 1, .d8s = d8s_vl },
      [0x45 ... 0x47] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
      [0x4c] = { .simd_size = simd_packed_fp, .two_op = 1, .d8s = d8s_vl },
      [0x4d] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
@@ -525,6 +526,7 @@ static const struct ext0f38_table {
      [0xbd] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
      [0xbe] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
      [0xbf] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
+    [0xc4] = { .simd_size = simd_packed_int, .two_op = 1, .d8s = d8s_vl },
      [0xc6 ... 0xc7] = { .simd_size = simd_other, .vsib = 1, .d8s = d8s_dq },
      [0xc8] = { .simd_size = simd_packed_fp, .two_op = 1, .d8s = d8s_vl },
      [0xc9] = { .simd_size = simd_other },
@@ -1874,6 +1876,7 @@ in_protmode(
  #define vcpu_has_clwb()        (ctxt->cpuid->feat.clwb)
  #define vcpu_has_avx512pf()    (ctxt->cpuid->feat.avx512pf)
  #define vcpu_has_avx512er()    (ctxt->cpuid->feat.avx512er)
+#define vcpu_has_avx512cd()    (ctxt->cpuid->feat.avx512cd)
  #define vcpu_has_sha()         (ctxt->cpuid->feat.sha)
  #define vcpu_has_avx512bw()    (ctxt->cpuid->feat.avx512bw)
  #define vcpu_has_avx512vl()    (ctxt->cpuid->feat.avx512vl)
@@ -8792,6 +8795,20 @@ x86_emulate(
          evex.opcx = vex_0f;
          goto vmovdqa;
  
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x2a): /* vpbroadcastmb2q k,[xyz]mm */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x3a): /* vpbroadcastmw2d k,[xyz]mm */
+        generate_exception_if((ea.type != OP_REG || evex.opmsk ||
+                               evex.w == ((b >> 4) & 1)),
+                              EXC_UD);
+        d |= TwoOp;
+        /* fall through */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xc4): /* vpconflict{d,q} [xyz]mm/mem,[xyz]mm{k} */
+        fault_suppression = false;
+        /* fall through */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x44): /* vplzcnt{d,q} [xyz]mm/mem,[xyz]mm{k} */
+        host_and_vcpu_must_have(avx512cd);
+        goto avx512f_no_sae;
+
      case X86EMUL_OPC_VEX_66(0x0f38, 0x2c): /* vmaskmovps mem,{x,y}mm,{x,y}mm */
      case X86EMUL_OPC_VEX_66(0x0f38, 0x2d): /* vmaskmovpd mem,{x,y}mm,{x,y}mm */
      case X86EMUL_OPC_VEX_66(0x0f38, 0x2e): /* vmaskmovps {x,y}mm,{x,y}mm,mem */
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -102,6 +102,7 @@
  #define cpu_has_rdseed          boot_cpu_has(X86_FEATURE_RDSEED)
  #define cpu_has_smap            boot_cpu_has(X86_FEATURE_SMAP)
  #define cpu_has_avx512er        boot_cpu_has(X86_FEATURE_AVX512ER)
+#define cpu_has_avx512cd        boot_cpu_has(X86_FEATURE_AVX512CD)
  #define cpu_has_sha             boot_cpu_has(X86_FEATURE_SHA)
  #define cpu_has_avx512bw        boot_cpu_has(X86_FEATURE_AVX512BW)
  #define cpu_has_avx512vl        boot_cpu_has(X86_FEATURE_AVX512VL)

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

[Xen-devel] [PATCH v9 10/23] x86emul: complete support of AVX512_VBMI insns

Posted by Jan Beulich 2 weeks ago
Also add testing of ones support for which was added before. Sadly gcc's
command line option naming is not in line with Intel's naming of the
feature, which makes it necessary to mis-name things in the test harness.

Since the only new insn here and in particular its memory access pattern
follows the usual scheme, I didn't think it was necessary to add a
contrived test specifically for it, beyond the Disp8 scaling one.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
v6: New.

--- a/tools/tests/x86_emulator/Makefile
+++ b/tools/tests/x86_emulator/Makefile
@@ -16,7 +16,7 @@ vpath %.c $(XEN_ROOT)/xen/lib/x86
  
  CFLAGS += $(CFLAGS_xeninclude)
  
-SIMD := 3dnow sse sse2 sse4 avx avx2 xop avx512f avx512bw avx512dq avx512er
+SIMD := 3dnow sse sse2 sse4 avx avx2 xop avx512f avx512bw avx512dq avx512er avx512vbmi
  FMA := fma4 fma
  SG := avx2-sg avx512f-sg avx512vl-sg
  TESTCASES := blowfish $(SIMD) $(FMA) $(SG)
@@ -83,6 +83,9 @@ avx512dq-flts := $(avx512f-flts)
  avx512er-vecs := 64
  avx512er-ints :=
  avx512er-flts := 4 8
+avx512vbmi-vecs := $(avx512bw-vecs)
+avx512vbmi-ints := $(avx512bw-ints)
+avx512vbmi-flts := $(avx512bw-flts)
  
  avx512f-opmask-vecs := 2
  avx512dq-opmask-vecs := 1 2
--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -542,6 +542,7 @@ static const struct test avx512_vbmi_all
      INSN(permb,         66, 0f38, 8d, vl, b, vl),
      INSN(permi2b,       66, 0f38, 75, vl, b, vl),
      INSN(permt2b,       66, 0f38, 7d, vl, b, vl),
+    INSN(pmultishiftqb, 66, 0f38, 83, vl, q, vl),
  };
  
  static const struct test avx512_vbmi2_all[] = {
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -27,6 +27,7 @@ asm ( ".pushsection .test, \"ax\", @prog
  #include "avx512bw.h"
  #include "avx512dq.h"
  #include "avx512er.h"
+#include "avx512vbmi.h"
  
  #define verbose false /* Switch to true for far more logging. */
  
@@ -127,6 +128,16 @@ static bool simd_check_avx512bw_vl(void)
      return cpu_has_avx512bw && cpu_has_avx512vl;
  }
  
+static bool simd_check_avx512vbmi(void)
+{
+    return cpu_has_avx512_vbmi;
+}
+
+static bool simd_check_avx512vbmi_vl(void)
+{
+    return cpu_has_avx512_vbmi && cpu_has_avx512vl;
+}
+
  static void simd_set_regs(struct cpu_user_regs *regs)
  {
      if ( cpu_has_mmx )
@@ -372,6 +383,18 @@ static const struct {
      SIMD(AVX512ER f32x16,    avx512er,      64f4),
      SIMD(AVX512ER f64 scalar,avx512er,        f8),
      SIMD(AVX512ER f64x8,     avx512er,      64f8),
+    SIMD(AVX512_VBMI s8x64,  avx512vbmi,    64i1),
+    SIMD(AVX512_VBMI u8x64,  avx512vbmi,    64u1),
+    SIMD(AVX512_VBMI s16x32, avx512vbmi,    64i2),
+    SIMD(AVX512_VBMI u16x32, avx512vbmi,    64u2),
+    AVX512VL(_VBMI+VL s8x16, avx512vbmi,    16i1),
+    AVX512VL(_VBMI+VL u8x16, avx512vbmi,    16u1),
+    AVX512VL(_VBMI+VL s8x32, avx512vbmi,    32i1),
+    AVX512VL(_VBMI+VL u8x32, avx512vbmi,    32u1),
+    AVX512VL(_VBMI+VL s16x8, avx512vbmi,    16i2),
+    AVX512VL(_VBMI+VL u16x8, avx512vbmi,    16u2),
+    AVX512VL(_VBMI+VL s16x16, avx512vbmi,   32i2),
+    AVX512VL(_VBMI+VL u16x16, avx512vbmi,   32u2),
  #undef AVX512VL_
  #undef AVX512VL
  #undef SIMD_
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -493,6 +493,7 @@ static const struct ext0f38_table {
      [0x7a ... 0x7c] = { .simd_size = simd_none, .two_op = 1 },
      [0x7d ... 0x7e] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
      [0x7f] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
+    [0x83] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
      [0x88] = { .simd_size = simd_packed_fp, .two_op = 1, .d8s = d8s_dq },
      [0x89] = { .simd_size = simd_packed_int, .two_op = 1, .d8s = d8s_dq },
      [0x8a] = { .simd_size = simd_packed_fp, .to_mem = 1, .two_op = 1, .d8s = d8s_dq },
@@ -8999,6 +9000,12 @@ x86_emulate(
          ASSERT(!state->simd_size);
          break;
  
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x83): /* vpmultishiftqb [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+        generate_exception_if(!evex.w, EXC_UD);
+        host_and_vcpu_must_have(avx512_vbmi);
+        fault_suppression = false;
+        goto avx512f_no_sae;
+
      case X86EMUL_OPC_VEX_66(0x0f38, 0x8c): /* vpmaskmov{d,q} mem,{x,y}mm,{x,y}mm */
      case X86EMUL_OPC_VEX_66(0x0f38, 0x8e): /* vpmaskmov{d,q} {x,y}mm,{x,y}mm,mem */
          generate_exception_if(ea.type != OP_MEM, EXC_UD);

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

[Xen-devel] [PATCH v9 11/23] x86emul: support of AVX512* population count insns

Posted by Jan Beulich 2 weeks ago
Plus the only other AVX512_BITALG one.

As in a few cases before, since the insns here and in particular their
memory access patterns follow the usual scheme, I didn't think it was
necessary to add a contrived test specifically for them, beyond the
Disp8 scaling one.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v9: Re-base.
v7: Re-base.
v6: New.

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -538,6 +538,11 @@ static const struct test avx512pf_512[]
      INSNX(scatterpf1q, 66, 0f38, c7, 6, vl, sd, el),
  };
  
+static const struct test avx512_bitalg_all[] = {
+    INSN(popcnt,      66, 0f38, 54, vl, bw, vl),
+    INSN(pshufbitqmb, 66, 0f38, 8f, vl,  b, vl),
+};
+
  static const struct test avx512_vbmi_all[] = {
      INSN(permb,         66, 0f38, 8d, vl, b, vl),
      INSN(permi2b,       66, 0f38, 75, vl, b, vl),
@@ -550,6 +555,10 @@ static const struct test avx512_vbmi2_al
      INSN(pexpand,   66, 0f38, 62, vl, bw, el),
  };
  
+static const struct test avx512_vpopcntdq_all[] = {
+    INSN(popcnt, 66, 0f38, 55, vl, dq, vl)
+};
+
  static const unsigned char vl_all[] = { VL_512, VL_128, VL_256 };
  static const unsigned char vl_128[] = { VL_128 };
  static const unsigned char vl_no128[] = { VL_512, VL_256 };
@@ -919,6 +928,8 @@ void evex_disp8_test(void *instr, struct
      RUN(avx512er, 512);
  #define cpu_has_avx512pf cpu_has_avx512f
      RUN(avx512pf, 512);
+    RUN(avx512_bitalg, all);
      RUN(avx512_vbmi, all);
      RUN(avx512_vbmi2, all);
+    RUN(avx512_vpopcntdq, all);
  }
--- a/tools/tests/x86_emulator/x86-emulate.h
+++ b/tools/tests/x86_emulator/x86-emulate.h
@@ -143,6 +143,8 @@ static inline bool xcr0_mask(uint64_t ma
  #define cpu_has_avx512vl  (cp.feat.avx512vl && xcr0_mask(0xe6))
  #define cpu_has_avx512_vbmi (cp.feat.avx512_vbmi && xcr0_mask(0xe6))
  #define cpu_has_avx512_vbmi2 (cp.feat.avx512_vbmi2 && xcr0_mask(0xe6))
+#define cpu_has_avx512_bitalg (cp.feat.avx512_bitalg && xcr0_mask(0xe6))
+#define cpu_has_avx512_vpopcntdq (cp.feat.avx512_vpopcntdq && xcr0_mask(0xe6))
  
  #define cpu_has_xgetbv1   (cpu_has_xsave && cp.xstate.xgetbv1)
  
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -479,6 +479,7 @@ static const struct ext0f38_table {
      [0x4d] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
      [0x4e] = { .simd_size = simd_packed_fp, .two_op = 1, .d8s = d8s_vl },
      [0x4f] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
+    [0x54 ... 0x55] = { .simd_size = simd_packed_int, .two_op = 1, .d8s = d8s_vl },
      [0x58] = { .simd_size = simd_other, .two_op = 1, .d8s = 2 },
      [0x59] = { .simd_size = simd_other, .two_op = 1, .d8s = 3 },
      [0x5a] = { .simd_size = simd_128, .two_op = 1, .d8s = 4 },
@@ -501,6 +502,7 @@ static const struct ext0f38_table {
      [0x8c] = { .simd_size = simd_packed_int },
      [0x8d] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
      [0x8e] = { .simd_size = simd_packed_int, .to_mem = 1 },
+    [0x8f] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
      [0x90 ... 0x93] = { .simd_size = simd_other, .vsib = 1, .d8s = d8s_dq },
      [0x96 ... 0x98] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
      [0x99] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
@@ -1883,6 +1885,8 @@ in_protmode(
  #define vcpu_has_avx512vl()    (ctxt->cpuid->feat.avx512vl)
  #define vcpu_has_avx512_vbmi() (ctxt->cpuid->feat.avx512_vbmi)
  #define vcpu_has_avx512_vbmi2() (ctxt->cpuid->feat.avx512_vbmi2)
+#define vcpu_has_avx512_bitalg() (ctxt->cpuid->feat.avx512_bitalg)
+#define vcpu_has_avx512_vpopcntdq() (ctxt->cpuid->feat.avx512_vpopcntdq)
  #define vcpu_has_rdpid()       (ctxt->cpuid->feat.rdpid)
  
  #define vcpu_must_have(feat) \
@@ -8899,6 +8903,19 @@ x86_emulate(
          generate_exception_if(vex.l, EXC_UD);
          goto simd_0f_avx;
  
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x8f): /* vpshufbitqmb [xyz]mm/mem,[xyz]mm,k{k} */
+        generate_exception_if(evex.w || !evex.r || !evex.R || evex.z, EXC_UD);
+        /* fall through */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x54): /* vpopcnt{b,w} [xyz]mm/mem,[xyz]mm{k} */
+        host_and_vcpu_must_have(avx512_bitalg);
+        generate_exception_if(evex.brs, EXC_UD);
+        elem_bytes = 1 << evex.w;
+        goto avx512f_no_sae;
+
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x55): /* vpopcnt{d,q} [xyz]mm/mem,[xyz]mm{k} */
+        host_and_vcpu_must_have(avx512_vpopcntdq);
+        goto avx512f_no_sae;
+
      case X86EMUL_OPC_VEX_66(0x0f38, 0x58): /* vpbroadcastd xmm/m32,{x,y}mm */
      case X86EMUL_OPC_VEX_66(0x0f38, 0x59): /* vpbroadcastq xmm/m64,{x,y}mm */
      case X86EMUL_OPC_VEX_66(0x0f38, 0x78): /* vpbroadcastb xmm/m8,{x,y}mm */
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -110,6 +110,8 @@
  /* CPUID level 0x00000007:0.ecx */
  #define cpu_has_avx512_vbmi     boot_cpu_has(X86_FEATURE_AVX512_VBMI)
  #define cpu_has_avx512_vbmi2    boot_cpu_has(X86_FEATURE_AVX512_VBMI2)
+#define cpu_has_avx512_bitalg   boot_cpu_has(X86_FEATURE_AVX512_BITALG)
+#define cpu_has_avx512_vpopcntdq boot_cpu_has(X86_FEATURE_AVX512_VPOPCNTDQ)
  #define cpu_has_rdpid           boot_cpu_has(X86_FEATURE_RDPID)
  
  /* CPUID level 0x80000007.edx */
--- a/xen/include/public/arch-x86/cpufeatureset.h
+++ b/xen/include/public/arch-x86/cpufeatureset.h
@@ -229,6 +229,7 @@ XEN_CPUFEATURE(UMIP,          6*32+ 2) /
  XEN_CPUFEATURE(PKU,           6*32+ 3) /*H  Protection Keys for Userspace */
  XEN_CPUFEATURE(OSPKE,         6*32+ 4) /*!  OS Protection Keys Enable */
  XEN_CPUFEATURE(AVX512_VBMI2,  6*32+ 6) /*A  Additional AVX-512 Vector Byte Manipulation Instrs */
+XEN_CPUFEATURE(AVX512_BITALG, 6*32+12) /*A  Support for VPOPCNT[B,W] and VPSHUFBITQMB */
  XEN_CPUFEATURE(AVX512_VPOPCNTDQ, 6*32+14) /*A  POPCNT for vectors of DW/QW */
  XEN_CPUFEATURE(RDPID,         6*32+22) /*A  RDPID instruction */
  
--- a/xen/tools/gen-cpuid.py
+++ b/xen/tools/gen-cpuid.py
@@ -268,7 +268,7 @@ def crunch_numbers(state):
          # AVX512 extensions acting on vectors of bytes/words are made
          # dependents of AVX512BW (as to requiring wider than 16-bit mask
          # registers), despite the SDM not formally making this connection.
-        AVX512BW: [AVX512_BF16, AVX512_VBMI, AVX512_VBMI2],
+        AVX512BW: [AVX512_BF16, AVX512_BITALG, AVX512_VBMI, AVX512_VBMI2],
  
          # The features:
          #   * Single Thread Indirect Branch Predictors

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH v9 11/23] x86emul: support of AVX512* population count insns

Posted by Andrew Cooper 2 weeks ago
On 01/07/2019 12:22, Jan Beulich wrote:
> --- a/xen/tools/gen-cpuid.py
> +++ b/xen/tools/gen-cpuid.py
> @@ -268,7 +268,7 @@ def crunch_numbers(state):
>           # AVX512 extensions acting on vectors of bytes/words are made
>           # dependents of AVX512BW (as to requiring wider than 16-bit mask
>           # registers), despite the SDM not formally making this connection.
> -        AVX512BW: [AVX512_BF16, AVX512_VBMI, AVX512_VBMI2],
> +        AVX512BW: [AVX512_BF16, AVX512_BITALG, AVX512_VBMI, AVX512_VBMI2],

BITALG should be after VBMI2, because everything in this table is
ordered by bit number.

With this fixed, Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH v9 11/23] x86emul: support of AVX512* population count insns

Posted by Jan Beulich 2 weeks ago
On 04.07.2019 16:47, Andrew Cooper wrote:
> On 01/07/2019 12:22, Jan Beulich wrote:
>> --- a/xen/tools/gen-cpuid.py
>> +++ b/xen/tools/gen-cpuid.py
>> @@ -268,7 +268,7 @@ def crunch_numbers(state):
>>            # AVX512 extensions acting on vectors of bytes/words are made
>>            # dependents of AVX512BW (as to requiring wider than 16-bit mask
>>            # registers), despite the SDM not formally making this connection.
>> -        AVX512BW: [AVX512_BF16, AVX512_VBMI, AVX512_VBMI2],
>> +        AVX512BW: [AVX512_BF16, AVX512_BITALG, AVX512_VBMI, AVX512_VBMI2],
> 
> BITALG should be after VBMI2, because everything in this table is
> ordered by bit number.

As said before - there's no ordering by bit number possible here.
The individual features may live on different (sub)leaves. By
what you say BF16 shouldn't be first. The list here clearly is
sorted alphabetically, and imo that's the only future proof sorting
possible (and also for AVX512F, where I had previously offered to
put together a patch to switch to alphabetical ordering, if only we
could agree on that).

> With this fixed, Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>

As per above I'm not going to apply this without hearing back from
you.

Jan
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH v9 11/23] x86emul: support of AVX512* population count insns

Posted by Andrew Cooper 2 weeks ago
On 04/07/2019 15:54, Jan Beulich wrote:
> On 04.07.2019 16:47, Andrew Cooper wrote:
>> On 01/07/2019 12:22, Jan Beulich wrote:
>>> --- a/xen/tools/gen-cpuid.py
>>> +++ b/xen/tools/gen-cpuid.py
>>> @@ -268,7 +268,7 @@ def crunch_numbers(state):
>>>            # AVX512 extensions acting on vectors of bytes/words are made
>>>            # dependents of AVX512BW (as to requiring wider than 16-bit mask
>>>            # registers), despite the SDM not formally making this connection.
>>> -        AVX512BW: [AVX512_BF16, AVX512_VBMI, AVX512_VBMI2],
>>> +        AVX512BW: [AVX512_BF16, AVX512_BITALG, AVX512_VBMI, AVX512_VBMI2],
>> BITALG should be after VBMI2, because everything in this table is
>> ordered by bit number.
> As said before - there's no ordering by bit number possible here.

It's perfectly easy.  Each feature has a unique number.

> The individual features may live on different (sub)leaves. By
> what you say BF16 shouldn't be first. The list here clearly is
> sorted alphabetically, and imo that's the only future proof sorting
> possible (and also for AVX512F, where I had previously offered to
> put together a patch to switch to alphabetical ordering, if only we
> could agree on that).

In which case I missed it during review.

This feature matrix is deliberately sorted by feature number in an
effort to preserve chronology, which is a much more useful way of
reasoning about feature dependencies.

~Andrew

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH v9 11/23] x86emul: support of AVX512* population count insns

Posted by Jan Beulich 2 weeks ago
On 04.07.2019 20:38, Andrew Cooper wrote:
> On 04/07/2019 15:54, Jan Beulich wrote:
>> On 04.07.2019 16:47, Andrew Cooper wrote:
>>> On 01/07/2019 12:22, Jan Beulich wrote:
>>>> --- a/xen/tools/gen-cpuid.py
>>>> +++ b/xen/tools/gen-cpuid.py
>>>> @@ -268,7 +268,7 @@ def crunch_numbers(state):
>>>>             # AVX512 extensions acting on vectors of bytes/words are made
>>>>             # dependents of AVX512BW (as to requiring wider than 16-bit mask
>>>>             # registers), despite the SDM not formally making this connection.
>>>> -        AVX512BW: [AVX512_BF16, AVX512_VBMI, AVX512_VBMI2],
>>>> +        AVX512BW: [AVX512_BF16, AVX512_BITALG, AVX512_VBMI, AVX512_VBMI2],
>>> BITALG should be after VBMI2, because everything in this table is
>>> ordered by bit number.
>> As said before - there's no ordering by bit number possible here.
> 
> Its perfectly easy.  Each feature has a unique number.

Well, okay, for sub-leaves of the same main leaf I can accept
this. But what sorting do you suggest between basic and extended
leaves?

>> The individual features may live on different (sub)leaves. By
>> what you say BF16 shouldn't be first. The list here clearly is
>> sorted alphabetically, and imo that's the only future proof sorting
>> possible (and also for AVX512F, where I had previously offered to
>> put together a patch to switch to alphabetical ordering, if only we
>> could agree on that).
> 
> In which case I missed it during review.
> 
> This feature matrix is deliberately sorted by feature number in an
> effort to preserve chronology, which is a much more useful way of
> reasoning about feature dependencies.

Except that bit numbers are often, but not always an indication of
chronological order.

While I clearly disagree, for there to be progress here, do you
expect me to re-arrange the dependency list above, i.e. going
beyond your initial request? I certainly object to doing _just_
the originally requested adjustment ...

Jan
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

[Xen-devel] [PATCH v9 12/23] x86emul: support of AVX512_IFMA insns

Posted by Jan Beulich 2 weeks ago
Once again take the liberty and also correct the (public interface) name
of the AVX512_IFMA feature flag to match the SDM, on the assumption that
no external consumer has actually been using that flag so far.

As in a few cases before, since the insns here and in particular their
memory access patterns follow the usual scheme, I didn't think it was
necessary to add a contrived test specifically for them, beyond the
Disp8 scaling one.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
v9: Re-base.
v7: Reject EVEX.W=0.
v6: New.

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -543,6 +543,11 @@ static const struct test avx512_bitalg_a
      INSN(pshufbitqmb, 66, 0f38, 8f, vl,  b, vl),
  };
  
+static const struct test avx512_ifma_all[] = {
+    INSN(pmadd52huq, 66, 0f38, b5, vl, q, vl),
+    INSN(pmadd52luq, 66, 0f38, b4, vl, q, vl),
+};
+
  static const struct test avx512_vbmi_all[] = {
      INSN(permb,         66, 0f38, 8d, vl, b, vl),
      INSN(permi2b,       66, 0f38, 75, vl, b, vl),
@@ -929,6 +934,7 @@ void evex_disp8_test(void *instr, struct
  #define cpu_has_avx512pf cpu_has_avx512f
      RUN(avx512pf, 512);
      RUN(avx512_bitalg, all);
+    RUN(avx512_ifma, all);
      RUN(avx512_vbmi, all);
      RUN(avx512_vbmi2, all);
      RUN(avx512_vpopcntdq, all);
--- a/tools/tests/x86_emulator/x86-emulate.h
+++ b/tools/tests/x86_emulator/x86-emulate.h
@@ -137,6 +137,7 @@ static inline bool xcr0_mask(uint64_t ma
  #define cpu_has_bmi2       cp.feat.bmi2
  #define cpu_has_avx512f   (cp.feat.avx512f  && xcr0_mask(0xe6))
  #define cpu_has_avx512dq  (cp.feat.avx512dq && xcr0_mask(0xe6))
+#define cpu_has_avx512_ifma (cp.feat.avx512_ifma && xcr0_mask(0xe6))
  #define cpu_has_avx512er  (cp.feat.avx512er && xcr0_mask(0xe6))
  #define cpu_has_avx512cd  (cp.feat.avx512cd && xcr0_mask(0xe6))
  #define cpu_has_avx512bw  (cp.feat.avx512bw && xcr0_mask(0xe6))
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -521,6 +521,7 @@ static const struct ext0f38_table {
      [0xad] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
      [0xae] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
      [0xaf] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
+    [0xb4 ... 0xb5] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
      [0xb6 ... 0xb8] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
      [0xb9] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
      [0xba] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
@@ -1875,6 +1876,7 @@ in_protmode(
  #define vcpu_has_rdseed()      (ctxt->cpuid->feat.rdseed)
  #define vcpu_has_adx()         (ctxt->cpuid->feat.adx)
  #define vcpu_has_smap()        (ctxt->cpuid->feat.smap)
+#define vcpu_has_avx512_ifma() (ctxt->cpuid->feat.avx512_ifma)
  #define vcpu_has_clflushopt()  (ctxt->cpuid->feat.clflushopt)
  #define vcpu_has_clwb()        (ctxt->cpuid->feat.clwb)
  #define vcpu_has_avx512pf()    (ctxt->cpuid->feat.avx512pf)
@@ -9455,6 +9457,12 @@ x86_emulate(
          break;
      }
  
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xb4): /* vpmadd52luq [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xb5): /* vpmadd52huq [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+        host_and_vcpu_must_have(avx512_ifma);
+        generate_exception_if(!evex.w, EXC_UD);
+        goto avx512f_no_sae;
+
      case X86EMUL_OPC_EVEX_66(0x0f38, 0xc6):
      case X86EMUL_OPC_EVEX_66(0x0f38, 0xc7):
      {
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -101,6 +101,7 @@
  #define cpu_has_avx512dq        boot_cpu_has(X86_FEATURE_AVX512DQ)
  #define cpu_has_rdseed          boot_cpu_has(X86_FEATURE_RDSEED)
  #define cpu_has_smap            boot_cpu_has(X86_FEATURE_SMAP)
+#define cpu_has_avx512_ifma     boot_cpu_has(X86_FEATURE_AVX512_IFMA)
  #define cpu_has_avx512er        boot_cpu_has(X86_FEATURE_AVX512ER)
  #define cpu_has_avx512cd        boot_cpu_has(X86_FEATURE_AVX512CD)
  #define cpu_has_sha             boot_cpu_has(X86_FEATURE_SHA)
--- a/xen/include/public/arch-x86/cpufeatureset.h
+++ b/xen/include/public/arch-x86/cpufeatureset.h
@@ -212,7 +212,7 @@ XEN_CPUFEATURE(AVX512DQ,      5*32+17) /
  XEN_CPUFEATURE(RDSEED,        5*32+18) /*A  RDSEED instruction */
  XEN_CPUFEATURE(ADX,           5*32+19) /*A  ADCX, ADOX instructions */
  XEN_CPUFEATURE(SMAP,          5*32+20) /*S  Supervisor Mode Access Prevention */
-XEN_CPUFEATURE(AVX512IFMA,    5*32+21) /*A  AVX-512 Integer Fused Multiply Add */
+XEN_CPUFEATURE(AVX512_IFMA,   5*32+21) /*A  AVX-512 Integer Fused Multiply Add */
  XEN_CPUFEATURE(CLFLUSHOPT,    5*32+23) /*A  CLFLUSHOPT instruction */
  XEN_CPUFEATURE(CLWB,          5*32+24) /*A  CLWB instruction */
  XEN_CPUFEATURE(AVX512PF,      5*32+26) /*A  AVX-512 Prefetch Instructions */
--- a/xen/tools/gen-cpuid.py
+++ b/xen/tools/gen-cpuid.py
@@ -261,7 +261,7 @@ def crunch_numbers(state):
          # (which in practice depends on the EVEX prefix to encode) as well
          # as mask registers, and the instructions themselves. All further
          # AVX512 features are built on top of AVX512F
-        AVX512F: [AVX512DQ, AVX512IFMA, AVX512PF, AVX512ER, AVX512CD,
+        AVX512F: [AVX512DQ, AVX512_IFMA, AVX512PF, AVX512ER, AVX512CD,
                    AVX512BW, AVX512VL, AVX512_4VNNIW, AVX512_4FMAPS,
                    AVX512_VPOPCNTDQ],
  

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

[Xen-devel] [PATCH v9 13/23] x86emul: support remaining AVX512_VBMI2 insns

Posted by Jan Beulich 2 weeks ago
As in a few cases before, since the insns here and in particular their
memory access patterns follow the usual scheme, I didn't think it was
necessary to add a contrived test specifically for them, beyond the
Disp8 scaling one.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
v7: Re-base over change earlier in the series.
v6: New.

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -558,6 +558,14 @@ static const struct test avx512_vbmi_all
  static const struct test avx512_vbmi2_all[] = {
      INSN(pcompress, 66, 0f38, 63, vl, bw, el),
      INSN(pexpand,   66, 0f38, 62, vl, bw, el),
+    INSN(pshld,     66, 0f3a, 71, vl, dq, vl),
+    INSN(pshldv,    66, 0f38, 71, vl, dq, vl),
+    INSN(pshldvw,   66, 0f38, 70, vl,  w, vl),
+    INSN(pshldw,    66, 0f3a, 70, vl,  w, vl),
+    INSN(pshrd,     66, 0f3a, 73, vl, dq, vl),
+    INSN(pshrdv,    66, 0f38, 73, vl, dq, vl),
+    INSN(pshrdvw,   66, 0f38, 72, vl,  w, vl),
+    INSN(pshrdw,    66, 0f3a, 72, vl,  w, vl),
  };
  
  static const struct test avx512_vpopcntdq_all[] = {
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -487,6 +487,7 @@ static const struct ext0f38_table {
      [0x62] = { .simd_size = simd_packed_int, .two_op = 1, .d8s = d8s_bw },
      [0x63] = { .simd_size = simd_packed_int, .to_mem = 1, .two_op = 1, .d8s = d8s_bw },
      [0x64 ... 0x66] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
+    [0x70 ... 0x73] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
      [0x75 ... 0x76] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
      [0x77] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
      [0x78] = { .simd_size = simd_other, .two_op = 1 },
@@ -611,6 +612,7 @@ static const struct ext0f3a_table {
      [0x6a ... 0x6b] = { .simd_size = simd_scalar_opc, .four_op = 1 },
      [0x6c ... 0x6d] = { .simd_size = simd_packed_fp, .four_op = 1 },
      [0x6e ... 0x6f] = { .simd_size = simd_scalar_opc, .four_op = 1 },
+    [0x70 ... 0x73] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
      [0x78 ... 0x79] = { .simd_size = simd_packed_fp, .four_op = 1 },
      [0x7a ... 0x7b] = { .simd_size = simd_scalar_opc, .four_op = 1 },
      [0x7c ... 0x7d] = { .simd_size = simd_packed_fp, .four_op = 1 },
@@ -8969,6 +8971,16 @@ x86_emulate(
          }
          goto simd_zmm;
  
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x70): /* vpshldvw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x72): /* vpshrdvw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+        generate_exception_if(!evex.w, EXC_UD);
+        elem_bytes = 2;
+        /* fall through */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x71): /* vpshldv{d,q} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x73): /* vpshrdv{d,q} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+        host_and_vcpu_must_have(avx512_vbmi2);
+        goto avx512f_no_sae;
+
      case X86EMUL_OPC_EVEX_66(0x0f38, 0x75): /* vpermi2{b,w} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
      case X86EMUL_OPC_EVEX_66(0x0f38, 0x7d): /* vpermt2{b,w} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
      case X86EMUL_OPC_EVEX_66(0x0f38, 0x8d): /* vperm{b,w} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
@@ -10281,6 +10293,16 @@ x86_emulate(
          avx512_vlen_check(true);
          goto simd_imm8_zmm;
  
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x70): /* vpshldw $imm8,[xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x72): /* vpshrdw $imm8,[xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+        generate_exception_if(!evex.w, EXC_UD);
+        elem_bytes = 2;
+        /* fall through */
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x71): /* vpshld{d,q} $imm8,[xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x73): /* vpshrd{d,q} $imm8,[xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+        host_and_vcpu_must_have(avx512_vbmi2);
+        goto avx512f_imm8_no_sae;
+
      case X86EMUL_OPC(0x0f3a, 0xcc):     /* sha1rnds4 $imm8,xmm/m128,xmm */
          host_and_vcpu_must_have(sha);
          op_bytes = 16;

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

[Xen-devel] [PATCH v9 14/23] x86emul: support AVX512_4FMAPS insns

Posted by Jan Beulich 2 weeks ago
A decoder adjustment is needed here because of the current sharing of
table entries between different (implied) opcode prefixes: The same
major opcodes are used for vfmsub{132,213}{p,s}{s,d}, which have a
different memory operand size and different Disp8 scaling.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v9: Re-base. Explain need for decoder special case.
v8: Correct vcpu_has_*() insertion point.
v7: Re-base.
v6: New.

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -538,6 +538,13 @@ static const struct test avx512pf_512[]
      INSNX(scatterpf1q, 66, 0f38, c7, 6, vl, sd, el),
  };
  
+static const struct test avx512_4fmaps_512[] = {
+    INSN(4fmaddps,  f2, 0f38, 9a, el_4, d, vl),
+    INSN(4fmaddss,  f2, 0f38, 9b, el_4, d, vl),
+    INSN(4fnmaddps, f2, 0f38, aa, el_4, d, vl),
+    INSN(4fnmaddss, f2, 0f38, ab, el_4, d, vl),
+};
+
  static const struct test avx512_bitalg_all[] = {
      INSN(popcnt,      66, 0f38, 54, vl, bw, vl),
      INSN(pshufbitqmb, 66, 0f38, 8f, vl,  b, vl),
@@ -941,6 +948,7 @@ void evex_disp8_test(void *instr, struct
      RUN(avx512er, 512);
  #define cpu_has_avx512pf cpu_has_avx512f
      RUN(avx512pf, 512);
+    RUN(avx512_4fmaps, 512);
      RUN(avx512_bitalg, all);
      RUN(avx512_ifma, all);
      RUN(avx512_vbmi, all);
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -4274,6 +4274,81 @@ int main(int argc, char **argv)
      }
  #endif
  
+    printf("%-40s", "Testing v4fmaddps 32(%ecx),%zmm4,%zmm4{%k5}...");
+    if ( stack_exec && cpu_has_avx512_4fmaps )
+    {
+        decl_insn(v4fmaddps);
+        static const struct {
+            float f[16];
+        } in = {{
+            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+        }}, out = {{
+            1 + 1 * 9 + 2 * 10 + 3 * 11 + 4 * 12,
+            2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+            16 + 16 * 9 + 17 * 10 + 18 * 11 + 19 * 12
+        }};
+
+        asm volatile ( "vmovups %1, %%zmm4\n\t"
+                       "vbroadcastss %%xmm4, %%zmm7\n\t"
+                       "vaddps %%zmm4, %%zmm7, %%zmm5\n\t"
+                       "vaddps %%zmm5, %%zmm7, %%zmm6\n\t"
+                       "vaddps %%zmm6, %%zmm7, %%zmm7\n\t"
+                       "kmovw %2, %%k5\n"
+                       put_insn(v4fmaddps,
+                                "v4fmaddps 32(%0), %%zmm4, %%zmm4%{%%k5%}")
+                       :: "c" (NULL), "m" (in), "rmk" (0x8001) );
+
+        set_insn(v4fmaddps);
+        regs.ecx = (unsigned long)&in;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(v4fmaddps) )
+            goto fail;
+
+        asm ( "vcmpeqps %1, %%zmm4, %%k0\n\t"
+              "kmovw %%k0, %0" : "=g" (rc) : "m" (out) );
+        if ( rc != 0xffff )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
+    printf("%-40s", "Testing v4fnmaddss 16(%edx),%zmm4,%zmm4{%k3}...");
+    if ( stack_exec && cpu_has_avx512_4fmaps )
+    {
+        decl_insn(v4fnmaddss);
+        static const struct {
+            float f[16];
+        } in = {{
+            1, 2, 3, 4, 5, 6, 7, 8
+        }}, out = {{
+            1 - 1 * 5 - 2 * 6 - 3 * 7 - 4 * 8, 2, 3, 4
+        }};
+
+        asm volatile ( "vmovups %1, %%xmm4\n\t"
+                       "vaddss %%xmm4, %%xmm4, %%xmm5\n\t"
+                       "vaddss %%xmm5, %%xmm4, %%xmm6\n\t"
+                       "vaddss %%xmm6, %%xmm4, %%xmm7\n\t"
+                       "kmovw %2, %%k3\n"
+                       put_insn(v4fnmaddss,
+                                "v4fnmaddss 16(%0), %%xmm4, %%xmm4%{%%k3%}")
+                       :: "d" (NULL), "m" (in), "rmk" (1) );
+
+        set_insn(v4fnmaddss);
+        regs.edx = (unsigned long)&in;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(v4fnmaddss) )
+            goto fail;
+
+        asm ( "vcmpeqps %1, %%zmm4, %%k0\n\t"
+              "kmovw %%k0, %0" : "=g" (rc) : "m" (out) );
+        if ( rc != 0xffff )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
  #undef decl_insn
  #undef put_insn
  #undef set_insn
--- a/tools/tests/x86_emulator/x86-emulate.h
+++ b/tools/tests/x86_emulator/x86-emulate.h
@@ -146,6 +146,7 @@ static inline bool xcr0_mask(uint64_t ma
  #define cpu_has_avx512_vbmi2 (cp.feat.avx512_vbmi2 && xcr0_mask(0xe6))
  #define cpu_has_avx512_bitalg (cp.feat.avx512_bitalg && xcr0_mask(0xe6))
  #define cpu_has_avx512_vpopcntdq (cp.feat.avx512_vpopcntdq && xcr0_mask(0xe6))
+#define cpu_has_avx512_4fmaps (cp.feat.avx512_4fmaps && xcr0_mask(0xe6))
  
  #define cpu_has_xgetbv1   (cpu_has_xsave && cp.xstate.xgetbv1)
  
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -1892,6 +1892,7 @@ in_protmode(
  #define vcpu_has_avx512_bitalg() (ctxt->cpuid->feat.avx512_bitalg)
  #define vcpu_has_avx512_vpopcntdq() (ctxt->cpuid->feat.avx512_vpopcntdq)
  #define vcpu_has_rdpid()       (ctxt->cpuid->feat.rdpid)
+#define vcpu_has_avx512_4fmaps() (ctxt->cpuid->feat.avx512_4fmaps)
  
  #define vcpu_must_have(feat) \
      generate_exception_if(!vcpu_has_##feat(), EXC_UD)
@@ -3173,6 +3174,18 @@ x86_decode(
                                                     state);
                      state->simd_size = simd_other;
                  }
+
+                switch ( b )
+                {
+                /* v4f{,n}madd{p,s}s need special casing */
+                case 0x9a: case 0x9b: case 0xaa: case 0xab:
+                    if ( evex.pfx == vex_f2 )
+                    {
+                        disp8scale = 4;
+                        state->simd_size = simd_128;
+                    }
+                    break;
+                }
              }
              break;
  
@@ -9370,6 +9383,24 @@ x86_emulate(
              avx512_vlen_check(true);
          goto simd_zmm;
  
+    case X86EMUL_OPC_EVEX_F2(0x0f38, 0x9a): /* v4fmaddps m128,zmm+3,zmm{k} */
+    case X86EMUL_OPC_EVEX_F2(0x0f38, 0xaa): /* v4fnmaddps m128,zmm+3,zmm{k} */
+        host_and_vcpu_must_have(avx512_4fmaps);
+        generate_exception_if((ea.type != OP_MEM || evex.w || evex.brs ||
+                               evex.lr != 2),
+                              EXC_UD);
+        op_mask = op_mask & 0xffff ? 0xf : 0;
+        goto simd_zmm;
+
+    case X86EMUL_OPC_EVEX_F2(0x0f38, 0x9b): /* v4fmaddss m128,xmm+3,xmm{k} */
+    case X86EMUL_OPC_EVEX_F2(0x0f38, 0xab): /* v4fnmaddss m128,xmm+3,xmm{k} */
+        host_and_vcpu_must_have(avx512_4fmaps);
+        generate_exception_if((ea.type != OP_MEM || evex.w || evex.brs ||
+                               evex.lr == 3),
+                              EXC_UD);
+        op_mask = op_mask & 1 ? 0xf : 0;
+        goto simd_zmm;
+
      case X86EMUL_OPC_EVEX_66(0x0f38, 0xa0): /* vpscatterd{d,q} [xyz]mm,mem{k} */
      case X86EMUL_OPC_EVEX_66(0x0f38, 0xa1): /* vpscatterq{d,q} [xyz]mm,mem{k} */
      case X86EMUL_OPC_EVEX_66(0x0f38, 0xa2): /* vscatterdp{s,d} [xyz]mm,mem{k} */
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -119,6 +119,7 @@
  #define cpu_has_itsc            boot_cpu_has(X86_FEATURE_ITSC)
  
  /* CPUID level 0x00000007:0.edx */
+#define cpu_has_avx512_4fmaps   boot_cpu_has(X86_FEATURE_AVX512_4FMAPS)
  #define cpu_has_tsx_force_abort boot_cpu_has(X86_FEATURE_TSX_FORCE_ABORT)
  
  /* Synthesized. */

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH v9 14/23] x86emul: support AVX512_4FMAPS insns

Posted by Andrew Cooper 2 weeks ago
On 01/07/2019 12:23, Jan Beulich wrote:
> A decoder adjustment is needed here because of the current sharing of
> table entries between different (implied) opcode prefixes: The same
> major opcodes are used for vfmsub{132,213}{p,s}{s,d}, which have a
> different memory operand size and different Disp8 scaling.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

[Xen-devel] [PATCH v9 15/23] x86emul: support AVX512_4VNNIW insns

Posted by Jan Beulich 2 weeks ago
As in a few cases before, since the insns here and in particular their
memory access patterns follow the AVX512_4FMAPS scheme, I didn't think
it was necessary to add contrived tests specifically for them, beyond
the Disp8 scaling ones.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
v9: Re-base.
v8: Correct vcpu_has_*() insertion point.
v7: Re-base.
v6: New.

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -545,6 +545,11 @@ static const struct test avx512_4fmaps_5
      INSN(4fnmaddss, f2, 0f38, ab, el_4, d, vl),
  };
  
+static const struct test avx512_4vnniw_512[] = {
+    INSN(p4dpwssd,  f2, 0f38, 52, el_4, d, vl),
+    INSN(p4dpwssds, f2, 0f38, 53, el_4, d, vl),
+};
+
  static const struct test avx512_bitalg_all[] = {
      INSN(popcnt,      66, 0f38, 54, vl, bw, vl),
      INSN(pshufbitqmb, 66, 0f38, 8f, vl,  b, vl),
@@ -949,6 +954,7 @@ void evex_disp8_test(void *instr, struct
  #define cpu_has_avx512pf cpu_has_avx512f
      RUN(avx512pf, 512);
      RUN(avx512_4fmaps, 512);
+    RUN(avx512_4vnniw, 512);
      RUN(avx512_bitalg, all);
      RUN(avx512_ifma, all);
      RUN(avx512_vbmi, all);
--- a/tools/tests/x86_emulator/x86-emulate.h
+++ b/tools/tests/x86_emulator/x86-emulate.h
@@ -146,6 +146,7 @@ static inline bool xcr0_mask(uint64_t ma
  #define cpu_has_avx512_vbmi2 (cp.feat.avx512_vbmi2 && xcr0_mask(0xe6))
  #define cpu_has_avx512_bitalg (cp.feat.avx512_bitalg && xcr0_mask(0xe6))
  #define cpu_has_avx512_vpopcntdq (cp.feat.avx512_vpopcntdq && xcr0_mask(0xe6))
+#define cpu_has_avx512_4vnniw (cp.feat.avx512_4vnniw && xcr0_mask(0xe6))
  #define cpu_has_avx512_4fmaps (cp.feat.avx512_4fmaps && xcr0_mask(0xe6))
  
  #define cpu_has_xgetbv1   (cpu_has_xsave && cp.xstate.xgetbv1)
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -479,6 +479,7 @@ static const struct ext0f38_table {
      [0x4d] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
      [0x4e] = { .simd_size = simd_packed_fp, .two_op = 1, .d8s = d8s_vl },
      [0x4f] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
+    [0x52 ... 0x53] = { .simd_size = simd_128, .d8s = 4 },
      [0x54 ... 0x55] = { .simd_size = simd_packed_int, .two_op = 1, .d8s = d8s_vl },
      [0x58] = { .simd_size = simd_other, .two_op = 1, .d8s = 2 },
      [0x59] = { .simd_size = simd_other, .two_op = 1, .d8s = 3 },
@@ -1892,6 +1893,7 @@ in_protmode(
  #define vcpu_has_avx512_bitalg() (ctxt->cpuid->feat.avx512_bitalg)
  #define vcpu_has_avx512_vpopcntdq() (ctxt->cpuid->feat.avx512_vpopcntdq)
  #define vcpu_has_rdpid()       (ctxt->cpuid->feat.rdpid)
+#define vcpu_has_avx512_4vnniw() (ctxt->cpuid->feat.avx512_4vnniw)
  #define vcpu_has_avx512_4fmaps() (ctxt->cpuid->feat.avx512_4fmaps)
  
  #define vcpu_must_have(feat) \
@@ -8920,6 +8922,15 @@ x86_emulate(
          generate_exception_if(vex.l, EXC_UD);
          goto simd_0f_avx;
  
+    case X86EMUL_OPC_EVEX_F2(0x0f38, 0x52): /* vp4dpwssd m128,zmm+3,zmm{k} */
+    case X86EMUL_OPC_EVEX_F2(0x0f38, 0x53): /* vp4dpwssds m128,zmm+3,zmm{k} */
+        host_and_vcpu_must_have(avx512_4vnniw);
+        generate_exception_if((ea.type != OP_MEM || evex.w || evex.brs ||
+                               evex.lr != 2),
+                              EXC_UD);
+        op_mask = op_mask & 0xffff ? 0xf : 0;
+        goto simd_zmm;
+
      case X86EMUL_OPC_EVEX_66(0x0f38, 0x8f): /* vpshufbitqmb [xyz]mm/mem,[xyz]mm,k{k} */
          generate_exception_if(evex.w || !evex.r || !evex.R || evex.z, EXC_UD);
          /* fall through */
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -119,6 +119,7 @@
  #define cpu_has_itsc            boot_cpu_has(X86_FEATURE_ITSC)
  
  /* CPUID level 0x00000007:0.edx */
+#define cpu_has_avx512_4vnniw   boot_cpu_has(X86_FEATURE_AVX512_4VNNIW)
  #define cpu_has_avx512_4fmaps   boot_cpu_has(X86_FEATURE_AVX512_4FMAPS)
  #define cpu_has_tsx_force_abort boot_cpu_has(X86_FEATURE_TSX_FORCE_ABORT)
  

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

[Xen-devel] [PATCH v9 16/23] x86emul: support AVX512_VNNI insns

Posted by Jan Beulich 2 weeks ago
Along the lines of the 4FMAPS case, convert the 4VNNIW-based table
entries to a decoder adjustment. Because of the current sharing of table
entries between different (implied) opcode prefixes and with the same
major opcodes being used for vp4dpwssd{,s}, which have a different
memory operand size and different Disp8 scaling, the pre-existing table
entries get converted to a decoder override. The table entries will now
represent the insns here, in line with other table entries preferably
representing the prefix-66 insns.

As in a few cases before, since the insns here and in particular their
memory access patterns follow the usual scheme, I didn't think it was
necessary to add a contrived test specifically for them, beyond the
Disp8 scaling one.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v9: Re-base. Explain need for decoder special case.
v8: Re-base.
v7: New.

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -580,6 +580,13 @@ static const struct test avx512_vbmi2_al
      INSN(pshrdw,    66, 0f3a, 72, vl,  w, vl),
  };
  
+static const struct test avx512_vnni_all[] = {
+    INSN(pdpbusd,  66, 0f38, 50, vl, d, vl),
+    INSN(pdpbusds, 66, 0f38, 51, vl, d, vl),
+    INSN(pdpwssd,  66, 0f38, 52, vl, d, vl),
+    INSN(pdpwssds, 66, 0f38, 53, vl, d, vl),
+};
+
  static const struct test avx512_vpopcntdq_all[] = {
      INSN(popcnt, 66, 0f38, 55, vl, dq, vl)
  };
@@ -959,5 +966,6 @@ void evex_disp8_test(void *instr, struct
      RUN(avx512_ifma, all);
      RUN(avx512_vbmi, all);
      RUN(avx512_vbmi2, all);
+    RUN(avx512_vnni, all);
      RUN(avx512_vpopcntdq, all);
  }
--- a/tools/tests/x86_emulator/x86-emulate.h
+++ b/tools/tests/x86_emulator/x86-emulate.h
@@ -144,6 +144,7 @@ static inline bool xcr0_mask(uint64_t ma
  #define cpu_has_avx512vl  (cp.feat.avx512vl && xcr0_mask(0xe6))
  #define cpu_has_avx512_vbmi (cp.feat.avx512_vbmi && xcr0_mask(0xe6))
  #define cpu_has_avx512_vbmi2 (cp.feat.avx512_vbmi2 && xcr0_mask(0xe6))
+#define cpu_has_avx512_vnni (cp.feat.avx512_vnni && xcr0_mask(0xe6))
  #define cpu_has_avx512_bitalg (cp.feat.avx512_bitalg && xcr0_mask(0xe6))
  #define cpu_has_avx512_vpopcntdq (cp.feat.avx512_vpopcntdq && xcr0_mask(0xe6))
  #define cpu_has_avx512_4vnniw (cp.feat.avx512_4vnniw && xcr0_mask(0xe6))
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -479,7 +479,7 @@ static const struct ext0f38_table {
      [0x4d] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
      [0x4e] = { .simd_size = simd_packed_fp, .two_op = 1, .d8s = d8s_vl },
      [0x4f] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
-    [0x52 ... 0x53] = { .simd_size = simd_128, .d8s = 4 },
+    [0x50 ... 0x53] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
      [0x54 ... 0x55] = { .simd_size = simd_packed_int, .two_op = 1, .d8s = d8s_vl },
      [0x58] = { .simd_size = simd_other, .two_op = 1, .d8s = 2 },
      [0x59] = { .simd_size = simd_other, .two_op = 1, .d8s = 3 },
@@ -1890,6 +1890,7 @@ in_protmode(
  #define vcpu_has_avx512vl()    (ctxt->cpuid->feat.avx512vl)
  #define vcpu_has_avx512_vbmi() (ctxt->cpuid->feat.avx512_vbmi)
  #define vcpu_has_avx512_vbmi2() (ctxt->cpuid->feat.avx512_vbmi2)
+#define vcpu_has_avx512_vnni() (ctxt->cpuid->feat.avx512_vnni)
  #define vcpu_has_avx512_bitalg() (ctxt->cpuid->feat.avx512_bitalg)
  #define vcpu_has_avx512_vpopcntdq() (ctxt->cpuid->feat.avx512_vpopcntdq)
  #define vcpu_has_rdpid()       (ctxt->cpuid->feat.rdpid)
@@ -3179,6 +3180,8 @@ x86_decode(
  
                  switch ( b )
                  {
+                /* vp4dpwssd{,s} need special casing */
+                case 0x52: case 0x53:
                  /* v4f{,n}madd{p,s}s need special casing */
                  case 0x9a: case 0x9b: case 0xaa: case 0xab:
                      if ( evex.pfx == vex_f2 )
@@ -9394,6 +9397,14 @@ x86_emulate(
              avx512_vlen_check(true);
          goto simd_zmm;
  
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x50): /* vpdpbusd [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x51): /* vpdpbusds [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x52): /* vpdpwssd [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x53): /* vpdpwssds [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+        host_and_vcpu_must_have(avx512_vnni);
+        generate_exception_if(evex.w, EXC_UD);
+        goto avx512f_no_sae;
+
      case X86EMUL_OPC_EVEX_F2(0x0f38, 0x9a): /* v4fmaddps m128,zmm+3,zmm{k} */
      case X86EMUL_OPC_EVEX_F2(0x0f38, 0xaa): /* v4fnmaddps m128,zmm+3,zmm{k} */
          host_and_vcpu_must_have(avx512_4fmaps);
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -111,6 +111,7 @@
  /* CPUID level 0x00000007:0.ecx */
  #define cpu_has_avx512_vbmi     boot_cpu_has(X86_FEATURE_AVX512_VBMI)
  #define cpu_has_avx512_vbmi2    boot_cpu_has(X86_FEATURE_AVX512_VBMI2)
+#define cpu_has_avx512_vnni     boot_cpu_has(X86_FEATURE_AVX512_VNNI)
  #define cpu_has_avx512_bitalg   boot_cpu_has(X86_FEATURE_AVX512_BITALG)
  #define cpu_has_avx512_vpopcntdq boot_cpu_has(X86_FEATURE_AVX512_VPOPCNTDQ)
  #define cpu_has_rdpid           boot_cpu_has(X86_FEATURE_RDPID)
--- a/xen/include/public/arch-x86/cpufeatureset.h
+++ b/xen/include/public/arch-x86/cpufeatureset.h
@@ -229,6 +229,7 @@ XEN_CPUFEATURE(UMIP,          6*32+ 2) /
  XEN_CPUFEATURE(PKU,           6*32+ 3) /*H  Protection Keys for Userspace */
  XEN_CPUFEATURE(OSPKE,         6*32+ 4) /*!  OS Protection Keys Enable */
  XEN_CPUFEATURE(AVX512_VBMI2,  6*32+ 6) /*A  Additional AVX-512 Vector Byte Manipulation Instrs */
+XEN_CPUFEATURE(AVX512_VNNI,   6*32+11) /*A  Vector Neural Network Instrs */
  XEN_CPUFEATURE(AVX512_BITALG, 6*32+12) /*A  Support for VPOPCNT[B,W] and VPSHUFBITQMB */
  XEN_CPUFEATURE(AVX512_VPOPCNTDQ, 6*32+14) /*A  POPCNT for vectors of DW/QW */
  XEN_CPUFEATURE(RDPID,         6*32+22) /*A  RDPID instruction */
--- a/xen/tools/gen-cpuid.py
+++ b/xen/tools/gen-cpuid.py
@@ -263,7 +263,7 @@ def crunch_numbers(state):
          # AVX512 features are built on top of AVX512F
          AVX512F: [AVX512DQ, AVX512_IFMA, AVX512PF, AVX512ER, AVX512CD,
                    AVX512BW, AVX512VL, AVX512_4VNNIW, AVX512_4FMAPS,
-                  AVX512_VPOPCNTDQ],
+                  AVX512_VNNI, AVX512_VPOPCNTDQ],
  
          # AVX512 extensions acting on vectors of bytes/words are made
          # dependents of AVX512BW (as to requiring wider than 16-bit mask

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH v9 16/23] x86emul: support AVX512_VNNI insns

Posted by Andrew Cooper 2 weeks ago
On 01/07/2019 12:24, Jan Beulich wrote:
> Along the lines of the 4FMAPS case, convert the 4VNNIW-based table
> entries to a decoder adjustment. Because of the current sharing of table
> entries between different (implied) opcode prefixes and with the same
> major opcodes being used for vp4dpwssd{,s}, which have a different
> memory operand size and different Disp8 scaling, the pre-existing table
> entries get converted to a decoder override. The table entries will now
> represent the insns here, in line with other table entries preferably
> representing the prefix-66 insns.
>
> As in a few cases before, since the insns here and in particular their
> memory access patterns follow the usual scheme, I didn't think it was
> necessary to add a contrived test specifically for them, beyond the
> Disp8 scaling one.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

[Xen-devel] [PATCH v9 17/23] x86emul: support VPCLMULQDQ insns

Posted by Jan Beulich 2 weeks ago
As to the feature dependency adjustment, while strictly speaking AVX is
a sufficient prereq (to have YMM registers), 256-bit vectors of integers
were fully introduced only with AVX2. Sadly gcc can't be used as a
reference here: They don't provide any AVX512-independent built-in at
all.

Along the lines of PCLMULQDQ, since the insns here and in particular
their memory access patterns follow the usual scheme, I didn't think it
was necessary to add a contrived test specifically for them, beyond the
Disp8 scaling one.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
v9: Re-base. Make VPCLMULQDQ also depend on PCLMULQDQ.
v8: No need to set fault_suppression to false.
v7: New.

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -591,6 +591,10 @@ static const struct test avx512_vpopcntd
      INSN(popcnt, 66, 0f38, 55, vl, dq, vl)
  };
  
+static const struct test vpclmulqdq_all[] = {
+    INSN(pclmulqdq, 66, 0f3a, 44, vl, q_nb, vl)
+};
+
  static const unsigned char vl_all[] = { VL_512, VL_128, VL_256 };
  static const unsigned char vl_128[] = { VL_128 };
  static const unsigned char vl_no128[] = { VL_512, VL_256 };
@@ -968,4 +972,9 @@ void evex_disp8_test(void *instr, struct
      RUN(avx512_vbmi2, all);
      RUN(avx512_vnni, all);
      RUN(avx512_vpopcntdq, all);
+
+    if ( cpu_has_avx512f )
+    {
+        RUN(vpclmulqdq, all);
+    }
  }
--- a/tools/tests/x86_emulator/x86-emulate.h
+++ b/tools/tests/x86_emulator/x86-emulate.h
@@ -144,6 +144,7 @@ static inline bool xcr0_mask(uint64_t ma
  #define cpu_has_avx512vl  (cp.feat.avx512vl && xcr0_mask(0xe6))
  #define cpu_has_avx512_vbmi (cp.feat.avx512_vbmi && xcr0_mask(0xe6))
  #define cpu_has_avx512_vbmi2 (cp.feat.avx512_vbmi2 && xcr0_mask(0xe6))
+#define cpu_has_vpclmulqdq (cp.feat.vpclmulqdq && xcr0_mask(6))
  #define cpu_has_avx512_vnni (cp.feat.avx512_vnni && xcr0_mask(0xe6))
  #define cpu_has_avx512_bitalg (cp.feat.avx512_bitalg && xcr0_mask(0xe6))
  #define cpu_has_avx512_vpopcntdq (cp.feat.avx512_vpopcntdq && xcr0_mask(0xe6))
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -594,7 +594,7 @@ static const struct ext0f3a_table {
      [0x3e ... 0x3f] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
      [0x40 ... 0x41] = { .simd_size = simd_packed_fp },
      [0x42 ... 0x43] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
-    [0x44] = { .simd_size = simd_packed_int },
+    [0x44] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
      [0x46] = { .simd_size = simd_packed_int },
      [0x48 ... 0x49] = { .simd_size = simd_packed_fp, .four_op = 1 },
      [0x4a ... 0x4b] = { .simd_size = simd_packed_fp, .four_op = 1 },
@@ -1890,6 +1890,7 @@ in_protmode(
  #define vcpu_has_avx512vl()    (ctxt->cpuid->feat.avx512vl)
  #define vcpu_has_avx512_vbmi() (ctxt->cpuid->feat.avx512_vbmi)
  #define vcpu_has_avx512_vbmi2() (ctxt->cpuid->feat.avx512_vbmi2)
+#define vcpu_has_vpclmulqdq()  (ctxt->cpuid->feat.vpclmulqdq)
  #define vcpu_has_avx512_vnni() (ctxt->cpuid->feat.avx512_vnni)
  #define vcpu_has_avx512_bitalg() (ctxt->cpuid->feat.avx512_bitalg)
  #define vcpu_has_avx512_vpopcntdq() (ctxt->cpuid->feat.avx512_vpopcntdq)
@@ -10207,13 +10208,19 @@ x86_emulate(
          goto opmask_shift_imm;
  
      case X86EMUL_OPC_66(0x0f3a, 0x44):     /* pclmulqdq $imm8,xmm/m128,xmm */
-    case X86EMUL_OPC_VEX_66(0x0f3a, 0x44): /* vpclmulqdq $imm8,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x44): /* vpclmulqdq $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
          host_and_vcpu_must_have(pclmulqdq);
          if ( vex.opcx == vex_none )
              goto simd_0f3a_common;
-        generate_exception_if(vex.l, EXC_UD);
+        if ( vex.l )
+            host_and_vcpu_must_have(vpclmulqdq);
          goto simd_0f_imm8_avx;
  
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x44): /* vpclmulqdq $imm8,[xyz]mm/mem,[xyz]mm,[xyz]mm */
+        host_and_vcpu_must_have(vpclmulqdq);
+        generate_exception_if(evex.brs || evex.opmsk, EXC_UD);
+        goto avx512f_imm8_no_sae;
+
      case X86EMUL_OPC_VEX_66(0x0f3a, 0x4a): /* vblendvps {x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
      case X86EMUL_OPC_VEX_66(0x0f3a, 0x4b): /* vblendvpd {x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
          generate_exception_if(vex.w, EXC_UD);
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -111,6 +111,7 @@
  /* CPUID level 0x00000007:0.ecx */
  #define cpu_has_avx512_vbmi     boot_cpu_has(X86_FEATURE_AVX512_VBMI)
  #define cpu_has_avx512_vbmi2    boot_cpu_has(X86_FEATURE_AVX512_VBMI2)
+#define cpu_has_vpclmulqdq      boot_cpu_has(X86_FEATURE_VPCLMULQDQ)
  #define cpu_has_avx512_vnni     boot_cpu_has(X86_FEATURE_AVX512_VNNI)
  #define cpu_has_avx512_bitalg   boot_cpu_has(X86_FEATURE_AVX512_BITALG)
  #define cpu_has_avx512_vpopcntdq boot_cpu_has(X86_FEATURE_AVX512_VPOPCNTDQ)
--- a/xen/include/public/arch-x86/cpufeatureset.h
+++ b/xen/include/public/arch-x86/cpufeatureset.h
@@ -121,7 +121,7 @@ XEN_CPUFEATURE(PBE,           0*32+31) /
  
  /* Intel-defined CPU features, CPUID level 0x00000001.ecx, word 1 */
  XEN_CPUFEATURE(SSE3,          1*32+ 0) /*A  Streaming SIMD Extensions-3 */
-XEN_CPUFEATURE(PCLMULQDQ,     1*32+ 1) /*A  Carry-less mulitplication */
+XEN_CPUFEATURE(PCLMULQDQ,     1*32+ 1) /*A  Carry-less multiplication */
  XEN_CPUFEATURE(DTES64,        1*32+ 2) /*   64-bit Debug Store */
  XEN_CPUFEATURE(MONITOR,       1*32+ 3) /*   Monitor/Mwait support */
  XEN_CPUFEATURE(DSCPL,         1*32+ 4) /*   CPL Qualified Debug Store */
@@ -229,6 +229,7 @@ XEN_CPUFEATURE(UMIP,          6*32+ 2) /
  XEN_CPUFEATURE(PKU,           6*32+ 3) /*H  Protection Keys for Userspace */
  XEN_CPUFEATURE(OSPKE,         6*32+ 4) /*!  OS Protection Keys Enable */
  XEN_CPUFEATURE(AVX512_VBMI2,  6*32+ 6) /*A  Additional AVX-512 Vector Byte Manipulation Instrs */
+XEN_CPUFEATURE(VPCLMULQDQ,    6*32+10) /*A  Vector Carry-less Multiplication Instrs */
  XEN_CPUFEATURE(AVX512_VNNI,   6*32+11) /*A  Vector Neural Network Instrs */
  XEN_CPUFEATURE(AVX512_BITALG, 6*32+12) /*A  Support for VPOPCNT[B,W] and VPSHUFBITQMB */
  XEN_CPUFEATURE(AVX512_VPOPCNTDQ, 6*32+14) /*A  POPCNT for vectors of DW/QW */
--- a/xen/tools/gen-cpuid.py
+++ b/xen/tools/gen-cpuid.py
@@ -254,8 +254,9 @@ def crunch_numbers(state):
  
          # This is just the dependency between AVX512 and AVX2 of XSTATE
          # feature flags.  If want to use AVX512, AVX2 must be supported and
-        # enabled.
-        AVX2: [AVX512F],
+        # enabled.  Certain later extensions, acting on 256-bit vectors of
+        # integers, better depend on AVX2 than AVX.
+        AVX2: [AVX512F, VPCLMULQDQ],
  
          # AVX512F is taken to mean hardware support for 512bit registers
          # (which in practice depends on the EVEX prefix to encode) as well
@@ -270,6 +271,10 @@ def crunch_numbers(state):
          # registers), despite the SDM not formally making this connection.
          AVX512BW: [AVX512_BF16, AVX512_BITALG, AVX512_VBMI, AVX512_VBMI2],
  
+        # Extensions with VEX/EVEX encodings keyed to a separate feature
+        # flag are made dependents of their respective legacy feature.
+        PCLMULQDQ: [VPCLMULQDQ],
+
          # The features:
          #   * Single Thread Indirect Branch Predictors
          #   * Speculative Store Bypass Disable

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

[Xen-devel] [PATCH v9 18/23] x86emul: support VAES insns

Posted by Jan Beulich 2 weeks ago
As to the feature dependency adjustment, just like for VPCLMULQDQ while
strictly speaking AVX is a sufficient prereq (to have YMM registers),
256-bit vectors of integers have got fully introduced with AVX2 only.

A new test case (also covering AESNI) will be added to the harness by a
subsequent patch.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
v9: Re-base. Make VAES also depend on AESNI.
v8: No need to set fault_suppression to false.
v7: New.

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -591,6 +591,18 @@ static const struct test avx512_vpopcntd
      INSN(popcnt, 66, 0f38, 55, vl, dq, vl)
  };
  
+/*
+ * The uses of b in this table are simply (one of) the shortest form(s) of
+ * saying "no broadcast" without introducing a 128-bit granularity enumerator.
+ * Due to all of the insns being WIG, w, d_nb, and q_nb would all also fit.
+ */
+static const struct test vaes_all[] = {
+    INSN(aesdec,     66, 0f38, de, vl, b, vl),
+    INSN(aesdeclast, 66, 0f38, df, vl, b, vl),
+    INSN(aesenc,     66, 0f38, dc, vl, b, vl),
+    INSN(aesenclast, 66, 0f38, dd, vl, b, vl),
+};
+
  static const struct test vpclmulqdq_all[] = {
      INSN(pclmulqdq, 66, 0f3a, 44, vl, q_nb, vl)
  };
@@ -975,6 +987,7 @@ void evex_disp8_test(void *instr, struct
  
      if ( cpu_has_avx512f )
      {
+        RUN(vaes, all);
          RUN(vpclmulqdq, all);
      }
  }
--- a/tools/tests/x86_emulator/x86-emulate.h
+++ b/tools/tests/x86_emulator/x86-emulate.h
@@ -144,6 +144,7 @@ static inline bool xcr0_mask(uint64_t ma
  #define cpu_has_avx512vl  (cp.feat.avx512vl && xcr0_mask(0xe6))
  #define cpu_has_avx512_vbmi (cp.feat.avx512_vbmi && xcr0_mask(0xe6))
  #define cpu_has_avx512_vbmi2 (cp.feat.avx512_vbmi2 && xcr0_mask(0xe6))
+#define cpu_has_vaes      (cp.feat.vaes && xcr0_mask(6))
  #define cpu_has_vpclmulqdq (cp.feat.vpclmulqdq && xcr0_mask(6))
  #define cpu_has_avx512_vnni (cp.feat.avx512_vnni && xcr0_mask(0xe6))
  #define cpu_has_avx512_bitalg (cp.feat.avx512_bitalg && xcr0_mask(0xe6))
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -541,7 +541,7 @@ static const struct ext0f38_table {
      [0xcc] = { .simd_size = simd_packed_fp, .two_op = 1, .d8s = d8s_vl },
      [0xcd] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
      [0xdb] = { .simd_size = simd_packed_int, .two_op = 1 },
-    [0xdc ... 0xdf] = { .simd_size = simd_packed_int },
+    [0xdc ... 0xdf] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
      [0xf0] = { .two_op = 1 },
      [0xf1] = { .to_mem = 1, .two_op = 1 },
      [0xf2 ... 0xf3] = {},
@@ -1890,6 +1890,7 @@ in_protmode(
  #define vcpu_has_avx512vl()    (ctxt->cpuid->feat.avx512vl)
  #define vcpu_has_avx512_vbmi() (ctxt->cpuid->feat.avx512_vbmi)
  #define vcpu_has_avx512_vbmi2() (ctxt->cpuid->feat.avx512_vbmi2)
+#define vcpu_has_vaes()        (ctxt->cpuid->feat.vaes)
  #define vcpu_has_vpclmulqdq()  (ctxt->cpuid->feat.vpclmulqdq)
  #define vcpu_has_avx512_vnni() (ctxt->cpuid->feat.avx512_vnni)
  #define vcpu_has_avx512_bitalg() (ctxt->cpuid->feat.avx512_bitalg)
@@ -8911,13 +8912,9 @@ x86_emulate(
      case X86EMUL_OPC_66(0x0f38, 0xdb):     /* aesimc xmm/m128,xmm */
      case X86EMUL_OPC_VEX_66(0x0f38, 0xdb): /* vaesimc xmm/m128,xmm */
      case X86EMUL_OPC_66(0x0f38, 0xdc):     /* aesenc xmm/m128,xmm,xmm */
-    case X86EMUL_OPC_VEX_66(0x0f38, 0xdc): /* vaesenc xmm/m128,xmm,xmm */
      case X86EMUL_OPC_66(0x0f38, 0xdd):     /* aesenclast xmm/m128,xmm,xmm */
-    case X86EMUL_OPC_VEX_66(0x0f38, 0xdd): /* vaesenclast xmm/m128,xmm,xmm */
      case X86EMUL_OPC_66(0x0f38, 0xde):     /* aesdec xmm/m128,xmm,xmm */
-    case X86EMUL_OPC_VEX_66(0x0f38, 0xde): /* vaesdec xmm/m128,xmm,xmm */
      case X86EMUL_OPC_66(0x0f38, 0xdf):     /* aesdeclast xmm/m128,xmm,xmm */
-    case X86EMUL_OPC_VEX_66(0x0f38, 0xdf): /* vaesdeclast xmm/m128,xmm,xmm */
          host_and_vcpu_must_have(aesni);
          if ( vex.opcx == vex_none )
              goto simd_0f38_common;
@@ -9643,6 +9640,24 @@ x86_emulate(
          host_and_vcpu_must_have(avx512er);
          goto simd_zmm_scalar_sae;
  
+    case X86EMUL_OPC_VEX_66(0x0f38, 0xdc):  /* vaesenc {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0xdd):  /* vaesenclast {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0xde):  /* vaesdec {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0xdf):  /* vaesdeclast {x,y}mm/mem,{x,y}mm,{x,y}mm */
+        if ( !vex.l )
+            host_and_vcpu_must_have(aesni);
+        else
+            host_and_vcpu_must_have(vaes);
+        goto simd_0f_avx;
+
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xdc): /* vaesenc [xyz]mm/mem,[xyz]mm,[xyz]mm */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xdd): /* vaesenclast [xyz]mm/mem,[xyz]mm,[xyz]mm */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xde): /* vaesdec [xyz]mm/mem,[xyz]mm,[xyz]mm */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xdf): /* vaesdeclast [xyz]mm/mem,[xyz]mm,[xyz]mm */
+        host_and_vcpu_must_have(vaes);
+        generate_exception_if(evex.brs || evex.opmsk, EXC_UD);
+        goto avx512f_no_sae;
+
      case X86EMUL_OPC(0x0f38, 0xf0): /* movbe m,r */
      case X86EMUL_OPC(0x0f38, 0xf1): /* movbe r,m */
          vcpu_must_have(movbe);
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -111,6 +111,7 @@
  /* CPUID level 0x00000007:0.ecx */
  #define cpu_has_avx512_vbmi     boot_cpu_has(X86_FEATURE_AVX512_VBMI)
  #define cpu_has_avx512_vbmi2    boot_cpu_has(X86_FEATURE_AVX512_VBMI2)
+#define cpu_has_vaes            boot_cpu_has(X86_FEATURE_VAES)
  #define cpu_has_vpclmulqdq      boot_cpu_has(X86_FEATURE_VPCLMULQDQ)
  #define cpu_has_avx512_vnni     boot_cpu_has(X86_FEATURE_AVX512_VNNI)
  #define cpu_has_avx512_bitalg   boot_cpu_has(X86_FEATURE_AVX512_BITALG)
--- a/xen/include/public/arch-x86/cpufeatureset.h
+++ b/xen/include/public/arch-x86/cpufeatureset.h
@@ -229,6 +229,7 @@ XEN_CPUFEATURE(UMIP,          6*32+ 2) /
  XEN_CPUFEATURE(PKU,           6*32+ 3) /*H  Protection Keys for Userspace */
  XEN_CPUFEATURE(OSPKE,         6*32+ 4) /*!  OS Protection Keys Enable */
  XEN_CPUFEATURE(AVX512_VBMI2,  6*32+ 6) /*A  Additional AVX-512 Vector Byte Manipulation Instrs */
+XEN_CPUFEATURE(VAES,          6*32+ 9) /*A  Vector AES Instrs */
  XEN_CPUFEATURE(VPCLMULQDQ,    6*32+10) /*A  Vector Carry-less Multiplication Instrs */
  XEN_CPUFEATURE(AVX512_VNNI,   6*32+11) /*A  Vector Neural Network Instrs */
  XEN_CPUFEATURE(AVX512_BITALG, 6*32+12) /*A  Support for VPOPCNT[B,W] and VPSHUFBITQMB */
--- a/xen/tools/gen-cpuid.py
+++ b/xen/tools/gen-cpuid.py
@@ -256,7 +256,7 @@ def crunch_numbers(state):
          # feature flags.  If want to use AVX512, AVX2 must be supported and
          # enabled.  Certain later extensions, acting on 256-bit vectors of
          # integers, better depend on AVX2 than AVX.
-        AVX2: [AVX512F, VPCLMULQDQ],
+        AVX2: [AVX512F, VAES, VPCLMULQDQ],
  
          # AVX512F is taken to mean hardware support for 512bit registers
          # (which in practice depends on the EVEX prefix to encode) as well
@@ -274,6 +274,7 @@ def crunch_numbers(state):
          # Extensions with VEX/EVEX encodings keyed to a separate feature
          # flag are made dependents of their respective legacy feature.
          PCLMULQDQ: [VPCLMULQDQ],
+        AESNI: [VAES],
  
          # The features:
          #   * Single Thread Indirect Branch Predictors

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

[Xen-devel] [PATCH v9 19/23] x86emul: support GFNI insns

Posted by Jan Beulich 2 weeks ago
As to the feature dependency adjustment, while strictly speaking SSE is
a sufficient prereq (to have XMM registers), vectors of bytes and qwords
have got introduced only with SSE2. gcc, for example, uses a similar
connection in its respective intrinsics header.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v9: Re-base. Drop stale part of description.
v8: Add {evex}-producing vgf2p8mulb alias to simd.h. Add missing simd.h
     dependency. Re-base.
v7: New.

--- a/tools/tests/x86_emulator/Makefile
+++ b/tools/tests/x86_emulator/Makefile
@@ -19,7 +19,8 @@ CFLAGS += $(CFLAGS_xeninclude)
  SIMD := 3dnow sse sse2 sse4 avx avx2 xop avx512f avx512bw avx512dq avx512er avx512vbmi
  FMA := fma4 fma
  SG := avx2-sg avx512f-sg avx512vl-sg
-TESTCASES := blowfish $(SIMD) $(FMA) $(SG)
+GF := sse2-gf avx2-gf avx512bw-gf
+TESTCASES := blowfish $(SIMD) $(FMA) $(SG) $(GF)
  
  OPMASK := avx512f avx512dq avx512bw
  
@@ -142,12 +143,17 @@ $(1)-cflags := \
  	   $(foreach flt,$($(1)-flts), \
  	     "-D_$(vec)x$(idx)f$(flt) -m$(1:-sg=) $(call non-sse,$(1)) -Os -DVEC_MAX=$(vec) -DIDX_SIZE=$(idx) -DFLOAT_SIZE=$(flt)")))
  endef
+define simd-gf-defs
+$(1)-cflags := $(foreach vec,$($(1:-gf=)-vecs), \
+	         "-D_$(vec) -mgfni -m$(1:-gf=) $(call non-sse,$(1)) -Os -DVEC_SIZE=$(vec)")
+endef
  define opmask-defs
  $(1)-opmask-cflags := $(foreach vec,$($(1)-opmask-vecs), "-D_$(vec) -m$(1) -Os -DSIZE=$(vec)")
  endef
  
  $(foreach flavor,$(SIMD) $(FMA),$(eval $(call simd-defs,$(flavor))))
  $(foreach flavor,$(SG),$(eval $(call simd-sg-defs,$(flavor))))
+$(foreach flavor,$(GF),$(eval $(call simd-gf-defs,$(flavor))))
  $(foreach flavor,$(OPMASK),$(eval $(call opmask-defs,$(flavor))))
  
  first-string = $(shell for s in $(1); do echo "$$s"; break; done)
@@ -197,7 +203,10 @@ $(addsuffix .c,$(FMA)):
  $(addsuffix .c,$(SG)):
  	ln -sf simd-sg.c $@
  
-$(addsuffix .h,$(SIMD) $(FMA) $(SG)): simd.h
+$(addsuffix .c,$(GF)):
+	ln -sf simd-gf.c $@
+
+$(addsuffix .h,$(SIMD) $(FMA) $(SG) $(GF)): simd.h
  
  xop.h avx512f.h: simd-fma.c
  
--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -591,6 +591,12 @@ static const struct test avx512_vpopcntd
      INSN(popcnt, 66, 0f38, 55, vl, dq, vl)
  };
  
+static const struct test gfni_all[] = {
+    INSN(gf2p8affineinvqb, 66, 0f3a, cf, vl, q, vl),
+    INSN(gf2p8affineqb,    66, 0f3a, ce, vl, q, vl),
+    INSN(gf2p8mulb,        66, 0f38, cf, vl, b, vl),
+};
+
  /*
   * The uses of b in this table are simply (one of) the shortest form(s) of
   * saying "no broadcast" without introducing a 128-bit granularity enumerator.
@@ -987,6 +993,7 @@ void evex_disp8_test(void *instr, struct
  
      if ( cpu_has_avx512f )
      {
+        RUN(gfni, all);
          RUN(vaes, all);
          RUN(vpclmulqdq, all);
      }
--- a/tools/tests/x86_emulator/simd.h
+++ b/tools/tests/x86_emulator/simd.h
@@ -371,6 +371,7 @@ OVR(cvttsd2siq);
  OVR(cvttss2si);
  OVR(cvttss2sil);
  OVR(cvttss2siq);
+OVR(gf2p8mulb);
  OVR(movddup);
  OVR(movntdq);
  OVR(movntdqa);
--- /dev/null
+++ b/tools/tests/x86_emulator/simd-gf.c
@@ -0,0 +1,80 @@
+#define UINT_SIZE 1
+
+#include "simd.h"
+ENTRY(gf_test);
+
+#if VEC_SIZE == 16
+# define GF(op, s, a...) __builtin_ia32_vgf2p8 ## op ## _v16qi ## s(a)
+#elif VEC_SIZE == 32
+# define GF(op, s, a...) __builtin_ia32_vgf2p8 ## op ## _v32qi ## s(a)
+#elif VEC_SIZE == 64
+# define GF(op, s, a...) __builtin_ia32_vgf2p8 ## op ## _v64qi ## s(a)
+#endif
+
+#ifdef __AVX512BW__
+# define ALL_TRUE (~0ULL >> (64 - ELEM_COUNT))
+# define eq(x, y) (B(pcmpeqb, _mask, (vqi_t)(x), (vqi_t)(y), -1) == ALL_TRUE)
+# define mul(x, y) GF(mulb, _mask, (vqi_t)(x), (vqi_t)(y), (vqi_t)undef(), ~0)
+# define transform(m, dir, x, c) ({ \
+    vec_t t_; \
+    asm ( "vgf2p8affine" #dir "qb %[imm], %[matrix]%{1to%c[n]%}, %[src], %[dst]" \
+          : [dst] "=v" (t_) \
+          : [matrix] "m" (m), [src] "v" (x), [imm] "i" (c), [n] "i" (VEC_SIZE / 8) ); \
+    t_; \
+})
+#else
+# if defined(__AVX2__)
+#  define bcstq(x) ({ \
+    vdi_t t_; \
+    asm ( "vpbroadcastq %1, %0" : "=x" (t_) : "m" (x) ); \
+    t_; \
+})
+#  define to_bool(cmp) B(ptestc, , cmp, (vdi_t){} == 0)
+# else
+#  define bcstq(x) ((vdi_t){x, x})
+#  define to_bool(cmp) (__builtin_ia32_pmovmskb128(cmp) == 0xffff)
+# endif
+# define eq(x, y) to_bool((x) == (y))
+# define mul(x, y) GF(mulb, , (vqi_t)(x), (vqi_t)(y))
+# define transform(m, dir, x, c) ({ \
+    vdi_t m_ = bcstq(m); \
+    touch(m_); \
+    ((vec_t)GF(affine ## dir ## qb, , (vqi_t)(x), (vqi_t)m_, c)); \
+})
+#endif
+
+const unsigned __attribute__((mode(DI))) ident = 0x0102040810204080ULL;
+
+int gf_test(void)
+{
+    unsigned int i;
+    vec_t src, one;
+
+    for ( i = 0; i < ELEM_COUNT; ++i )
+    {
+        src[i] = i;
+        one[i] = 1;
+    }
+
+    /* Special case for first iteration. */
+    one[0] = 0;
+
+    do {
+        vec_t inv = transform(ident, inv, src, 0);
+
+        touch(src);
+        touch(inv);
+        if ( !eq(mul(src, inv), one) ) return __LINE__;
+
+        touch(src);
+        touch(inv);
+        if ( !eq(mul(inv, src), one) ) return __LINE__;
+
+        one[0] = 1;
+
+        src += ELEM_COUNT;
+        i += ELEM_COUNT;
+    } while ( i < 256 );
+
+    return 0;
+}
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -11,12 +11,14 @@ asm ( ".pushsection .test, \"ax\", @prog
  #include "3dnow.h"
  #include "sse.h"
  #include "sse2.h"
+#include "sse2-gf.h"
  #include "sse4.h"
  #include "avx.h"
  #include "fma4.h"
  #include "fma.h"
  #include "avx2.h"
  #include "avx2-sg.h"
+#include "avx2-gf.h"
  #include "xop.h"
  #include "avx512f-opmask.h"
  #include "avx512dq-opmask.h"
@@ -25,6 +27,7 @@ asm ( ".pushsection .test, \"ax\", @prog
  #include "avx512f-sg.h"
  #include "avx512vl-sg.h"
  #include "avx512bw.h"
+#include "avx512bw-gf.h"
  #include "avx512dq.h"
  #include "avx512er.h"
  #include "avx512vbmi.h"
@@ -138,6 +141,26 @@ static bool simd_check_avx512vbmi_vl(voi
      return cpu_has_avx512_vbmi && cpu_has_avx512vl;
  }
  
+static bool simd_check_sse2_gf(void)
+{
+    return cpu_has_gfni && cpu_has_sse2;
+}
+
+static bool simd_check_avx2_gf(void)
+{
+    return cpu_has_gfni && cpu_has_avx2;
+}
+
+static bool simd_check_avx512bw_gf(void)
+{
+    return cpu_has_gfni && cpu_has_avx512bw;
+}
+
+static bool simd_check_avx512bw_gf_vl(void)
+{
+    return cpu_has_gfni && cpu_has_avx512vl;
+}
+
  static void simd_set_regs(struct cpu_user_regs *regs)
  {
      if ( cpu_has_mmx )
@@ -395,6 +418,12 @@ static const struct {
      AVX512VL(_VBMI+VL u16x8, avx512vbmi,    16u2),
      AVX512VL(_VBMI+VL s16x16, avx512vbmi,   32i2),
      AVX512VL(_VBMI+VL u16x16, avx512vbmi,   32u2),
+    SIMD(GFNI (legacy),       sse2_gf,        16),
+    SIMD(GFNI (VEX/x16),      avx2_gf,        16),
+    SIMD(GFNI (VEX/x32),      avx2_gf,        32),
+    SIMD(GFNI (EVEX/x64), avx512bw_gf,        64),
+    AVX512VL(VL+GFNI (x16), avx512bw_gf,      16),
+    AVX512VL(VL+GFNI (x32), avx512bw_gf,      32),
  #undef AVX512VL_
  #undef AVX512VL
  #undef SIMD_
--- a/tools/tests/x86_emulator/x86-emulate.h
+++ b/tools/tests/x86_emulator/x86-emulate.h
@@ -144,6 +144,7 @@ static inline bool xcr0_mask(uint64_t ma
  #define cpu_has_avx512vl  (cp.feat.avx512vl && xcr0_mask(0xe6))
  #define cpu_has_avx512_vbmi (cp.feat.avx512_vbmi && xcr0_mask(0xe6))
  #define cpu_has_avx512_vbmi2 (cp.feat.avx512_vbmi2 && xcr0_mask(0xe6))
+#define cpu_has_gfni       cp.feat.gfni
  #define cpu_has_vaes      (cp.feat.vaes && xcr0_mask(6))
  #define cpu_has_vpclmulqdq (cp.feat.vpclmulqdq && xcr0_mask(6))
  #define cpu_has_avx512_vnni (cp.feat.avx512_vnni && xcr0_mask(0xe6))
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -540,6 +540,7 @@ static const struct ext0f38_table {
      [0xcb] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
      [0xcc] = { .simd_size = simd_packed_fp, .two_op = 1, .d8s = d8s_vl },
      [0xcd] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
+    [0xcf] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
      [0xdb] = { .simd_size = simd_packed_int, .two_op = 1 },
      [0xdc ... 0xdf] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
      [0xf0] = { .two_op = 1 },
@@ -619,6 +620,7 @@ static const struct ext0f3a_table {
      [0x7c ... 0x7d] = { .simd_size = simd_packed_fp, .four_op = 1 },
      [0x7e ... 0x7f] = { .simd_size = simd_scalar_opc, .four_op = 1 },
      [0xcc] = { .simd_size = simd_other },
+    [0xce ... 0xcf] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
      [0xdf] = { .simd_size = simd_packed_int, .two_op = 1 },
      [0xf0] = {},
  };
@@ -1890,6 +1892,7 @@ in_protmode(
  #define vcpu_has_avx512vl()    (ctxt->cpuid->feat.avx512vl)
  #define vcpu_has_avx512_vbmi() (ctxt->cpuid->feat.avx512_vbmi)
  #define vcpu_has_avx512_vbmi2() (ctxt->cpuid->feat.avx512_vbmi2)
+#define vcpu_has_gfni()        (ctxt->cpuid->feat.gfni)
  #define vcpu_has_vaes()        (ctxt->cpuid->feat.vaes)
  #define vcpu_has_vpclmulqdq()  (ctxt->cpuid->feat.vpclmulqdq)
  #define vcpu_has_avx512_vnni() (ctxt->cpuid->feat.avx512_vnni)
@@ -9640,6 +9643,21 @@ x86_emulate(
          host_and_vcpu_must_have(avx512er);
          goto simd_zmm_scalar_sae;
  
+    case X86EMUL_OPC_66(0x0f38, 0xcf):      /* gf2p8mulb xmm/m128,xmm */
+        host_and_vcpu_must_have(gfni);
+        goto simd_0f38_common;
+
+    case X86EMUL_OPC_VEX_66(0x0f38, 0xcf):  /* vgf2p8mulb {x,y}mm/mem,{x,y}mm,{x,y}mm */
+        host_and_vcpu_must_have(gfni);
+        generate_exception_if(vex.w, EXC_UD);
+        goto simd_0f_avx;
+
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xcf): /* vgf2p8mulb [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+        host_and_vcpu_must_have(gfni);
+        generate_exception_if(evex.w || evex.brs, EXC_UD);
+        elem_bytes = 1;
+        goto avx512f_no_sae;
+
      case X86EMUL_OPC_VEX_66(0x0f38, 0xdc):  /* vaesenc {x,y}mm/mem,{x,y}mm,{x,y}mm */
      case X86EMUL_OPC_VEX_66(0x0f38, 0xdd):  /* vaesenclast {x,y}mm/mem,{x,y}mm,{x,y}mm */
      case X86EMUL_OPC_VEX_66(0x0f38, 0xde):  /* vaesdec {x,y}mm/mem,{x,y}mm,{x,y}mm */
@@ -10383,6 +10401,24 @@ x86_emulate(
          op_bytes = 16;
          goto simd_0f3a_common;
  
+    case X86EMUL_OPC_66(0x0f3a, 0xce):      /* gf2p8affineqb $imm8,xmm/m128,xmm */
+    case X86EMUL_OPC_66(0x0f3a, 0xcf):      /* gf2p8affineinvqb $imm8,xmm/m128,xmm */
+        host_and_vcpu_must_have(gfni);
+        goto simd_0f3a_common;
+
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0xce):  /* vgf2p8affineqb $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0xcf):  /* vgf2p8affineinvqb $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+        host_and_vcpu_must_have(gfni);
+        generate_exception_if(!vex.w, EXC_UD);
+        goto simd_0f_imm8_avx;
+
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0xce): /* vgf2p8affineqb $imm8,[xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0xcf): /* vgf2p8affineinvqb $imm8,[xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+        host_and_vcpu_must_have(gfni);
+        generate_exception_if(!evex.w, EXC_UD);
+        fault_suppression = false;
+        goto avx512f_imm8_no_sae;
+
      case X86EMUL_OPC_66(0x0f3a, 0xdf):     /* aeskeygenassist $imm8,xmm/m128,xmm */
      case X86EMUL_OPC_VEX_66(0x0f3a, 0xdf): /* vaeskeygenassist $imm8,xmm/m128,xmm */
          host_and_vcpu_must_have(aesni);
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -111,6 +111,7 @@
  /* CPUID level 0x00000007:0.ecx */
  #define cpu_has_avx512_vbmi     boot_cpu_has(X86_FEATURE_AVX512_VBMI)
  #define cpu_has_avx512_vbmi2    boot_cpu_has(X86_FEATURE_AVX512_VBMI2)
+#define cpu_has_gfni            boot_cpu_has(X86_FEATURE_GFNI)
  #define cpu_has_vaes            boot_cpu_has(X86_FEATURE_VAES)
  #define cpu_has_vpclmulqdq      boot_cpu_has(X86_FEATURE_VPCLMULQDQ)
  #define cpu_has_avx512_vnni     boot_cpu_has(X86_FEATURE_AVX512_VNNI)
--- a/xen/include/public/arch-x86/cpufeatureset.h
+++ b/xen/include/public/arch-x86/cpufeatureset.h
@@ -229,6 +229,7 @@ XEN_CPUFEATURE(UMIP,          6*32+ 2) /
  XEN_CPUFEATURE(PKU,           6*32+ 3) /*H  Protection Keys for Userspace */
  XEN_CPUFEATURE(OSPKE,         6*32+ 4) /*!  OS Protection Keys Enable */
  XEN_CPUFEATURE(AVX512_VBMI2,  6*32+ 6) /*A  Additional AVX-512 Vector Byte Manipulation Instrs */
+XEN_CPUFEATURE(GFNI,          6*32+ 8) /*A  Galois Field Instrs */
  XEN_CPUFEATURE(VAES,          6*32+ 9) /*A  Vector AES Instrs */
  XEN_CPUFEATURE(VPCLMULQDQ,    6*32+10) /*A  Vector Carry-less Multiplication Instrs */
  XEN_CPUFEATURE(AVX512_VNNI,   6*32+11) /*A  Vector Neural Network Instrs */
--- a/xen/tools/gen-cpuid.py
+++ b/xen/tools/gen-cpuid.py
@@ -201,7 +201,7 @@ def crunch_numbers(state):
          # SSE2 was re-specified as core instructions for 64bit.  Also ISA
          # extensions dealing with vectors of integers are added here rather
          # than to SSE.
-        SSE2: [SSE3, LM, AESNI, PCLMULQDQ, SHA],
+        SSE2: [SSE3, LM, AESNI, PCLMULQDQ, SHA, GFNI],
  
          # Other SSEn each depend on their predecessor versions.
          SSE3: [SSSE3],

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH v9 19/23] x86emul: support GFNI insns

Posted by Andrew Cooper 2 weeks ago
On 01/07/2019 12:26, Jan Beulich wrote:
> As to the feature dependency adjustment, while strictly speaking SSE is
> a sufficient prereq (to have XMM registers), vectors of bytes and qwords
> have got introduced only with SSE2. gcc, for example, uses a similar
> connection in its respective intrinsics header.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

[Xen-devel] [PATCH v9 20/23] x86emul: restore ordering within main switch statement

Posted by Jan Beulich 2 weeks ago
Incremental additions and/or mistakes have led to some code blocks
sitting in "unexpected" places. Re-sort the case blocks (opcode space;
major opcode; 66/F3/F2 prefix; legacy/VEX/EVEX encoding).

As an exception the opcode space 0x0f EVEX-encoded VPEXTRW is left at
its current place, to keep it close to the "pextr" label.

Pure code movement.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
v7: New.

--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -7105,15 +7105,6 @@ x86_emulate(
          ASSERT(!state->simd_size);
          break;
  
-    case X86EMUL_OPC_EVEX_F3(0x0f, 0x7e): /* vmovq xmm/m64,xmm */
-    case X86EMUL_OPC_EVEX_66(0x0f, 0xd6): /* vmovq xmm,xmm/m64 */
-        generate_exception_if(evex.lr || !evex.w || evex.opmsk || evex.brs,
-                              EXC_UD);
-        host_and_vcpu_must_have(avx512f);
-        d |= TwoOp;
-        op_bytes = 8;
-        goto simd_zmm;
-
      case X86EMUL_OPC_66(0x0f, 0xe7):     /* movntdq xmm,m128 */
      case X86EMUL_OPC_VEX_66(0x0f, 0xe7): /* vmovntdq {x,y}mm,mem */
          generate_exception_if(ea.type != OP_MEM, EXC_UD);
@@ -7511,6 +7502,15 @@ x86_emulate(
          op_bytes = 8;
          goto simd_0f_int;
  
+    case X86EMUL_OPC_EVEX_F3(0x0f, 0x7e): /* vmovq xmm/m64,xmm */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xd6): /* vmovq xmm,xmm/m64 */
+        generate_exception_if(evex.lr || !evex.w || evex.opmsk || evex.brs,
+                              EXC_UD);
+        host_and_vcpu_must_have(avx512f);
+        d |= TwoOp;
+        op_bytes = 8;
+        goto simd_zmm;
+
      case X86EMUL_OPC(0x0f, 0x80) ... X86EMUL_OPC(0x0f, 0x8f): /* jcc (near) */
          if ( test_cc(b, _regs.eflags) )
              jmp_rel((int32_t)src.val);
@@ -8611,63 +8611,6 @@ x86_emulate(
          dst.type = OP_NONE;
          break;
  
-    case X86EMUL_OPC_EVEX_66(0x0f38, 0x10): /* vpsrlvw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
-    case X86EMUL_OPC_EVEX_66(0x0f38, 0x11): /* vpsravw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
-    case X86EMUL_OPC_EVEX_66(0x0f38, 0x12): /* vpsllvw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
-        host_and_vcpu_must_have(avx512bw);
-        generate_exception_if(!evex.w || evex.brs, EXC_UD);
-        elem_bytes = 2;
-        goto avx512f_no_sae;
-
-    case X86EMUL_OPC_EVEX_66(0x0f38, 0x18): /* vbroadcastss xmm/m32,[xyz]mm{k} */
-    case X86EMUL_OPC_EVEX_66(0x0f38, 0x58): /* vpbroadcastd xmm/m32,[xyz]mm{k} */
-        op_bytes = elem_bytes;
-        generate_exception_if(evex.w || evex.brs, EXC_UD);
-    avx512_broadcast:
-        /*
-         * For the respective code below the main switch() to work we need to
-         * fold op_mask here: A source element gets read whenever any of its
-         * respective destination elements' mask bits is set.
-         */
-        if ( fault_suppression )
-        {
-            n = 1 << ((b & 3) - evex.w);
-            EXPECT(elem_bytes > 0);
-            ASSERT(op_bytes == n * elem_bytes);
-            for ( i = n; i < (16 << evex.lr) / elem_bytes; i += n )
-                op_mask |= (op_mask >> i) & ((1 << n) - 1);
-        }
-        goto avx512f_no_sae;
-
-    case X86EMUL_OPC_EVEX_66(0x0f38, 0x1b): /* vbroadcastf32x8 m256,zmm{k} */
-                                            /* vbroadcastf64x4 m256,zmm{k} */
-    case X86EMUL_OPC_EVEX_66(0x0f38, 0x5b): /* vbroadcasti32x8 m256,zmm{k} */
-                                            /* vbroadcasti64x4 m256,zmm{k} */
-        generate_exception_if(ea.type != OP_MEM || evex.lr != 2, EXC_UD);
-        /* fall through */
-    case X86EMUL_OPC_EVEX_66(0x0f38, 0x19): /* vbroadcastsd xmm/m64,{y,z}mm{k} */
-                                            /* vbroadcastf32x2 xmm/m64,{y,z}mm{k} */
-        generate_exception_if(!evex.lr, EXC_UD);
-        /* fall through */
-    case X86EMUL_OPC_EVEX_66(0x0f38, 0x59): /* vpbroadcastq xmm/m64,[xyz]mm{k} */
-                                            /* vbroadcasti32x2 xmm/m64,[xyz]mm{k} */
-        if ( b == 0x59 )
-            op_bytes = 8;
-        generate_exception_if(evex.brs, EXC_UD);
-        if ( !evex.w )
-            host_and_vcpu_must_have(avx512dq);
-        goto avx512_broadcast;
-
-    case X86EMUL_OPC_EVEX_66(0x0f38, 0x1a): /* vbroadcastf32x4 m128,{y,z}mm{k} */
-                                            /* vbroadcastf64x2 m128,{y,z}mm{k} */
-    case X86EMUL_OPC_EVEX_66(0x0f38, 0x5a): /* vbroadcasti32x4 m128,{y,z}mm{k} */
-                                            /* vbroadcasti64x2 m128,{y,z}mm{k} */
-        generate_exception_if(ea.type != OP_MEM || !evex.lr || evex.brs,
-                              EXC_UD);
-        if ( evex.w )
-            host_and_vcpu_must_have(avx512dq);
-        goto avx512_broadcast;
-
      case X86EMUL_OPC_66(0x0f38, 0x20): /* pmovsxbw xmm/m64,xmm */
      case X86EMUL_OPC_66(0x0f38, 0x21): /* pmovsxbd xmm/m32,xmm */
      case X86EMUL_OPC_66(0x0f38, 0x22): /* pmovsxbq xmm/m16,xmm */
@@ -8701,47 +8644,14 @@ x86_emulate(
          host_and_vcpu_must_have(sse4_1);
          goto simd_0f38_common;
  
-    case X86EMUL_OPC_VEX_66(0x0f38, 0x13): /* vcvtph2ps xmm/mem,{x,y}mm */
-        generate_exception_if(vex.w, EXC_UD);
-        host_and_vcpu_must_have(f16c);
-        op_bytes = 8 << vex.l;
-        goto simd_0f_ymm;
-
-    case X86EMUL_OPC_EVEX_66(0x0f38, 0x13): /* vcvtph2ps {x,y}mm/mem,[xyz]mm{k} */
-        generate_exception_if(evex.w || (ea.type != OP_REG && evex.brs), EXC_UD);
-        host_and_vcpu_must_have(avx512f);
-        if ( !evex.brs )
-            avx512_vlen_check(false);
-        op_bytes = 8 << evex.lr;
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x10): /* vpsrlvw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x11): /* vpsravw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x12): /* vpsllvw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+        host_and_vcpu_must_have(avx512bw);
+        generate_exception_if(!evex.w || evex.brs, EXC_UD);
          elem_bytes = 2;
-        goto simd_zmm;
-
-    case X86EMUL_OPC_VEX_66(0x0f38, 0x16): /* vpermps ymm/m256,ymm,ymm */
-    case X86EMUL_OPC_VEX_66(0x0f38, 0x36): /* vpermd ymm/m256,ymm,ymm */
-        generate_exception_if(!vex.l || vex.w, EXC_UD);
-        goto simd_0f_avx2;
-
-    case X86EMUL_OPC_EVEX_66(0x0f38, 0x16): /* vpermp{s,d} {y,z}mm/mem,{y,z}mm,{y,z}mm{k} */
-    case X86EMUL_OPC_EVEX_66(0x0f38, 0x36): /* vperm{d,q} {y,z}mm/mem,{y,z}mm,{y,z}mm{k} */
-        generate_exception_if(!evex.lr, EXC_UD);
-        fault_suppression = false;
          goto avx512f_no_sae;
  
-    case X86EMUL_OPC_VEX_66(0x0f38, 0x20): /* vpmovsxbw xmm/mem,{x,y}mm */
-    case X86EMUL_OPC_VEX_66(0x0f38, 0x21): /* vpmovsxbd xmm/mem,{x,y}mm */
-    case X86EMUL_OPC_VEX_66(0x0f38, 0x22): /* vpmovsxbq xmm/mem,{x,y}mm */
-    case X86EMUL_OPC_VEX_66(0x0f38, 0x23): /* vpmovsxwd xmm/mem,{x,y}mm */
-    case X86EMUL_OPC_VEX_66(0x0f38, 0x24): /* vpmovsxwq xmm/mem,{x,y}mm */
-    case X86EMUL_OPC_VEX_66(0x0f38, 0x25): /* vpmovsxdq xmm/mem,{x,y}mm */
-    case X86EMUL_OPC_VEX_66(0x0f38, 0x30): /* vpmovzxbw xmm/mem,{x,y}mm */
-    case X86EMUL_OPC_VEX_66(0x0f38, 0x31): /* vpmovzxbd xmm/mem,{x,y}mm */
-    case X86EMUL_OPC_VEX_66(0x0f38, 0x32): /* vpmovzxbq xmm/mem,{x,y}mm */
-    case X86EMUL_OPC_VEX_66(0x0f38, 0x33): /* vpmovzxwd xmm/mem,{x,y}mm */
-    case X86EMUL_OPC_VEX_66(0x0f38, 0x34): /* vpmovzxwq xmm/mem,{x,y}mm */
-    case X86EMUL_OPC_VEX_66(0x0f38, 0x35): /* vpmovzxdq xmm/mem,{x,y}mm */
-        op_bytes = 16 >> (pmov_convert_delta[b & 7] - vex.l);
-        goto simd_0f_int;
-
      case X86EMUL_OPC_EVEX_F3(0x0f38, 0x10): /* vpmovuswb [xyz]mm,{x,y}mm/mem{k} */
      case X86EMUL_OPC_EVEX_66(0x0f38, 0x20): /* vpmovsxbw {x,y}mm/mem,[xyz]mm{k} */
      case X86EMUL_OPC_EVEX_F3(0x0f38, 0x20): /* vpmovswb [xyz]mm,{x,y}mm/mem{k} */
@@ -8787,6 +8697,96 @@ x86_emulate(
          elem_bytes = (b & 7) < 3 ? 1 : (b & 7) != 5 ? 2 : 4;
          goto avx512f_no_sae;
  
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x13): /* vcvtph2ps xmm/mem,{x,y}mm */
+        generate_exception_if(vex.w, EXC_UD);
+        host_and_vcpu_must_have(f16c);
+        op_bytes = 8 << vex.l;
+        goto simd_0f_ymm;
+
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x13): /* vcvtph2ps {x,y}mm/mem,[xyz]mm{k} */
+        generate_exception_if(evex.w || (ea.type != OP_REG && evex.brs), EXC_UD);
+        host_and_vcpu_must_have(avx512f);
+        if ( !evex.brs )
+            avx512_vlen_check(false);
+        op_bytes = 8 << evex.lr;
+        elem_bytes = 2;
+        goto simd_zmm;
+
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x16): /* vpermps ymm/m256,ymm,ymm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x36): /* vpermd ymm/m256,ymm,ymm */
+        generate_exception_if(!vex.l || vex.w, EXC_UD);
+        goto simd_0f_avx2;
+
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x16): /* vpermp{s,d} {y,z}mm/mem,{y,z}mm,{y,z}mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x36): /* vperm{d,q} {y,z}mm/mem,{y,z}mm,{y,z}mm{k} */
+        generate_exception_if(!evex.lr, EXC_UD);
+        fault_suppression = false;
+        goto avx512f_no_sae;
+
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x18): /* vbroadcastss xmm/m32,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x58): /* vpbroadcastd xmm/m32,[xyz]mm{k} */
+        op_bytes = elem_bytes;
+        generate_exception_if(evex.w || evex.brs, EXC_UD);
+    avx512_broadcast:
+        /*
+         * For the respective code below the main switch() to work we need to
+         * fold op_mask here: A source element gets read whenever any of its
+         * respective destination elements' mask bits is set.
+         */
+        if ( fault_suppression )
+        {
+            n = 1 << ((b & 3) - evex.w);
+            EXPECT(elem_bytes > 0);
+            ASSERT(op_bytes == n * elem_bytes);
+            for ( i = n; i < (16 << evex.lr) / elem_bytes; i += n )
+                op_mask |= (op_mask >> i) & ((1 << n) - 1);
+        }
+        goto avx512f_no_sae;
+
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x1b): /* vbroadcastf32x8 m256,zmm{k} */
+                                            /* vbroadcastf64x4 m256,zmm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x5b): /* vbroadcasti32x8 m256,zmm{k} */
+                                            /* vbroadcasti64x4 m256,zmm{k} */
+        generate_exception_if(ea.type != OP_MEM || evex.lr != 2, EXC_UD);
+        /* fall through */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x19): /* vbroadcastsd xmm/m64,{y,z}mm{k} */
+                                            /* vbroadcastf32x2 xmm/m64,{y,z}mm{k} */
+        generate_exception_if(!evex.lr, EXC_UD);
+        /* fall through */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x59): /* vpbroadcastq xmm/m64,[xyz]mm{k} */
+                                            /* vbroadcasti32x2 xmm/m64,[xyz]mm{k} */
+        if ( b == 0x59 )
+            op_bytes = 8;
+        generate_exception_if(evex.brs, EXC_UD);
+        if ( !evex.w )
+            host_and_vcpu_must_have(avx512dq);
+        goto avx512_broadcast;
+
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x1a): /* vbroadcastf32x4 m128,{y,z}mm{k} */
+                                            /* vbroadcastf64x2 m128,{y,z}mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x5a): /* vbroadcasti32x4 m128,{y,z}mm{k} */
+                                            /* vbroadcasti64x2 m128,{y,z}mm{k} */
+        generate_exception_if(ea.type != OP_MEM || !evex.lr || evex.brs,
+                              EXC_UD);
+        if ( evex.w )
+            host_and_vcpu_must_have(avx512dq);
+        goto avx512_broadcast;
+
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x20): /* vpmovsxbw xmm/mem,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x21): /* vpmovsxbd xmm/mem,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x22): /* vpmovsxbq xmm/mem,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x23): /* vpmovsxwd xmm/mem,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x24): /* vpmovsxwq xmm/mem,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x25): /* vpmovsxdq xmm/mem,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x30): /* vpmovzxbw xmm/mem,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x31): /* vpmovzxbd xmm/mem,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x32): /* vpmovzxbq xmm/mem,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x33): /* vpmovzxwd xmm/mem,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x34): /* vpmovzxwq xmm/mem,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x35): /* vpmovzxdq xmm/mem,{x,y}mm */
+        op_bytes = 16 >> (pmov_convert_delta[b & 7] - vex.l);
+        goto simd_0f_int;
+
      case X86EMUL_OPC_EVEX_F3(0x0f38, 0x29): /* vpmov{b,w}2m [xyz]mm,k */
      case X86EMUL_OPC_EVEX_F3(0x0f38, 0x39): /* vpmov{d,q}2m [xyz]mm,k */
          generate_exception_if(!evex.r || !evex.R, EXC_UD);
@@ -8894,6 +8894,52 @@ x86_emulate(
          break;
      }
  
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x2c): /* vscalefp{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x42): /* vgetexpp{s,d} [xyz]mm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x96): /* vfmaddsub132p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x97): /* vfmsubadd132p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x98): /* vfmadd132p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x9a): /* vfmsub132p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x9c): /* vfnmadd132p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x9e): /* vfnmsub132p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xa6): /* vfmaddsub213p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xa7): /* vfmsubadd213p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xa8): /* vfmadd213p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xaa): /* vfmsub213p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xac): /* vfnmadd213p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xae): /* vfnmsub213p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xb6): /* vfmaddsub231p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xb7): /* vfmsubadd231p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xb8): /* vfmadd231p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xba): /* vfmsub231p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xbc): /* vfnmadd231p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xbe): /* vfnmsub231p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+        host_and_vcpu_must_have(avx512f);
+        if ( ea.type != OP_REG || !evex.brs )
+            avx512_vlen_check(false);
+        goto simd_zmm;
+
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x2d): /* vscalefs{s,d} xmm/mem,xmm,xmm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x43): /* vgetexps{s,d} xmm/mem,xmm,xmm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x99): /* vfmadd132s{s,d} xmm/mem,xmm,xmm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x9b): /* vfmsub132s{s,d} xmm/mem,xmm,xmm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x9d): /* vfnmadd132s{s,d} xmm/mem,xmm,xmm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x9f): /* vfnmsub132s{s,d} xmm/mem,xmm,xmm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xa9): /* vfmadd213s{s,d} xmm/mem,xmm,xmm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xab): /* vfmsub213s{s,d} xmm/mem,xmm,xmm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xad): /* vfnmadd213s{s,d} xmm/mem,xmm,xmm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xaf): /* vfnmsub213s{s,d} xmm/mem,xmm,xmm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xb9): /* vfmadd231s{s,d} xmm/mem,xmm,xmm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xbb): /* vfmsub231s{s,d} xmm/mem,xmm,xmm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xbd): /* vfnmadd231s{s,d} xmm/mem,xmm,xmm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xbf): /* vfnmsub231s{s,d} xmm/mem,xmm,xmm{k} */
+        host_and_vcpu_must_have(avx512f);
+    simd_zmm_scalar_sae:
+        generate_exception_if(ea.type != OP_REG && evex.brs, EXC_UD);
+        if ( !evex.brs )
+            avx512_vlen_check(true);
+        goto simd_zmm;
+
      case X86EMUL_OPC_66(0x0f38, 0x37): /* pcmpgtq xmm/m128,xmm */
          host_and_vcpu_must_have(sse4_2);
          goto simd_0f38_common;
@@ -8926,6 +8972,31 @@ x86_emulate(
          generate_exception_if(vex.l, EXC_UD);
          goto simd_0f_avx;
  
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x50): /* vpdpbusd [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x51): /* vpdpbusds [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x52): /* vpdpwssd [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x53): /* vpdpwssds [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+        host_and_vcpu_must_have(avx512_vnni);
+        generate_exception_if(evex.w, EXC_UD);
+        goto avx512f_no_sae;
+
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x58): /* vpbroadcastd xmm/m32,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x59): /* vpbroadcastq xmm/m64,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x78): /* vpbroadcastb xmm/m8,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x79): /* vpbroadcastw xmm/m16,{x,y}mm */
+        op_bytes = 1 << ((!(b & 0x20) * 2) + (b & 1));
+        /* fall through */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x46): /* vpsravd {x,y}mm/mem,{x,y}mm,{x,y}mm */
+        generate_exception_if(vex.w, EXC_UD);
+        goto simd_0f_avx2;
+
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x4d): /* vrcp14s{s,d} xmm/mem,xmm,xmm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x4f): /* vrsqrt14s{s,d} xmm/mem,xmm,xmm{k} */
+        host_and_vcpu_must_have(avx512f);
+        generate_exception_if(evex.brs, EXC_UD);
+        avx512_vlen_check(true);
+        goto simd_zmm;
+
      case X86EMUL_OPC_EVEX_F2(0x0f38, 0x52): /* vp4dpwssd m128,zmm+3,zmm{k} */
      case X86EMUL_OPC_EVEX_F2(0x0f38, 0x53): /* vp4dpwssds m128,zmm+3,zmm{k} */
          host_and_vcpu_must_have(avx512_4vnniw);
@@ -8948,23 +9019,6 @@ x86_emulate(
          host_and_vcpu_must_have(avx512_vpopcntdq);
          goto avx512f_no_sae;
  
-    case X86EMUL_OPC_VEX_66(0x0f38, 0x58): /* vpbroadcastd xmm/m32,{x,y}mm */
-    case X86EMUL_OPC_VEX_66(0x0f38, 0x59): /* vpbroadcastq xmm/m64,{x,y}mm */
-    case X86EMUL_OPC_VEX_66(0x0f38, 0x78): /* vpbroadcastb xmm/m8,{x,y}mm */
-    case X86EMUL_OPC_VEX_66(0x0f38, 0x79): /* vpbroadcastw xmm/m16,{x,y}mm */
-        op_bytes = 1 << ((!(b & 0x20) * 2) + (b & 1));
-        /* fall through */
-    case X86EMUL_OPC_VEX_66(0x0f38, 0x46): /* vpsravd {x,y}mm/mem,{x,y}mm,{x,y}mm */
-        generate_exception_if(vex.w, EXC_UD);
-        goto simd_0f_avx2;
-
-    case X86EMUL_OPC_EVEX_66(0x0f38, 0x4d): /* vrcp14s{s,d} xmm/mem,xmm,xmm{k} */
-    case X86EMUL_OPC_EVEX_66(0x0f38, 0x4f): /* vrsqrt14s{s,d} xmm/mem,xmm,xmm{k} */
-        host_and_vcpu_must_have(avx512f);
-        generate_exception_if(evex.brs, EXC_UD);
-        avx512_vlen_check(true);
-        goto simd_zmm;
-
      case X86EMUL_OPC_VEX_66(0x0f38, 0x5a): /* vbroadcasti128 m128,ymm */
          generate_exception_if(ea.type != OP_MEM || !vex.l || vex.w, EXC_UD);
          goto simd_0f_avx2;
@@ -9352,60 +9406,6 @@ x86_emulate(
          host_and_vcpu_must_have(fma);
          goto simd_0f_ymm;
  
-    case X86EMUL_OPC_EVEX_66(0x0f38, 0x2c): /* vscalefp{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
-    case X86EMUL_OPC_EVEX_66(0x0f38, 0x42): /* vgetexpp{s,d} [xyz]mm/mem,[xyz]mm{k} */
-    case X86EMUL_OPC_EVEX_66(0x0f38, 0x96): /* vfmaddsub132p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
-    case X86EMUL_OPC_EVEX_66(0x0f38, 0x97): /* vfmsubadd132p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
-    case X86EMUL_OPC_EVEX_66(0x0f38, 0x98): /* vfmadd132p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
-    case X86EMUL_OPC_EVEX_66(0x0f38, 0x9a): /* vfmsub132p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
-    case X86EMUL_OPC_EVEX_66(0x0f38, 0x9c): /* vfnmadd132p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
-    case X86EMUL_OPC_EVEX_66(0x0f38, 0x9e): /* vfnmsub132p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
-    case X86EMUL_OPC_EVEX_66(0x0f38, 0xa6): /* vfmaddsub213p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
-    case X86EMUL_OPC_EVEX_66(0x0f38, 0xa7): /* vfmsubadd213p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
-    case X86EMUL_OPC_EVEX_66(0x0f38, 0xa8): /* vfmadd213p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
-    case X86EMUL_OPC_EVEX_66(0x0f38, 0xaa): /* vfmsub213p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
-    case X86EMUL_OPC_EVEX_66(0x0f38, 0xac): /* vfnmadd213p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
-    case X86EMUL_OPC_EVEX_66(0x0f38, 0xae): /* vfnmsub213p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
-    case X86EMUL_OPC_EVEX_66(0x0f38, 0xb6): /* vfmaddsub231p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
-    case X86EMUL_OPC_EVEX_66(0x0f38, 0xb7): /* vfmsubadd231p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
-    case X86EMUL_OPC_EVEX_66(0x0f38, 0xb8): /* vfmadd231p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
-    case X86EMUL_OPC_EVEX_66(0x0f38, 0xba): /* vfmsub231p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
-    case X86EMUL_OPC_EVEX_66(0x0f38, 0xbc): /* vfnmadd231p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
-    case X86EMUL_OPC_EVEX_66(0x0f38, 0xbe): /* vfnmsub231p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
-        host_and_vcpu_must_have(avx512f);
-        if ( ea.type != OP_REG || !evex.brs )
-            avx512_vlen_check(false);
-        goto simd_zmm;
-
-    case X86EMUL_OPC_EVEX_66(0x0f38, 0x2d): /* vscalefs{s,d} xmm/mem,xmm,xmm{k} */
-    case X86EMUL_OPC_EVEX_66(0x0f38, 0x43): /* vgetexps{s,d} xmm/mem,xmm,xmm{k} */
-    case X86EMUL_OPC_EVEX_66(0x0f38, 0x99): /* vfmadd132s{s,d} xmm/mem,xmm,xmm{k} */
-    case X86EMUL_OPC_EVEX_66(0x0f38, 0x9b): /* vfmsub132s{s,d} xmm/mem,xmm,xmm{k} */
-    case X86EMUL_OPC_EVEX_66(0x0f38, 0x9d): /* vfnmadd132s{s,d} xmm/mem,xmm,xmm{k} */
-    case X86EMUL_OPC_EVEX_66(0x0f38, 0x9f): /* vfnmsub132s{s,d} xmm/mem,xmm,xmm{k} */
-    case X86EMUL_OPC_EVEX_66(0x0f38, 0xa9): /* vfmadd213s{s,d} xmm/mem,xmm,xmm{k} */
-    case X86EMUL_OPC_EVEX_66(0x0f38, 0xab): /* vfmsub213s{s,d} xmm/mem,xmm,xmm{k} */
-    case X86EMUL_OPC_EVEX_66(0x0f38, 0xad): /* vfnmadd213s{s,d} xmm/mem,xmm,xmm{k} */
-    case X86EMUL_OPC_EVEX_66(0x0f38, 0xaf): /* vfnmsub213s{s,d} xmm/mem,xmm,xmm{k} */
-    case X86EMUL_OPC_EVEX_66(0x0f38, 0xb9): /* vfmadd231s{s,d} xmm/mem,xmm,xmm{k} */
-    case X86EMUL_OPC_EVEX_66(0x0f38, 0xbb): /* vfmsub231s{s,d} xmm/mem,xmm,xmm{k} */
-    case X86EMUL_OPC_EVEX_66(0x0f38, 0xbd): /* vfnmadd231s{s,d} xmm/mem,xmm,xmm{k} */
-    case X86EMUL_OPC_EVEX_66(0x0f38, 0xbf): /* vfnmsub231s{s,d} xmm/mem,xmm,xmm{k} */
-        host_and_vcpu_must_have(avx512f);
-    simd_zmm_scalar_sae:
-        generate_exception_if(ea.type != OP_REG && evex.brs, EXC_UD);
-        if ( !evex.brs )
-            avx512_vlen_check(true);
-        goto simd_zmm;
-
-    case X86EMUL_OPC_EVEX_66(0x0f38, 0x50): /* vpdpbusd [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
-    case X86EMUL_OPC_EVEX_66(0x0f38, 0x51): /* vpdpbusds [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
-    case X86EMUL_OPC_EVEX_66(0x0f38, 0x52): /* vpdpwssd [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
-    case X86EMUL_OPC_EVEX_66(0x0f38, 0x53): /* vpdpwssds [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
-        host_and_vcpu_must_have(avx512_vnni);
-        generate_exception_if(evex.w, EXC_UD);
-        goto avx512f_no_sae;
-
      case X86EMUL_OPC_EVEX_F2(0x0f38, 0x9a): /* v4fmaddps m128,zmm+3,zmm{k} */
      case X86EMUL_OPC_EVEX_F2(0x0f38, 0xaa): /* v4fnmaddps m128,zmm+3,zmm{k} */
          host_and_vcpu_must_have(avx512_4fmaps);
@@ -10254,11 +10254,6 @@ x86_emulate(
          generate_exception_if(evex.brs || evex.opmsk, EXC_UD);
          goto avx512f_imm8_no_sae;
  
-    case X86EMUL_OPC_VEX_66(0x0f3a, 0x4a): /* vblendvps {x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
-    case X86EMUL_OPC_VEX_66(0x0f3a, 0x4b): /* vblendvpd {x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
-        generate_exception_if(vex.w, EXC_UD);
-        goto simd_0f_imm8_avx;
-
      case X86EMUL_OPC_VEX_66(0x0f3a, 0x48): /* vpermil2ps $imm,{x,y}mm/mem,{x,y}mm,{x,y}mm,{x,y}mm */
                                             /* vpermil2ps $imm,{x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
      case X86EMUL_OPC_VEX_66(0x0f3a, 0x49): /* vpermil2pd $imm,{x,y}mm/mem,{x,y}mm,{x,y}mm,{x,y}mm */
@@ -10266,6 +10261,11 @@ x86_emulate(
          host_and_vcpu_must_have(xop);
          goto simd_0f_imm8_ymm;
  
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x4a): /* vblendvps {x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x4b): /* vblendvpd {x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+        generate_exception_if(vex.w, EXC_UD);
+        goto simd_0f_imm8_avx;
+
      case X86EMUL_OPC_VEX_66(0x0f3a, 0x4c): /* vpblendvb {x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
          generate_exception_if(vex.w, EXC_UD);
          goto simd_0f_int_imm8;

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

[Xen-devel] [PATCH v9 21/23] x86emul: add an AES/VAES test case to the harness

Posted by Jan Beulich 2 weeks ago
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
v8: New.

--- a/tools/tests/x86_emulator/Makefile
+++ b/tools/tests/x86_emulator/Makefile
@@ -19,8 +19,9 @@ CFLAGS += $(CFLAGS_xeninclude)
  SIMD := 3dnow sse sse2 sse4 avx avx2 xop avx512f avx512bw avx512dq avx512er avx512vbmi
  FMA := fma4 fma
  SG := avx2-sg avx512f-sg avx512vl-sg
+AES := ssse3-aes avx-aes avx2-vaes avx512bw-vaes
  GF := sse2-gf avx2-gf avx512bw-gf
-TESTCASES := blowfish $(SIMD) $(FMA) $(SG) $(GF)
+TESTCASES := blowfish $(SIMD) $(FMA) $(SG) $(AES) $(GF)
  
  OPMASK := avx512f avx512dq avx512bw
  
@@ -143,6 +144,10 @@ $(1)-cflags := \
  	   $(foreach flt,$($(1)-flts), \
  	     "-D_$(vec)x$(idx)f$(flt) -m$(1:-sg=) $(call non-sse,$(1)) -Os -DVEC_MAX=$(vec) -DIDX_SIZE=$(idx) -DFLOAT_SIZE=$(flt)")))
  endef
+define simd-aes-defs
+$(1)-cflags := $(foreach vec,$($(patsubst %-aes,sse,$(1))-vecs) $($(patsubst %-vaes,%,$(1))-vecs), \
+	         "-D_$(vec) -maes $(addprefix -m,$(subst -,$(space),$(1))) $(call non-sse,$(1)) -Os -DVEC_SIZE=$(vec)")
+endef
  define simd-gf-defs
  $(1)-cflags := $(foreach vec,$($(1:-gf=)-vecs), \
  	         "-D_$(vec) -mgfni -m$(1:-gf=) $(call non-sse,$(1)) -Os -DVEC_SIZE=$(vec)")
@@ -153,6 +158,7 @@ endef
  
  $(foreach flavor,$(SIMD) $(FMA),$(eval $(call simd-defs,$(flavor))))
  $(foreach flavor,$(SG),$(eval $(call simd-sg-defs,$(flavor))))
+$(foreach flavor,$(AES),$(eval $(call simd-aes-defs,$(flavor))))
  $(foreach flavor,$(GF),$(eval $(call simd-gf-defs,$(flavor))))
  $(foreach flavor,$(OPMASK),$(eval $(call opmask-defs,$(flavor))))
  
@@ -203,10 +209,13 @@ $(addsuffix .c,$(FMA)):
  $(addsuffix .c,$(SG)):
  	ln -sf simd-sg.c $@
  
+$(addsuffix .c,$(AES)):
+	ln -sf simd-aes.c $@
+
  $(addsuffix .c,$(GF)):
  	ln -sf simd-gf.c $@
  
-$(addsuffix .h,$(SIMD) $(FMA) $(SG) $(GF)): simd.h
+$(addsuffix .h,$(SIMD) $(FMA) $(SG) $(AES) $(GF)): simd.h
  
  xop.h avx512f.h: simd-fma.c
  
--- /dev/null
+++ b/tools/tests/x86_emulator/simd-aes.c
@@ -0,0 +1,102 @@
+#define UINT_SIZE 1
+
+#include "simd.h"
+ENTRY(aes_test);
+
+#if VEC_SIZE == 16
+# define AES(op, a...) __builtin_ia32_vaes ## op ## _v16qi(a)
+# define imc(x) ((vec_t)__builtin_ia32_aesimc128((vdi_t)(x)))
+#elif VEC_SIZE == 32
+# define AES(op, a...) __builtin_ia32_vaes ## op ## _v32qi(a)
+# define imc(x) ({ \
+    vec_t r_; \
+    unsigned char __attribute__((vector_size(16))) t_; \
+    asm ( "vaesimc (%3), %x0\n\t" \
+          "vaesimc 16(%3), %1\n\t" \
+          "vinserti128 $1, %1, %0, %0" \
+          : "=&v" (r_), "=&v" (t_) \
+          : "m" (x), "r" (&(x)) ); \
+    r_; \
+})
+#elif VEC_SIZE == 64
+# define AES(op, a...) __builtin_ia32_vaes ## op ## _v64qi(a)
+# define imc(x) ({ \
+    vec_t r_; \
+    unsigned char __attribute__((vector_size(16))) t_; \
+    asm ( "vaesimc (%3), %x0\n\t" \
+          "vaesimc 1*16(%3), %1\n\t" \
+          "vinserti32x4 $1, %1, %0, %0\n\t" \
+          "vaesimc 2*16(%3), %1\n\t" \
+          "vinserti32x4 $2, %1, %0, %0\n\t" \
+          "vaesimc 3*16(%3), %1\n\t" \
+          "vinserti32x4 $3, %1, %0, %0" \
+          : "=&v" (r_), "=&v" (t_) \
+          : "m" (x), "r" (&(x)) ); \
+    r_; \
+})
+#endif
+
+#ifdef __AVX512BW__
+# define ALL_TRUE (~0ULL >> (64 - ELEM_COUNT))
+# define eq(x, y) (B(pcmpeqb, _mask, (vqi_t)(x), (vqi_t)(y), -1) == ALL_TRUE)
+# define aes(op, x, y) ((vec_t)AES(op, (vqi_t)(x), (vqi_t)(y)))
+#else
+# if defined(__AVX2__) && VEC_SIZE == 32
+#  define to_bool(cmp) B(ptestc, , cmp, (vdi_t){} == 0)
+#  define aes(op, x, y) ((vec_t)AES(op, (vqi_t)(x), (vqi_t)(y)))
+# else
+#  define to_bool(cmp) (__builtin_ia32_pmovmskb128(cmp) == 0xffff)
+#  define aes(op, x, y) ((vec_t)__builtin_ia32_aes ## op ## 128((vdi_t)(x), (vdi_t)(y)))
+# endif
+# define eq(x, y) to_bool((x) == (y))
+#endif
+
+int aes_test(void)
+{
+    unsigned int i;
+    vec_t src, zero = {};
+
+    for ( i = 0; i < ELEM_COUNT; ++i )
+        src[i] = i;
+
+    do {
+        vec_t x, y;
+
+        touch(src);
+        x = imc(src);
+        touch(src);
+
+        touch(zero);
+        y = aes(enclast, src, zero);
+        touch(zero);
+        y = aes(dec, y, zero);
+
+        if ( !eq(x, y) ) return __LINE__;
+
+        touch(zero);
+        x = aes(declast, src, zero);
+        touch(zero);
+        y = aes(enc, x, zero);
+        touch(y);
+        x = imc(y);
+
+        if ( !eq(x, src) ) return __LINE__;
+
+#if VEC_SIZE == 16
+        touch(src);
+        x = (vec_t)__builtin_ia32_aeskeygenassist128((vdi_t)src, 0);
+        touch(src);
+        y = (vec_t)__builtin_ia32_pshufb128((vqi_t)x,
+                                            (vqi_t){  7,  4,  5,  6,
+                                                      1,  2,  3,  0,
+                                                     15, 12, 13, 14,
+                                                      9, 10, 11,  8 });
+        if ( !eq(x, y) ) return __LINE__;
+#endif
+
+        src += ELEM_COUNT;
+        i += ELEM_COUNT;
+    } while ( i <= 256 );
+
+    return 0;
+}
--- a/tools/tests/x86_emulator/simd.h
+++ b/tools/tests/x86_emulator/simd.h
@@ -340,6 +340,10 @@ REN(pandn, , d);
  REN(por, , d);
  REN(pxor, , d);
  #  endif
+OVR(aesdec);
+OVR(aesdeclast);
+OVR(aesenc);
+OVR(aesenclast);
  OVR(cvtpd2dqx);
  OVR(cvtpd2dqy);
  OVR(cvtpd2psx);
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -12,12 +12,15 @@ asm ( ".pushsection .test, \"ax\", @prog
  #include "sse.h"
  #include "sse2.h"
  #include "sse2-gf.h"
+#include "ssse3-aes.h"
  #include "sse4.h"
  #include "avx.h"
+#include "avx-aes.h"
  #include "fma4.h"
  #include "fma.h"
  #include "avx2.h"
  #include "avx2-sg.h"
+#include "avx2-vaes.h"
  #include "avx2-gf.h"
  #include "xop.h"
  #include "avx512f-opmask.h"
@@ -27,6 +30,7 @@ asm ( ".pushsection .test, \"ax\", @prog
  #include "avx512f-sg.h"
  #include "avx512vl-sg.h"
  #include "avx512bw.h"
+#include "avx512bw-vaes.h"
  #include "avx512bw-gf.h"
  #include "avx512dq.h"
  #include "avx512er.h"
@@ -91,6 +95,16 @@ static bool simd_check_xop(void)
      return cpu_has_xop;
  }
  
+static bool simd_check_ssse3_aes(void)
+{
+    return cpu_has_aesni && cpu_has_ssse3;
+}
+
+static bool simd_check_avx_aes(void)
+{
+    return cpu_has_aesni && cpu_has_avx;
+}
+
  static bool simd_check_avx512f(void)
  {
      return cpu_has_avx512f;
@@ -141,6 +155,22 @@ static bool simd_check_avx512vbmi_vl(voi
      return cpu_has_avx512_vbmi && cpu_has_avx512vl;
  }
  
+static bool simd_check_avx2_vaes(void)
+{
+    return cpu_has_aesni && cpu_has_vaes && cpu_has_avx2;
+}
+
+static bool simd_check_avx512bw_vaes(void)
+{
+    return cpu_has_aesni && cpu_has_vaes && cpu_has_avx512bw;
+}
+
+static bool simd_check_avx512bw_vaes_vl(void)
+{
+    return cpu_has_aesni && cpu_has_vaes &&
+           cpu_has_avx512bw && cpu_has_avx512vl;
+}
+
  static bool simd_check_sse2_gf(void)
  {
      return cpu_has_gfni && cpu_has_sse2;
@@ -319,6 +349,8 @@ static const struct {
      SIMD(XOP i16x16,              xop,      32i2),
      SIMD(XOP i32x8,               xop,      32i4),
      SIMD(XOP i64x4,               xop,      32i8),
+    SIMD(AES (legacy),      ssse3_aes,        16),
+    SIMD(AES (VEX/x16),       avx_aes,        16),
      SIMD(OPMASK/w,     avx512f_opmask,         2),
      SIMD(OPMASK+DQ/b, avx512dq_opmask,         1),
      SIMD(OPMASK+DQ/w, avx512dq_opmask,         2),
@@ -418,6 +450,10 @@ static const struct {
      AVX512VL(_VBMI+VL u16x8, avx512vbmi,    16u2),
      AVX512VL(_VBMI+VL s16x16, avx512vbmi,   32i2),
      AVX512VL(_VBMI+VL u16x16, avx512vbmi,   32u2),
+    SIMD(VAES (VEX/x32),    avx2_vaes,        32),
+    SIMD(VAES (EVEX/x64), avx512bw_vaes,      64),
+    AVX512VL(VL+VAES (x16), avx512bw_vaes,    16),
+    AVX512VL(VL+VAES (x32), avx512bw_vaes,    32),
      SIMD(GFNI (legacy),       sse2_gf,        16),
      SIMD(GFNI (VEX/x16),      avx2_gf,        16),
      SIMD(GFNI (VEX/x32),      avx2_gf,        32),
--- a/tools/tests/x86_emulator/x86-emulate.h
+++ b/tools/tests/x86_emulator/x86-emulate.h
@@ -125,10 +125,12 @@ static inline bool xcr0_mask(uint64_t ma
  #define cpu_has_sse        cp.basic.sse
  #define cpu_has_sse2       cp.basic.sse2
  #define cpu_has_sse3       cp.basic.sse3
+#define cpu_has_ssse3      cp.basic.ssse3
  #define cpu_has_fma       (cp.basic.fma && xcr0_mask(6))
  #define cpu_has_sse4_1     cp.basic.sse4_1
  #define cpu_has_sse4_2     cp.basic.sse4_2
  #define cpu_has_popcnt     cp.basic.popcnt
+#define cpu_has_aesni      cp.basic.aesni
  #define cpu_has_avx       (cp.basic.avx  && xcr0_mask(6))
  #define cpu_has_f16c      (cp.basic.f16c && xcr0_mask(6))
  

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

[Xen-devel] [PATCH v9 22/23] x86emul: add a SHA test case to the harness

Posted by Jan Beulich 2 weeks ago
Also use this for AVX512VL VPRO{L,R}{,V}D, as well as for some further
shift testing.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
v8: New.

--- a/tools/tests/x86_emulator/Makefile
+++ b/tools/tests/x86_emulator/Makefile
@@ -20,8 +20,9 @@ SIMD := 3dnow sse sse2 sse4 avx avx2 xop
  FMA := fma4 fma
  SG := avx2-sg avx512f-sg avx512vl-sg
  AES := ssse3-aes avx-aes avx2-vaes avx512bw-vaes
+SHA := sse4-sha avx-sha avx512f-sha
  GF := sse2-gf avx2-gf avx512bw-gf
-TESTCASES := blowfish $(SIMD) $(FMA) $(SG) $(AES) $(GF)
+TESTCASES := blowfish $(SIMD) $(FMA) $(SG) $(AES) $(SHA) $(GF)
  
  OPMASK := avx512f avx512dq avx512bw
  
@@ -148,6 +149,10 @@ define simd-aes-defs
  $(1)-cflags := $(foreach vec,$($(patsubst %-aes,sse,$(1))-vecs) $($(patsubst %-vaes,%,$(1))-vecs), \
  	         "-D_$(vec) -maes $(addprefix -m,$(subst -,$(space),$(1))) $(call non-sse,$(1)) -Os -DVEC_SIZE=$(vec)")
  endef
+define simd-sha-defs
+$(1)-cflags := $(foreach vec,$(sse-vecs), \
+	         "-D_$(vec) $(addprefix -m,$(subst -,$(space),$(1))) -Os -DVEC_SIZE=$(vec)")
+endef
  define simd-gf-defs
  $(1)-cflags := $(foreach vec,$($(1:-gf=)-vecs), \
  	         "-D_$(vec) -mgfni -m$(1:-gf=) $(call non-sse,$(1)) -Os -DVEC_SIZE=$(vec)")
@@ -159,6 +164,7 @@ endef
  $(foreach flavor,$(SIMD) $(FMA),$(eval $(call simd-defs,$(flavor))))
  $(foreach flavor,$(SG),$(eval $(call simd-sg-defs,$(flavor))))
  $(foreach flavor,$(AES),$(eval $(call simd-aes-defs,$(flavor))))
+$(foreach flavor,$(SHA),$(eval $(call simd-sha-defs,$(flavor))))
  $(foreach flavor,$(GF),$(eval $(call simd-gf-defs,$(flavor))))
  $(foreach flavor,$(OPMASK),$(eval $(call opmask-defs,$(flavor))))
  
@@ -212,10 +218,13 @@ $(addsuffix .c,$(SG)):
  $(addsuffix .c,$(AES)):
  	ln -sf simd-aes.c $@
  
+$(addsuffix .c,$(SHA)):
+	ln -sf simd-sha.c $@
+
  $(addsuffix .c,$(GF)):
  	ln -sf simd-gf.c $@
  
-$(addsuffix .h,$(SIMD) $(FMA) $(SG) $(AES) $(GF)): simd.h
+$(addsuffix .h,$(SIMD) $(FMA) $(SG) $(AES) $(SHA) $(GF)): simd.h
  
  xop.h avx512f.h: simd-fma.c
  
--- /dev/null
+++ b/tools/tests/x86_emulator/simd-sha.c
@@ -0,0 +1,392 @@
+#define INT_SIZE 4
+
+#include "simd.h"
+ENTRY(sha_test);
+
+#define SHA(op, a...) __builtin_ia32_sha ## op(a)
+
+#ifdef __AVX512F__
+# define ALL_TRUE (~0ULL >> (64 - ELEM_COUNT))
+# define eq(x, y) (B(pcmpeqd, _mask, x, y, -1) == ALL_TRUE)
+# define blend(x, y, sel) B(movdqa32_, _mask, y, x, sel)
+# define rot_c(f, r, x, n) B(pro ## f ## d, _mask, x, n, undef(), ~0)
+# define rot_s(f, r, x, n) ({ /* gcc does not support embedded broadcast */ \
+    vec_t r_; \
+    asm ( "vpro" #f "vd %2%{1to%c3%}, %1, %0" \
+          : "=v" (r_) \
+          : "v" (x), "m" (n), "i" (ELEM_COUNT) ); \
+    r_; \
+})
+# define rot_v(d, x, n) B(pro ## d ## vd, _mask, x, n, undef(), ~0)
+# define shift_s(d, x, n) ({ \
+    vec_t r_; \
+    asm ( "vps" #d "lvd %2%{1to%c3%}, %1, %0" \
+          : "=v" (r_) \
+          : "v" (x), "m" (n), "i" (ELEM_COUNT) ); \
+    r_; \
+})
+# define vshift(d, x, n) ({ /* gcc does not allow memory operands */ \
+    vec_t r_; \
+    asm ( "vps" #d "ldq %2, %1, %0" \
+          : "=v" (r_) : "m" (x), "i" ((n) * ELEM_SIZE) ); \
+    r_; \
+})
+#else
+# define to_bool(cmp) (__builtin_ia32_pmovmskb128(cmp) == 0xffff)
+# define eq(x, y) to_bool((x) == (y))
+# define blend(x, y, sel) \
+    ((vec_t)__builtin_ia32_pblendw128((vhi_t)(x), (vhi_t)(y), \
+                                      ((sel) & 1 ? 0x03 : 0) | \
+                                      ((sel) & 2 ? 0x0c : 0) | \
+                                      ((sel) & 4 ? 0x30 : 0) | \
+                                      ((sel) & 8 ? 0xc0 : 0)))
+# define rot_c(f, r, x, n) (sh ## f ## _c(x, n) | sh ## r ## _c(x, 32 - (n)))
+# define rot_s(f, r, x, n) ({ /* gcc does not allow memory operands */ \
+    vec_t r_, t_, n_ = (vec_t){ 32 } - (n); \
+    asm ( "ps" #f "ld %2, %0; ps" #r "ld %3, %1; por %1, %0" \
+          : "=&x" (r_), "=&x" (t_) \
+          : "m" (n), "m" (n_), "0" (x), "1" (x) ); \
+    r_; \
+})
+static inline unsigned int rotl(unsigned int x, unsigned int n)
+{
+    return (x << (n & 0x1f)) | (x >> ((32 - n) & 0x1f));
+}
+static inline unsigned int rotr(unsigned int x, unsigned int n)
+{
+    return (x >> (n & 0x1f)) | (x << ((32 - n) & 0x1f));
+}
+# define rot_v(d, x, n) ({ \
+    vec_t t_; \
+    unsigned int i_; \
+    for ( i_ = 0; i_ < ELEM_COUNT; ++i_ ) \
+        t_[i_] = rot ## d((x)[i_], (n)[i_]); \
+    t_; \
+})
+# define shift_s(d, x, n) ({ \
+    vec_t r_; \
+    asm ( "ps" #d "ld %1, %0" : "=&x" (r_) : "m" (n), "0" (x) ); \
+    r_; \
+})
+# define vshift(d, x, n) \
+    (vec_t)(__builtin_ia32_ps ## d ## ldqi128((vdi_t)(x), (n) * ELEM_SIZE * 8))
+#endif
+
+#define alignr(x, y, n) ((vec_t)__builtin_ia32_palignr128((vdi_t)(x), (vdi_t)(y), (n) * 8))
+#define hadd(x, y) __builtin_ia32_phaddd128(x, y)
+#define rol_c(x, n) rot_c(l, r, x, n)
+#define rol_s(x, n) rot_s(l, r, x, n)
+#define rol_v(x, n...) rot_v(l, x, n)
+#define ror_c(x, n) rot_c(r, l, x, n)
+#define ror_s(x, n) rot_s(r, l, x, n)
+#define ror_v(x, n...) rot_v(r, x, n)
+#define shl_c(x, n) __builtin_ia32_pslldi128(x, n)
+#define shl_s(x, n) shift_s(l, x, n)
+#define shr_c(x, n) __builtin_ia32_psrldi128(x, n)
+#define shr_s(x, n) shift_s(r, x, n)
+#define shuf(x, s) __builtin_ia32_pshufd(x, s)
+#define swap(x) shuf(x, 0b00011011)
+#define vshl(x, n) vshift(l, x, n)
+#define vshr(x, n) vshift(r, x, n)
+
+static inline vec_t sha256_sigma0(vec_t w)
+{
+    vec_t res;
+
+    touch(w);
+    res = ror_c(w, 7);
+    touch(w);
+    res ^= rol_c(w, 14);
+    touch(w);
+    res ^= shr_c(w, 3);
+    touch(w);
+
+    return res;
+}
+
+static inline vec_t sha256_sigma1(vec_t w)
+{
+    vec_t _17 = { 17 }, _19 = { 19 }, _10 = { 10 };
+
+    return ror_s(w, _17) ^ ror_s(w, _19) ^ shr_s(w, _10);
+}
+
+static inline vec_t sha256_Sigma0(vec_t w)
+{
+    vec_t res, n1 = { 0, 0, 2, 2 }, n2 = { 0, 0, 13, 13 }, n3 = { 0, 0, 10, 10 };
+
+    touch(n1);
+    res = ror_v(w, n1);
+    touch(n2);
+    res ^= ror_v(w, n2);
+    touch(n3);
+
+    return res ^ rol_v(w, n3);
+}
+
+static inline vec_t sha256_Sigma1(vec_t w)
+{
+    return ror_c(w, 6) ^ ror_c(w, 11) ^ rol_c(w, 7);
+}
+
+int sha_test(void)
+{
+    unsigned int i;
+    vec_t src, one = { 1 };
+    vqi_t raw = {};
+
+    for ( i = 1; i < VEC_SIZE; ++i )
+        raw[i] = i;
+    src = (vec_t)raw;
+
+    for ( i = 0; i < 256; i += VEC_SIZE )
+    {
+        vec_t x, y, tmp, hash = -src;
+        vec_t a, b, c, d, e, g, h;
+        unsigned int k, r;
+
+        touch(src);
+        x = SHA(1msg1, hash, src);
+        touch(src);
+        y = hash ^ alignr(hash, src, 8);
+        touch(src);
+
+        if ( !eq(x, y) ) return __LINE__;
+
+        touch(src);
+        x = SHA(1msg2, hash, src);
+        touch(src);
+        tmp = hash ^ alignr(src, hash, 12);
+        touch(tmp);
+        y = rol_c(tmp, 1);
+        tmp = hash ^ alignr(src, y, 12);
+        touch(tmp);
+        y = rol_c(tmp, 1);
+
+        if ( !eq(x, y) ) return __LINE__;
+
+        touch(src);
+        x = SHA(1msg2, hash, src);
+        touch(src);
+        tmp = rol_s(hash ^ alignr(src, hash, 12), one);
+        y = rol_s(hash ^ alignr(src, tmp, 12), one);
+
+        if ( !eq(x, y) ) return __LINE__;
+
+        touch(src);
+        x = SHA(1nexte, hash, src);
+        touch(src);
+        touch(hash);
+        tmp = rol_c(hash, 30);
+        tmp[2] = tmp[1] = tmp[0] = 0;
+
+        if ( !eq(x, src + tmp) ) return __LINE__;
+
+        /*
+         * SHA1RNDS4
+         *
+         * SRC1 = { A0, B0, C0, D0 }
+         * SRC2 = W' = { W[0]E0, W[1], W[2], W[3] }
+         *
+         * (NB that the notation is not C-like, i.e. elements are listed
+         * high-to-low everywhere in this comment.)
+         *
+         * In order to pick a simple rounds function, an immediate value of
+         * 1 is used; 3 would also be a possibility.
+         *
+         * Applying
+         *
+         * A1 = ROL5(A0) + (B0 ^ C0 ^ D0) + W'[0] + K
+         * E1 = D0
+         * D1 = C0
+         * C1 = ROL30(B0)
+         * B1 = A0
+         *
+         * iteratively four times and resolving round variable values to
+         * A<n> and B0, C0, and D0 we get
+         *
+         * A4 = ROL5(A3) + (A2 ^ ROL30(A1) ^ ROL30(A0)) + W'[3] + ROL30(B0) + K
+         * A3 = ROL5(A2) + (A1 ^ ROL30(A0) ^ ROL30(B0)) + W'[2] +       C0  + K
+         * A2 = ROL5(A1) + (A0 ^ ROL30(B0) ^       C0 ) + W'[1] +       D0  + K
+         * A1 = ROL5(A0) + (B0 ^       C0  ^       D0 ) + W'[0]             + K
+         *
+         * (respective per-column variable names:
+         *  y         a      b          c           d      src           e    k
+         * )
+         *
+         * with
+         *
+         * B4 = A3
+         * C4 = ROL30(A2)
+         * D4 = ROL30(A1)
+         * E4 = ROL30(A0)
+         *
+         * and hence
+         *
+         * DST = { A4, A3, ROL30(A2), ROL30(A1) }
+         */
+
+        touch(src);
+        x = SHA(1rnds4, hash, src, 1);
+        touch(src);
+
+        a = vshr(hash, 3);
+        b = vshr(hash, 2);
+        touch(hash);
+        d = rol_c(hash, 30);
+        touch(hash);
+        d = blend(d, hash, 0b0011);
+        c = vshr(d, 1);
+        e = vshl(d, 1);
+        tmp = (vec_t){};
+        k = rol_c(SHA(1rnds4, tmp, tmp, 1), 2)[0];
+
+        for ( r = 0; r < 4; ++r )
+        {
+            y = rol_c(a, 5) + (b ^ c ^ d) + swap(src) + e + k;
+
+            switch ( r )
+            {
+            case 0:
+                c[3] = rol_c(y, 30)[0];
+                /* fall through */
+            case 1:
+                b[r + 2] = y[r];
+                /* fall through */
+            case 2:
+                a[r + 1] = y[r];
+                break;
+            }
+
+            switch ( r )
+            {
+            case 3:
+                if ( a[3] != y[2] ) return __LINE__;
+                /* fall through */
+            case 2:
+                if ( a[2] != y[1] ) return __LINE__;
+                if ( b[3] != y[1] ) return __LINE__;
+                /* fall through */
+            case 1:
+                if ( a[1] != y[0] ) return __LINE__;
+                if ( b[2] != y[0] ) return __LINE__;
+                if ( c[3] != rol_c(y, 30)[0] ) return __LINE__;
+                break;
+            }
+        }
+
+        a = blend(rol_c(y, 30), y, 0b1100);
+
+        if ( !eq(x, a) ) return __LINE__;
+
+        touch(src);
+        x = SHA(256msg1, hash, src);
+        touch(src);
+        y = hash + sha256_sigma0(alignr(src, hash, 4));
+
+        if ( !eq(x, y) ) return __LINE__;
+
+        touch(src);
+        x = SHA(256msg2, hash, src);
+        touch(src);
+        tmp = hash + sha256_sigma1(alignr(hash, src, 8));
+        y = hash + sha256_sigma1(alignr(tmp, src, 8));
+
+        if ( !eq(x, y) ) return __LINE__;
+
+        /*
+         * SHA256RNDS2
+         *
+         * SRC1 = { C0, D0, G0, H0 }
+         * SRC2 = { A0, B0, E0, F0 }
+         * XMM0 = W' = { ?, ?, WK1, WK0 }
+         *
+         * (NB that the notation again is not C-like, i.e. elements are listed
+         * high-to-low everywhere in this comment.)
+         *
+         * Ch(E,F,G) = (E & F) ^ (~E & G)
+         * Maj(A,B,C) = (A & B) ^ (A & C) ^ (B & C)
+         *
+         * Σ0(A) = ROR2(A) ^ ROR13(A) ^ ROR22(A)
+         * Σ1(E) = ROR6(E) ^ ROR11(E) ^ ROR25(E)
+         *
+         * Applying
+         *
+         * A1 = Ch(E0, F0, G0) + Σ1(E0) + WK0 + H0 + Maj(A0, B0, C0) + Σ0(A0)
+         * B1 = A0
+         * C1 = B0
+         * D1 = C0
+         * E1 = Ch(E0, F0, G0) + Σ1(E0) + WK0 + H0 + D0
+         * F1 = E0
+         * G1 = F0
+         * H1 = G0
+         *
+         * iteratively four times and resolving round variable values to
+         * A<n> / E<n> and B0, C0, D0, F0, G0, and H0 we get
+         *
+         * A2 = Ch(E1, E0, F0) + Σ1(E1) + WK1 + G0 + Maj(A1, A0, B0) + Σ0(A1)
+         * A1 = Ch(E0, F0, G0) + Σ1(E0) + WK0 + H0 + Maj(A0, B0, C0) + Σ0(A0)
+         * E2 = Ch(E1, E0, F0) + Σ1(E1) + WK1 + G0 + C0
+         * E1 = Ch(E0, F0, G0) + Σ1(E0) + WK0 + H0 + D0
+         *
+         * with
+         *
+         * B2 = A1
+         * F2 = E1
+         *
+         * and hence
+         *
+         * DST = { A2, A1, E2, E1 }
+         *
+         * which we can simplify a little, by letting A0, B0, and E0 be zero
+         * and F0 = ~G0, and by then utilizing
+         *
+         * Ch(0, 0, x) = x
+         * Ch(x, 0, y) = ~x & y
+         * Maj(x, 0, 0) = Maj(0, x, 0) = Maj(0, 0, x) = 0
+         *
+         * A2 = (~E1 & F0) + Σ1(E1) + WK1 + G0 + Σ0(A1)
+         * A1 = (~E0 & G0) + Σ1(E0) + WK0 + H0 + Σ0(A0)
+         * E2 = (~E1 & F0) + Σ1(E1) + WK1 + G0 + C0
+         * E1 = (~E0 & G0) + Σ1(E0) + WK0 + H0 + D0
+         *
+         * (respective per-column variable names:
+         *  y      e    g        e    src    h    d
+         * )
+         */
+
+        tmp = (vec_t){ ~hash[1] };
+        touch(tmp);
+        x = SHA(256rnds2, hash, tmp, src);
+        touch(tmp);
+
+        e = y = (vec_t){};
+        d = alignr(y, hash, 8);
+        g = (vec_t){ hash[1], tmp[0], hash[1], tmp[0] };
+        h = shuf(hash, 0b01000100);
+
+        for ( r = 0; r < 2; ++r )
+        {
+            y = (~e & g) + sha256_Sigma1(e) + shuf(src, 0b01000100) +
+                h + sha256_Sigma0(d);
+
+            if ( !r )
+            {
+                d[3] = y[2];
+                e[3] = e[1] = y[0];
+            }
+            else if ( d[3] != y[2] )
+                return __LINE__;
+            else if ( e[1] != y[0] )
+                return __LINE__;
+            else if ( e[3] != y[0] )
+                return __LINE__;
+        }
+
+        if ( !eq(x, y) ) return __LINE__;
+
+        src += 0x01010101 * VEC_SIZE;
+    }
+
+    return 0;
+}
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -14,8 +14,10 @@ asm ( ".pushsection .test, \"ax\", @prog
  #include "sse2-gf.h"
  #include "ssse3-aes.h"
  #include "sse4.h"
+#include "sse4-sha.h"
  #include "avx.h"
  #include "avx-aes.h"
+#include "avx-sha.h"
  #include "fma4.h"
  #include "fma.h"
  #include "avx2.h"
@@ -28,6 +30,7 @@ asm ( ".pushsection .test, \"ax\", @prog
  #include "avx512bw-opmask.h"
  #include "avx512f.h"
  #include "avx512f-sg.h"
+#include "avx512f-sha.h"
  #include "avx512vl-sg.h"
  #include "avx512bw.h"
  #include "avx512bw-vaes.h"
@@ -155,6 +158,21 @@ static bool simd_check_avx512vbmi_vl(voi
      return cpu_has_avx512_vbmi && cpu_has_avx512vl;
  }
  
+static bool simd_check_sse4_sha(void)
+{
+    return cpu_has_sha && cpu_has_sse4_2;
+}
+
+static bool simd_check_avx_sha(void)
+{
+    return cpu_has_sha && cpu_has_avx;
+}
+
+static bool simd_check_avx512f_sha_vl(void)
+{
+    return cpu_has_sha && cpu_has_avx512vl;
+}
+
  static bool simd_check_avx2_vaes(void)
  {
      return cpu_has_aesni && cpu_has_vaes && cpu_has_avx2;
@@ -450,6 +468,9 @@ static const struct {
      AVX512VL(_VBMI+VL u16x8, avx512vbmi,    16u2),
      AVX512VL(_VBMI+VL s16x16, avx512vbmi,   32i2),
      AVX512VL(_VBMI+VL u16x16, avx512vbmi,   32u2),
+    SIMD(SHA,                sse4_sha,        16),
+    SIMD(AVX+SHA,             avx_sha,        16),
+    AVX512VL(VL+SHA,      avx512f_sha,        16),
      SIMD(VAES (VEX/x32),    avx2_vaes,        32),
      SIMD(VAES (EVEX/x64), avx512bw_vaes,      64),
      AVX512VL(VL+VAES (x16), avx512bw_vaes,    16),
--- a/tools/tests/x86_emulator/x86-emulate.h
+++ b/tools/tests/x86_emulator/x86-emulate.h
@@ -142,6 +142,7 @@ static inline bool xcr0_mask(uint64_t ma
  #define cpu_has_avx512_ifma (cp.feat.avx512_ifma && xcr0_mask(0xe6))
  #define cpu_has_avx512er  (cp.feat.avx512er && xcr0_mask(0xe6))
  #define cpu_has_avx512cd  (cp.feat.avx512cd && xcr0_mask(0xe6))
+#define cpu_has_sha        cp.feat.sha
  #define cpu_has_avx512bw  (cp.feat.avx512bw && xcr0_mask(0xe6))
  #define cpu_has_avx512vl  (cp.feat.avx512vl && xcr0_mask(0xe6))
  #define cpu_has_avx512_vbmi (cp.feat.avx512_vbmi && xcr0_mask(0xe6))

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

[Xen-devel] [PATCH v9 17/23] x86emul: support VPCLMULQDQ insns

Posted by Jan Beulich 2 weeks ago
As to the feature dependency adjustment, while strictly speaking AVX is
a sufficient prereq (to have YMM registers), 256-bit vectors of integers
have got fully introduced with AVX2 only. Sadly gcc can't be used as a
reference here: They don't provide any AVX512-independent built-in at
all.

Along the lines of PCLMULQDQ, since the insns here and in particular
their memory access patterns follow the usual scheme, I didn't think it
was necessary to add a contrived test specifically for them, beyond the
Disp8 scaling one.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
v9: Re-base. Make VPCLMULQDQ also depend on PCLMULQDQ.
v8: No need to set fault_suppression to false.
v7: New.

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -591,6 +591,10 @@ static const struct test avx512_vpopcntd
      INSN(popcnt, 66, 0f38, 55, vl, dq, vl)
  };
  
+static const struct test vpclmulqdq_all[] = {
+    INSN(pclmulqdq, 66, 0f3a, 44, vl, q_nb, vl)
+};
+
  static const unsigned char vl_all[] = { VL_512, VL_128, VL_256 };
  static const unsigned char vl_128[] = { VL_128 };
  static const unsigned char vl_no128[] = { VL_512, VL_256 };
@@ -968,4 +972,9 @@ void evex_disp8_test(void *instr, struct
      RUN(avx512_vbmi2, all);
      RUN(avx512_vnni, all);
      RUN(avx512_vpopcntdq, all);
+
+    if ( cpu_has_avx512f )
+    {
+        RUN(vpclmulqdq, all);
+    }
  }
--- a/tools/tests/x86_emulator/x86-emulate.h
+++ b/tools/tests/x86_emulator/x86-emulate.h
@@ -144,6 +144,7 @@ static inline bool xcr0_mask(uint64_t ma
  #define cpu_has_avx512vl  (cp.feat.avx512vl && xcr0_mask(0xe6))
  #define cpu_has_avx512_vbmi (cp.feat.avx512_vbmi && xcr0_mask(0xe6))
  #define cpu_has_avx512_vbmi2 (cp.feat.avx512_vbmi2 && xcr0_mask(0xe6))
+#define cpu_has_vpclmulqdq (cp.feat.vpclmulqdq && xcr0_mask(6))
  #define cpu_has_avx512_vnni (cp.feat.avx512_vnni && xcr0_mask(0xe6))
  #define cpu_has_avx512_bitalg (cp.feat.avx512_bitalg && xcr0_mask(0xe6))
  #define cpu_has_avx512_vpopcntdq (cp.feat.avx512_vpopcntdq && xcr0_mask(0xe6))
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -594,7 +594,7 @@ static const struct ext0f3a_table {
      [0x3e ... 0x3f] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
      [0x40 ... 0x41] = { .simd_size = simd_packed_fp },
      [0x42 ... 0x43] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
-    [0x44] = { .simd_size = simd_packed_int },
+    [0x44] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
      [0x46] = { .simd_size = simd_packed_int },
      [0x48 ... 0x49] = { .simd_size = simd_packed_fp, .four_op = 1 },
      [0x4a ... 0x4b] = { .simd_size = simd_packed_fp, .four_op = 1 },
@@ -1890,6 +1890,7 @@ in_protmode(
  #define vcpu_has_avx512vl()    (ctxt->cpuid->feat.avx512vl)
  #define vcpu_has_avx512_vbmi() (ctxt->cpuid->feat.avx512_vbmi)
  #define vcpu_has_avx512_vbmi2() (ctxt->cpuid->feat.avx512_vbmi2)
+#define vcpu_has_vpclmulqdq()  (ctxt->cpuid->feat.vpclmulqdq)
  #define vcpu_has_avx512_vnni() (ctxt->cpuid->feat.avx512_vnni)
  #define vcpu_has_avx512_bitalg() (ctxt->cpuid->feat.avx512_bitalg)
  #define vcpu_has_avx512_vpopcntdq() (ctxt->cpuid->feat.avx512_vpopcntdq)
@@ -10207,13 +10208,19 @@ x86_emulate(
          goto opmask_shift_imm;
  
      case X86EMUL_OPC_66(0x0f3a, 0x44):     /* pclmulqdq $imm8,xmm/m128,xmm */
-    case X86EMUL_OPC_VEX_66(0x0f3a, 0x44): /* vpclmulqdq $imm8,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x44): /* vpclmulqdq $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
          host_and_vcpu_must_have(pclmulqdq);
          if ( vex.opcx == vex_none )
              goto simd_0f3a_common;
-        generate_exception_if(vex.l, EXC_UD);
+        if ( vex.l )
+            host_and_vcpu_must_have(vpclmulqdq);
          goto simd_0f_imm8_avx;
  
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x44): /* vpclmulqdq $imm8,[xyz]mm/mem,[xyz]mm,[xyz]mm */
+        host_and_vcpu_must_have(vpclmulqdq);
+        generate_exception_if(evex.brs || evex.opmsk, EXC_UD);
+        goto avx512f_imm8_no_sae;
+
      case X86EMUL_OPC_VEX_66(0x0f3a, 0x4a): /* vblendvps {x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
      case X86EMUL_OPC_VEX_66(0x0f3a, 0x4b): /* vblendvpd {x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
          generate_exception_if(vex.w, EXC_UD);
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -111,6 +111,7 @@
  /* CPUID level 0x00000007:0.ecx */
  #define cpu_has_avx512_vbmi     boot_cpu_has(X86_FEATURE_AVX512_VBMI)
  #define cpu_has_avx512_vbmi2    boot_cpu_has(X86_FEATURE_AVX512_VBMI2)
+#define cpu_has_vpclmulqdq      boot_cpu_has(X86_FEATURE_VPCLMULQDQ)
  #define cpu_has_avx512_vnni     boot_cpu_has(X86_FEATURE_AVX512_VNNI)
  #define cpu_has_avx512_bitalg   boot_cpu_has(X86_FEATURE_AVX512_BITALG)
  #define cpu_has_avx512_vpopcntdq boot_cpu_has(X86_FEATURE_AVX512_VPOPCNTDQ)
--- a/xen/include/public/arch-x86/cpufeatureset.h
+++ b/xen/include/public/arch-x86/cpufeatureset.h
@@ -121,7 +121,7 @@ XEN_CPUFEATURE(PBE,           0*32+31) /
  
  /* Intel-defined CPU features, CPUID level 0x00000001.ecx, word 1 */
  XEN_CPUFEATURE(SSE3,          1*32+ 0) /*A  Streaming SIMD Extensions-3 */
-XEN_CPUFEATURE(PCLMULQDQ,     1*32+ 1) /*A  Carry-less mulitplication */
+XEN_CPUFEATURE(PCLMULQDQ,     1*32+ 1) /*A  Carry-less multiplication */
  XEN_CPUFEATURE(DTES64,        1*32+ 2) /*   64-bit Debug Store */
  XEN_CPUFEATURE(MONITOR,       1*32+ 3) /*   Monitor/Mwait support */
  XEN_CPUFEATURE(DSCPL,         1*32+ 4) /*   CPL Qualified Debug Store */
@@ -229,6 +229,7 @@ XEN_CPUFEATURE(UMIP,          6*32+ 2) /
  XEN_CPUFEATURE(PKU,           6*32+ 3) /*H  Protection Keys for Userspace */
  XEN_CPUFEATURE(OSPKE,         6*32+ 4) /*!  OS Protection Keys Enable */
  XEN_CPUFEATURE(AVX512_VBMI2,  6*32+ 6) /*A  Additional AVX-512 Vector Byte Manipulation Instrs */
+XEN_CPUFEATURE(VPCLMULQDQ,    6*32+10) /*A  Vector Carry-less Multiplication Instrs */
  XEN_CPUFEATURE(AVX512_VNNI,   6*32+11) /*A  Vector Neural Network Instrs */
  XEN_CPUFEATURE(AVX512_BITALG, 6*32+12) /*A  Support for VPOPCNT[B,W] and VPSHUFBITQMB */
  XEN_CPUFEATURE(AVX512_VPOPCNTDQ, 6*32+14) /*A  POPCNT for vectors of DW/QW */
--- a/xen/tools/gen-cpuid.py
+++ b/xen/tools/gen-cpuid.py
@@ -254,8 +254,9 @@ def crunch_numbers(state):
  
          # This is just the dependency between AVX512 and AVX2 of XSTATE
          # feature flags.  If want to use AVX512, AVX2 must be supported and
-        # enabled.
-        AVX2: [AVX512F],
+        # enabled.  Certain later extensions, acting on 256-bit vectors of
+        # integers, better depend on AVX2 than AVX.
+        AVX2: [AVX512F, VPCLMULQDQ],
  
          # AVX512F is taken to mean hardware support for 512bit registers
          # (which in practice depends on the EVEX prefix to encode) as well
@@ -270,6 +271,10 @@ def crunch_numbers(state):
          # registers), despite the SDM not formally making this connection.
          AVX512BW: [AVX512_BF16, AVX512_BITALG, AVX512_VBMI, AVX512_VBMI2],
  
+        # Extensions with VEX/EVEX encodings keyed to a separate feature
+        # flag are made dependents of their respective legacy feature.
+        PCLMULQDQ: [VPCLMULQDQ],
+
          # The features:
          #   * Single Thread Indirect Branch Predictors
          #   * Speculative Store Bypass Disable

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

[Xen-devel] [PATCH v9 23/23] x86emul: add a PCLMUL/VPCLMUL test case to the harness

Posted by Jan Beulich 2 weeks ago
Also use this for AVX512_VBMI2 VPSH{L,R}D{,V}{D,Q,W} testing (only the
quad word right shifts get actually used; the assumption is that their
"left" counterparts as well as the double word and word forms then work
as well).

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
v8: New.

--- a/tools/tests/x86_emulator/Makefile
+++ b/tools/tests/x86_emulator/Makefile
@@ -20,9 +20,10 @@ SIMD := 3dnow sse sse2 sse4 avx avx2 xop
  FMA := fma4 fma
  SG := avx2-sg avx512f-sg avx512vl-sg
  AES := ssse3-aes avx-aes avx2-vaes avx512bw-vaes
+CLMUL := ssse3-pclmul avx-pclmul avx2-vpclmulqdq avx512bw-vpclmulqdq avx512vbmi2-vpclmulqdq
  SHA := sse4-sha avx-sha avx512f-sha
  GF := sse2-gf avx2-gf avx512bw-gf
-TESTCASES := blowfish $(SIMD) $(FMA) $(SG) $(AES) $(SHA) $(GF)
+TESTCASES := blowfish $(SIMD) $(FMA) $(SG) $(AES) $(CLMUL) $(SHA) $(GF)
  
  OPMASK := avx512f avx512dq avx512bw
  
@@ -89,6 +90,7 @@ avx512er-flts := 4 8
  avx512vbmi-vecs := $(avx512bw-vecs)
  avx512vbmi-ints := $(avx512bw-ints)
  avx512vbmi-flts := $(avx512bw-flts)
+avx512vbmi2-vecs := $(avx512bw-vecs)
  
  avx512f-opmask-vecs := 2
  avx512dq-opmask-vecs := 1 2
@@ -149,6 +151,10 @@ define simd-aes-defs
  $(1)-cflags := $(foreach vec,$($(patsubst %-aes,sse,$(1))-vecs) $($(patsubst %-vaes,%,$(1))-vecs), \
  	         "-D_$(vec) -maes $(addprefix -m,$(subst -,$(space),$(1))) $(call non-sse,$(1)) -Os -DVEC_SIZE=$(vec)")
  endef
+define simd-clmul-defs
+$(1)-cflags := $(foreach vec,$($(patsubst %-pclmul,sse,$(1))-vecs) $($(patsubst %-vpclmulqdq,%,$(1))-vecs), \
+	         "-D_$(vec) -mpclmul $(addprefix -m,$(subst -,$(space),$(1))) $(call non-sse,$(1)) -Os -DVEC_SIZE=$(vec)")
+endef
  define simd-sha-defs
  $(1)-cflags := $(foreach vec,$(sse-vecs), \
  	         "-D_$(vec) $(addprefix -m,$(subst -,$(space),$(1))) -Os -DVEC_SIZE=$(vec)")
@@ -164,6 +170,7 @@ endef
  $(foreach flavor,$(SIMD) $(FMA),$(eval $(call simd-defs,$(flavor))))
  $(foreach flavor,$(SG),$(eval $(call simd-sg-defs,$(flavor))))
  $(foreach flavor,$(AES),$(eval $(call simd-aes-defs,$(flavor))))
+$(foreach flavor,$(CLMUL),$(eval $(call simd-clmul-defs,$(flavor))))
  $(foreach flavor,$(SHA),$(eval $(call simd-sha-defs,$(flavor))))
  $(foreach flavor,$(GF),$(eval $(call simd-gf-defs,$(flavor))))
  $(foreach flavor,$(OPMASK),$(eval $(call opmask-defs,$(flavor))))
@@ -218,13 +225,16 @@ $(addsuffix .c,$(SG)):
  $(addsuffix .c,$(AES)):
  	ln -sf simd-aes.c $@
  
+$(addsuffix .c,$(CLMUL)):
+	ln -sf simd-clmul.c $@
+
  $(addsuffix .c,$(SHA)):
  	ln -sf simd-sha.c $@
  
  $(addsuffix .c,$(GF)):
  	ln -sf simd-gf.c $@
  
-$(addsuffix .h,$(SIMD) $(FMA) $(SG) $(AES) $(SHA) $(GF)): simd.h
+$(addsuffix .h,$(SIMD) $(FMA) $(SG) $(AES) $(CLMUL) $(SHA) $(GF)): simd.h
  
  xop.h avx512f.h: simd-fma.c
  
--- /dev/null
+++ b/tools/tests/x86_emulator/simd-clmul.c
@@ -0,0 +1,150 @@
+#define UINT_SIZE 8
+
+#include "simd.h"
+ENTRY(clmul_test);
+
+#ifdef __AVX512F__ /* AVX512BW may get enabled only below */
+# define ALL_TRUE (~0ULL >> (64 - ELEM_COUNT))
+# define eq(x, y) (B(pcmpeqq, _mask, (vdi_t)(x), (vdi_t)(y), -1) == ALL_TRUE)
+# define lane_shr_unit(x) \
+    ((vec_t)B(palignr, _mask, (vdi_t)(x), (vdi_t)(x), 64, (vdi_t){}, \
+              0x00ff00ff00ff00ffULL & (~0ULL >> (64 - VEC_SIZE))))
+#else
+# if defined(__AVX2__) && VEC_SIZE == 32
+#  define to_bool(cmp) B(ptestc, , cmp, (vdi_t){} == 0)
+# else
+#  define to_bool(cmp) (__builtin_ia32_pmovmskb128(cmp) == 0xffff)
+# endif
+# define eq(x, y) to_bool((x) == (y))
+# define lane_shr_unit(x) ((vec_t)B(palignr, , (vdi_t){}, (vdi_t)(x), 64))
+#endif
+
+#define CLMUL(op, x, y, c) (vec_t)(__builtin_ia32_ ## op((vdi_t)(x), (vdi_t)(y), c))
+
+#if VEC_SIZE == 16
+# define clmul(x, y, c) CLMUL(pclmulqdq128, x, y, c)
+# define vpshrd __builtin_ia32_vpshrd_v2di
+#elif VEC_SIZE == 32
+# define clmul(x, y, c) CLMUL(vpclmulqdq_v4di, x, y, c)
+# define vpshrd __builtin_ia32_vpshrd_v4di
+#elif VEC_SIZE == 64
+# define clmul(x, y, c) CLMUL(vpclmulqdq_v8di, x, y, c)
+# define vpshrd __builtin_ia32_vpshrd_v8di
+#endif
+
+#define clmul_ll(x, y) clmul(x, y, 0x00)
+#define clmul_hl(x, y) clmul(x, y, 0x01)
+#define clmul_lh(x, y) clmul(x, y, 0x10)
+#define clmul_hh(x, y) clmul(x, y, 0x11)
+
+#if defined(__AVX512VBMI2__)
+# pragma GCC target ( "avx512bw" )
+# define lane_shr_i(x, n) ({ \
+    vec_t h_ = lane_shr_unit(x); \
+    touch(h_); \
+    (n) < 64 ? (vec_t)vpshrd((vdi_t)(x), (vdi_t)(h_), n) : h_ >> ((n) - 64); \
+})
+# define lane_shr_v(x, n) ({ \
+    vec_t t_ = (x), h_ = lane_shr_unit(x); \
+    typeof(t_[0]) n_ = (n); \
+    if ( (n) < 64 ) \
+        /* gcc does not support embedded broadcast */ \
+        asm ( "vpshrdvq %2%{1to%c3%}, %1, %0" \
+              : "+v" (t_) : "v" (h_), "m" (n_), "i" (ELEM_COUNT) ); \
+    else \
+        t_ = h_ >> ((n) - 64); \
+    t_; \
+})
+#else
+# define lane_shr_i lane_shr_v
+# define lane_shr_v(x, n) ({ \
+    vec_t t_ = (n) > 0 ? lane_shr_unit(x) : (x); \
+    (n) < 64 ? ((x) >> (n)) | (t_ << (-(n) & 0x3f)) \
+             : t_ >> ((n) - 64); \
+})
+#endif
+
+int clmul_test(void)
+{
+    unsigned int i;
+    vec_t src;
+    vqi_t raw = {};
+
+    for ( i = 1; i < VEC_SIZE; ++i )
+        raw[i] = i;
+    src = (vec_t)raw;
+
+    for ( i = 0; i < 256; i += VEC_SIZE )
+    {
+        vec_t x = {}, y, z, lo, hi;
+        unsigned int j;
+
+        touch(x);
+        y = clmul_ll(src, x);
+        touch(x);
+
+        if ( !eq(y, x) ) return __LINE__;
+
+        for ( j = 0; j < ELEM_COUNT; j += 2 )
+            x[j] = 1;
+
+        touch(src);
+        y = clmul_ll(x, src);
+        touch(src);
+        z = clmul_lh(x, src);
+        touch(src);
+
+        for ( j = 0; j < ELEM_COUNT; j += 2 )
+            y[j + 1] = z[j];
+
+        if ( !eq(y, src) ) return __LINE__;
+
+        /*
+         * Besides the obvious property of the low and high half products
+         * being the same either direction, the "square" of a number has the
+         * property of simply being the original bit pattern with a zero bit
+         * inserted between any two bits. This is what the code below checks.
+         */
+
+        x = src;
+        touch(src);
+        y = clmul_lh(x, src);
+        touch(src);
+        z = clmul_hl(x, src);
+
+        if ( !eq(y, z) ) return __LINE__;
+
+        touch(src);
+        y = lo = clmul_ll(x, src);
+        touch(src);
+        z = hi = clmul_hh(x, src);
+        touch(src);
+
+        for ( j = 0; j < 64; ++j )
+        {
+            vec_t l = lane_shr_v(lo, 2 * j);
+            vec_t h = lane_shr_v(hi, 2 * j);
+            unsigned int n;
+
+            if ( !eq(l, y) ) return __LINE__;
+            if ( !eq(h, z) ) return __LINE__;
+
+            x = src >> j;
+
+            for ( n = 0; n < ELEM_COUNT; n += 2 )
+            {
+                if ( (x[n + 0] & 1) != (l[n] & 3) ) return __LINE__;
+                if ( (x[n + 1] & 1) != (h[n] & 3) ) return __LINE__;
+            }
+
+            touch(y);
+            y = lane_shr_i(y, 2);
+            touch(z);
+            z = lane_shr_i(z, 2);
+        }
+
+        src += 0x0101010101010101ULL * VEC_SIZE;
+    }
+
+    return 0;
+}
--- a/tools/tests/x86_emulator/simd.h
+++ b/tools/tests/x86_emulator/simd.h
@@ -381,6 +381,7 @@ OVR(movntdq);
  OVR(movntdqa);
  OVR(movshdup);
  OVR(movsldup);
+OVR(pclmulqdq);
  OVR(permd);
  OVR(permq);
  OVR(pmovsxbd);
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -13,16 +13,19 @@ asm ( ".pushsection .test, \"ax\", @prog
  #include "sse2.h"
  #include "sse2-gf.h"
  #include "ssse3-aes.h"
+#include "ssse3-pclmul.h"
  #include "sse4.h"
  #include "sse4-sha.h"
  #include "avx.h"
  #include "avx-aes.h"
+#include "avx-pclmul.h"
  #include "avx-sha.h"
  #include "fma4.h"
  #include "fma.h"
  #include "avx2.h"
  #include "avx2-sg.h"
  #include "avx2-vaes.h"
+#include "avx2-vpclmulqdq.h"
  #include "avx2-gf.h"
  #include "xop.h"
  #include "avx512f-opmask.h"
@@ -34,10 +37,12 @@ asm ( ".pushsection .test, \"ax\", @prog
  #include "avx512vl-sg.h"
  #include "avx512bw.h"
  #include "avx512bw-vaes.h"
+#include "avx512bw-vpclmulqdq.h"
  #include "avx512bw-gf.h"
  #include "avx512dq.h"
  #include "avx512er.h"
  #include "avx512vbmi.h"
+#include "avx512vbmi2-vpclmulqdq.h"
  
  #define verbose false /* Switch to true for far more logging. */
  
@@ -108,6 +113,16 @@ static bool simd_check_avx_aes(void)
      return cpu_has_aesni && cpu_has_avx;
  }
  
+static bool simd_check_ssse3_pclmul(void)
+{
+    return cpu_has_pclmulqdq && cpu_has_ssse3;
+}
+
+static bool simd_check_avx_pclmul(void)
+{
+    return cpu_has_pclmulqdq && cpu_has_avx;
+}
+
  static bool simd_check_avx512f(void)
  {
      return cpu_has_avx512f;
@@ -189,6 +204,31 @@ static bool simd_check_avx512bw_vaes_vl(
             cpu_has_avx512bw && cpu_has_avx512vl;
  }
  
+static bool simd_check_avx2_vpclmulqdq(void)
+{
+    return cpu_has_vpclmulqdq && cpu_has_avx2;
+}
+
+static bool simd_check_avx512bw_vpclmulqdq(void)
+{
+    return cpu_has_vpclmulqdq && cpu_has_avx512bw;
+}
+
+static bool simd_check_avx512bw_vpclmulqdq_vl(void)
+{
+    return cpu_has_vpclmulqdq && cpu_has_avx512bw && cpu_has_avx512vl;
+}
+
+static bool simd_check_avx512vbmi2_vpclmulqdq(void)
+{
+    return cpu_has_avx512_vbmi2 && simd_check_avx512bw_vpclmulqdq();
+}
+
+static bool simd_check_avx512vbmi2_vpclmulqdq_vl(void)
+{
+    return cpu_has_avx512_vbmi2 && simd_check_avx512bw_vpclmulqdq_vl();
+}
+
  static bool simd_check_sse2_gf(void)
  {
      return cpu_has_gfni && cpu_has_sse2;
@@ -369,6 +409,8 @@ static const struct {
      SIMD(XOP i64x4,               xop,      32i8),
      SIMD(AES (legacy),      ssse3_aes,        16),
      SIMD(AES (VEX/x16),       avx_aes,        16),
+    SIMD(PCLMUL (legacy), ssse3_pclmul,       16),
+    SIMD(PCLMUL (VEX/x2),  avx_pclmul,        16),
      SIMD(OPMASK/w,     avx512f_opmask,         2),
      SIMD(OPMASK+DQ/b, avx512dq_opmask,         1),
      SIMD(OPMASK+DQ/w, avx512dq_opmask,         2),
@@ -475,6 +517,13 @@ static const struct {
      SIMD(VAES (EVEX/x64), avx512bw_vaes,      64),
      AVX512VL(VL+VAES (x16), avx512bw_vaes,    16),
      AVX512VL(VL+VAES (x32), avx512bw_vaes,    32),
+    SIMD(VPCLMUL (VEX/x4), avx2_vpclmulqdq,  32),
+    SIMD(VPCLMUL (EVEX/x8), avx512bw_vpclmulqdq, 64),
+    AVX512VL(VL+VPCLMUL (x4), avx512bw_vpclmulqdq, 16),
+    AVX512VL(VL+VPCLMUL (x8), avx512bw_vpclmulqdq, 32),
+    SIMD(AVX512_VBMI2+VPCLMUL (x8), avx512vbmi2_vpclmulqdq, 64),
+    AVX512VL(_VBMI2+VL+VPCLMUL (x2), avx512vbmi2_vpclmulqdq, 16),
+    AVX512VL(_VBMI2+VL+VPCLMUL (x4), avx512vbmi2_vpclmulqdq, 32),
      SIMD(GFNI (legacy),       sse2_gf,        16),
      SIMD(GFNI (VEX/x16),      avx2_gf,        16),
      SIMD(GFNI (VEX/x32),      avx2_gf,        32),
--- a/tools/tests/x86_emulator/x86-emulate.h
+++ b/tools/tests/x86_emulator/x86-emulate.h
@@ -125,6 +125,7 @@ static inline bool xcr0_mask(uint64_t ma
  #define cpu_has_sse        cp.basic.sse
  #define cpu_has_sse2       cp.basic.sse2
  #define cpu_has_sse3       cp.basic.sse3
+#define cpu_has_pclmulqdq  cp.basic.pclmulqdq
  #define cpu_has_ssse3      cp.basic.ssse3
  #define cpu_has_fma       (cp.basic.fma && xcr0_mask(6))
  #define cpu_has_sse4_1     cp.basic.sse4_1

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel