[PATCH v10 0/9] x86emul: further work

[PATCH v10 0/9] x86emul: further work
Posted by Jan Beulich 3 years, 11 months ago
The first two patches are bug fixes, in part pointed out by the
3rd patch. The remainder is further enabling.

1: address x86_insn_is_mem_{access,write}() omissions 
2: rework CMP and TEST emulation
3: also test decoding and mem access / write logic
4: disable FPU/MMX/SIMD insn emulation when !HVM
5: support MOVDIR{I,64B} insns
6: support ENQCMD insn
7: support FNSTENV and FNSAVE
8: support FLDENV and FRSTOR
9: support FXSAVE/FXRSTOR

Main changes from v9 are several fixes in patch 1 and the new
patch 2, both a result of the new patch 3. For other changes
see the individual patches.

Jan

[PATCH v10 1/9] x86emul: address x86_insn_is_mem_{access,write}() omissions
Posted by Jan Beulich 3 years, 11 months ago
First of all explain in comments what the functions' purposes are. Then
make them actually match their comments.

Note that fc6fa977be54 ("x86emul: extend x86_insn_is_mem_write()
coverage") didn't actually fix the function's behavior for {,V}STMXCSR:
Both are covered by generic code higher up in the function, due to
x86_decode_twobyte() already doing suitable adjustments. And VSTMXCSR
wouldn't have been covered anyway without a further X86EMUL_OPC_VEX()
case label. Keep the inner case label in a comment for reference.
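
As a usage illustration (not part of the change itself), a consumer of
these predicates would decode first and then query the resulting state;
the test harness added later in this series does exactly this:

    /* Sketch only; ctxt / fetch setup omitted. */
    struct x86_emulate_state *s = x86_decode_insn(ctxt, fetch);

    if ( x86_insn_is_mem_access(s, ctxt) )
    {
        /* Explicit memory operand, including stack accesses (PUSH etc). */
        if ( x86_insn_is_mem_write(s, ctxt) )
            /* ... treat as a write, e.g. veto or log it ... */;
    }
    /* Implicit accesses (descriptor table reads etc.) are not reported. */

    x86_emulate_free_state(s);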

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v10: Move ARPL case to earlier switch() in x86_insn_is_mem_write(). Make
     group 5 handling actually work there. Drop VMPTRST case. Also
     handle CLFLUSH*, CLWB, UDn, and remaining PREFETCH* in
     x86_insn_is_mem_access().
v9: New.

--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -11474,25 +11474,87 @@ x86_insn_operand_ea(const struct x86_emu
     return state->ea.mem.off;
 }
 
+/*
+ * This function means to return 'true' for all supported insns with explicit
+ * accesses to memory.  This means also insns which don't have an explicit
+ * memory operand (like POP), but it does not mean e.g. segment selector
+ * loads, where the descriptor table access is considered an implicit one.
+ */
 bool
 x86_insn_is_mem_access(const struct x86_emulate_state *state,
                        const struct x86_emulate_ctxt *ctxt)
 {
+    if ( mode_64bit() && state->not_64bit )
+        return false;
+
     if ( state->ea.type == OP_MEM )
-        return ctxt->opcode != 0x8d /* LEA */ &&
-               (ctxt->opcode != X86EMUL_OPC(0x0f, 0x01) ||
-                (state->modrm_reg & 7) != 7) /* INVLPG */;
+    {
+        switch ( ctxt->opcode )
+        {
+        case 0x8d: /* LEA */
+        case X86EMUL_OPC(0x0f, 0x0d): /* PREFETCH */
+        case X86EMUL_OPC(0x0f, 0x18)
+         ... X86EMUL_OPC(0x0f, 0x1f): /* NOP space */
+        case X86EMUL_OPC_66(0x0f, 0x18)
+         ... X86EMUL_OPC_66(0x0f, 0x1f): /* NOP space */
+        case X86EMUL_OPC_F3(0x0f, 0x18)
+         ... X86EMUL_OPC_F3(0x0f, 0x1f): /* NOP space */
+        case X86EMUL_OPC_F2(0x0f, 0x18)
+         ... X86EMUL_OPC_F2(0x0f, 0x1f): /* NOP space */
+        case X86EMUL_OPC(0x0f, 0xb9): /* UD1 */
+        case X86EMUL_OPC(0x0f, 0xff): /* UD0 */
+            return false;
+
+        case X86EMUL_OPC(0x0f, 0x01):
+            return (state->modrm_reg & 7) != 7; /* INVLPG */
+
+        case X86EMUL_OPC(0x0f, 0xae):
+            return (state->modrm_reg & 7) != 7; /* CLFLUSH */
+
+        case X86EMUL_OPC_66(0x0f, 0xae):
+            return (state->modrm_reg & 7) < 6; /* CLWB, CLFLUSHOPT */
+        }
+
+        return true;
+    }
 
     switch ( ctxt->opcode )
     {
+    case 0x06 ... 0x07: /* PUSH / POP %es */
+    case 0x0e:          /* PUSH %cs */
+    case 0x16 ... 0x17: /* PUSH / POP %ss */
+    case 0x1e ... 0x1f: /* PUSH / POP %ds */
+    case 0x50 ... 0x5f: /* PUSH / POP reg */
+    case 0x60 ... 0x61: /* PUSHA / POPA */
+    case 0x68: case 0x6a: /* PUSH imm */
     case 0x6c ... 0x6f: /* INS / OUTS */
+    case 0x8f:          /* POP r/m */
+    case 0x9a:          /* CALL (far, direct) */
+    case 0x9c ... 0x9d: /* PUSHF / POPF */
     case 0xa4 ... 0xa7: /* MOVS / CMPS */
     case 0xaa ... 0xaf: /* STOS / LODS / SCAS */
+    case 0xc2 ... 0xc3: /* RET (near) */
+    case 0xc8 ... 0xc9: /* ENTER / LEAVE */
+    case 0xca ... 0xcb: /* RET (far) */
     case 0xd7:          /* XLAT */
+    case 0xe8:          /* CALL (near, direct) */
+    case X86EMUL_OPC(0x0f, 0xa0):         /* PUSH %fs */
+    case X86EMUL_OPC(0x0f, 0xa1):         /* POP %fs */
+    case X86EMUL_OPC(0x0f, 0xa8):         /* PUSH %gs */
+    case X86EMUL_OPC(0x0f, 0xa9):         /* POP %gs */
     CASE_SIMD_PACKED_INT_VEX(0x0f, 0xf7): /* MASKMOV{Q,DQU} */
                                           /* VMASKMOVDQU */
         return true;
 
+    case 0xff:
+        switch ( state->modrm_reg & 7 )
+        {
+        case 2: /* CALL (near, indirect) */
+        case 6: /* PUSH r/m */
+            return true;
+        }
+        break;
+
     case X86EMUL_OPC(0x0f, 0x01):
         /* Cover CLZERO. */
         return (state->modrm_rm & 7) == 4 && (state->modrm_reg & 7) == 7;
@@ -11501,10 +11563,20 @@ x86_insn_is_mem_access(const struct x86_
     return false;
 }
 
+/*
+ * This function means to return 'true' for all supported insns with explicit
+ * writes to memory.  This means also insns which don't have an explicit
+ * memory operand (like PUSH), but it does not mean e.g. segment selector
+ * loads, where the (possible) descriptor table write is considered an
+ * implicit access.
+ */
 bool
 x86_insn_is_mem_write(const struct x86_emulate_state *state,
                       const struct x86_emulate_ctxt *ctxt)
 {
+    if ( mode_64bit() && state->not_64bit )
+        return false;
+
     switch ( state->desc & DstMask )
     {
     case DstMem:
@@ -11516,19 +11588,48 @@ x86_insn_is_mem_write(const struct x86_e
         break;
 
     default:
+        switch ( ctxt->opcode )
+        {
+        case 0x63:                         /* ARPL */
+            return !mode_64bit();
+        }
+
         return false;
     }
 
     if ( state->modrm_mod == 3 )
-        /* CLZERO is the odd one. */
-        return ctxt->opcode == X86EMUL_OPC(0x0f, 0x01) &&
-               (state->modrm_rm & 7) == 4 && (state->modrm_reg & 7) == 7;
+    {
+        switch ( ctxt->opcode )
+        {
+        case 0xff: /* Grp5 */
+            break;
+
+        case X86EMUL_OPC(0x0f, 0x01): /* CLZERO is the odd one. */
+            return (state->modrm_rm & 7) == 4 && (state->modrm_reg & 7) == 7;
+
+        default:
+            return false;
+        }
+    }
 
     switch ( ctxt->opcode )
     {
+    case 0x06:                           /* PUSH %es */
+    case 0x0e:                           /* PUSH %cs */
+    case 0x16:                           /* PUSH %ss */
+    case 0x1e:                           /* PUSH %ds */
+    case 0x50 ... 0x57:                  /* PUSH reg */
+    case 0x60:                           /* PUSHA */
+    case 0x68: case 0x6a:                /* PUSH imm */
     case 0x6c: case 0x6d:                /* INS */
+    case 0x9a:                           /* CALL (far, direct) */
+    case 0x9c:                           /* PUSHF */
     case 0xa4: case 0xa5:                /* MOVS */
     case 0xaa: case 0xab:                /* STOS */
+    case 0xc8:                           /* ENTER */
+    case 0xe8:                           /* CALL (near, direct) */
+    case X86EMUL_OPC(0x0f, 0xa0):        /* PUSH %fs */
+    case X86EMUL_OPC(0x0f, 0xa8):        /* PUSH %gs */
     case X86EMUL_OPC(0x0f, 0xab):        /* BTS */
     case X86EMUL_OPC(0x0f, 0xb3):        /* BTR */
     case X86EMUL_OPC(0x0f, 0xbb):        /* BTC */
@@ -11586,6 +11687,16 @@ x86_insn_is_mem_write(const struct x86_e
         }
         break;
 
+    case 0xff:
+        switch ( state->modrm_reg & 7 )
+        {
+        case 2: /* CALL (near, indirect) */
+        case 3: /* CALL (far, indirect) */
+        case 6: /* PUSH r/m */
+            return true;
+        }
+        break;
+
     case X86EMUL_OPC(0x0f, 0x01):
         switch ( state->modrm_reg & 7 )
         {
@@ -11600,7 +11711,7 @@ x86_insn_is_mem_write(const struct x86_e
         switch ( state->modrm_reg & 7 )
         {
         case 0: /* FXSAVE */
-        case 3: /* {,V}STMXCSR */
+        /* case 3: STMXCSR - handled above */
         case 4: /* XSAVE */
         case 6: /* XSAVEOPT */
             return true;
@@ -11616,7 +11727,6 @@ x86_insn_is_mem_write(const struct x86_e
         case 1: /* CMPXCHG{8,16}B */
         case 4: /* XSAVEC */
         case 5: /* XSAVES */
-        case 7: /* VMPTRST */
             return true;
         }
         break;


Re: [PATCH v10 1/9] x86emul: address x86_insn_is_mem_{access, write}() omissions
Posted by Andrew Cooper 3 years, 11 months ago
On 25/05/2020 15:26, Jan Beulich wrote:
> First of all explain in comments what the functions' purposes are. Then
> make them actually match their comments.
>
> Note that fc6fa977be54 ("x86emul: extend x86_insn_is_mem_write()
> coverage") didn't actually fix the function's behavior for {,V}STMXCSR:
> Both are covered by generic code higher up in the function, due to
> x86_decode_twobyte() already doing suitable adjustments. And VSTMXCSR
> wouldn't have been covered anyway without a further X86EMUL_OPC_VEX()
> case label. Keep the inner case label in a comment for reference.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> v10: Move ARPL case to earlier switch() in x86_insn_is_mem_write(). Make
>      group 5 handling actually work there. Drop VMPTRST case. Also
>      handle CLFLUSH*, CLWB, UDn, and remaining PREFETCH* in
>      x86_insn_is_mem_access().
> v9: New.
>
> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
> @@ -11474,25 +11474,87 @@ x86_insn_operand_ea(const struct x86_emu
>      return state->ea.mem.off;
>  }
>  
> +/*
> + * This function means to return 'true' for all supported insns with explicit
> + * accesses to memory.  This means also insns which don't have an explicit
> + * memory operand (like POP), but it does not mean e.g. segment selector
> + * loads, where the descriptor table access is considered an implicit one.
> + */
>  bool
>  x86_insn_is_mem_access(const struct x86_emulate_state *state,
>                         const struct x86_emulate_ctxt *ctxt)
>  {
> +    if ( mode_64bit() && state->not_64bit )
> +        return false;

Is this path actually used?  state->not_64bit ought to fail instruction
decode, at which point we wouldn't have a valid state to be used here.

Everything else looks ok, so Acked-by: Andrew Cooper
<andrew.cooper3@citrix.com>

Re: [PATCH v10 1/9] x86emul: address x86_insn_is_mem_{access, write}() omissions
Posted by Jan Beulich 3 years, 11 months ago
On 29.05.2020 14:18, Andrew Cooper wrote:
> On 25/05/2020 15:26, Jan Beulich wrote:
>> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
>> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
>> @@ -11474,25 +11474,87 @@ x86_insn_operand_ea(const struct x86_emu
>>      return state->ea.mem.off;
>>  }
>>  
>> +/*
>> + * This function means to return 'true' for all supported insns with explicit
>> + * accesses to memory.  This means also insns which don't have an explicit
>> + * memory operand (like POP), but it does not mean e.g. segment selector
>> + * loads, where the descriptor table access is considered an implicit one.
>> + */
>>  bool
>>  x86_insn_is_mem_access(const struct x86_emulate_state *state,
>>                         const struct x86_emulate_ctxt *ctxt)
>>  {
>> +    if ( mode_64bit() && state->not_64bit )
>> +        return false;
> 
> Is this path actually used?

Yes, it is. It's only x86_emulate() which has

    generate_exception_if(state->not_64bit && mode_64bit(), EXC_UD);

right now.

> state->not_64bit ought to fail instruction
> decode, at which point we wouldn't have a valid state to be used here.

x86_decode() currently doesn't have much raising of #UD at all, I
think. If it wasn't like this, the not_64bit field wouldn't be
needed - it's used only to communicate from decode to execute.
We're not entirely consistent with this though, seeing in
x86_decode_onebyte(), a few lines below the block of case labels
setting that field,

    case 0x9a: /* call (far, absolute) */
    case 0xea: /* jmp (far, absolute) */
        generate_exception_if(mode_64bit(), EXC_UD);

We could certainly drop the field and raise #UD during decode
already, but don't you think we then should do so for all
encodings that ultimately lead to #UD, e.g. also for AVX insns
without AVX available to the guest? This would make
x86_decode() quite a bit larger, as it would then also need to
have a giant switch() (or something else that's suitable to
cover all cases).
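
To recap, the current split (abridged from the existing code) is that
decode merely records the fact, while the #UD gets raised only during
execution:

    /* x86_decode_onebyte(): */
    case 0x60: /* pusha */
        /* ... further case labels ... */
        state->not_64bit = true;
        break;

    /* x86_emulate(): */
    generate_exception_if(state->not_64bit && mode_64bit(), EXC_UD);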

> Everything else looks ok, so Acked-by: Andrew Cooper
> <andrew.cooper3@citrix.com>

Thanks.

Jan

Re: [PATCH v10 1/9] x86emul: address x86_insn_is_mem_{access, write}() omissions
Posted by Andrew Cooper 3 years, 11 months ago
On 29/05/2020 14:29, Jan Beulich wrote:
> On 29.05.2020 14:18, Andrew Cooper wrote:
>> On 25/05/2020 15:26, Jan Beulich wrote:
>>> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
>>> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
>>> @@ -11474,25 +11474,87 @@ x86_insn_operand_ea(const struct x86_emu
>>>      return state->ea.mem.off;
>>>  }
>>>  
>>> +/*
>>> + * This function means to return 'true' for all supported insns with explicit
>>> + * accesses to memory.  This means also insns which don't have an explicit
>>> + * memory operand (like POP), but it does not mean e.g. segment selector
>>> + * loads, where the descriptor table access is considered an implicit one.
>>> + */
>>>  bool
>>>  x86_insn_is_mem_access(const struct x86_emulate_state *state,
>>>                         const struct x86_emulate_ctxt *ctxt)
>>>  {
>>> +    if ( mode_64bit() && state->not_64bit )
>>> +        return false;
>> Is this path actually used?
> Yes, it is. It's only x86_emulate() which has
>
>     generate_exception_if(state->not_64bit && mode_64bit(), EXC_UD);
>
> right now.

Oh.  That is a bit awkward.

>> state->not_64bit ought to fail instruction
>> decode, at which point we wouldn't have a valid state to be used here.
> x86_decode() currently doesn't have much raising of #UD at all, I
> think. If it wasn't like this, the not_64bit field wouldn't be
> needed - it's used only to communicate from decode to execute.
> We're not entirely consistent with this though, seeing in
> x86_decode_onebyte(), a few lines below the block of case labels
> setting that field,
>
>     case 0x9a: /* call (far, absolute) */
>     case 0xea: /* jmp (far, absolute) */
>         generate_exception_if(mode_64bit(), EXC_UD);

This is because there is no legitimate way to determine the end of the
instruction.

Most of the not_64bit instructions do have a well-defined end, even if
they aren't permitted for use.

> We could certainly drop the field and raise #UD during decode
> already, but don't you think we then should do so for all
> encodings that ultimately lead to #UD, e.g. also for AVX insns
> without AVX available to the guest? This would make
> x86_decode() quite a bit larger, as it would then also need to
> have a giant switch() (or something else that's suitable to
> cover all cases).

I think there is a semantic difference between "we can't parse the
instruction", and "we can parse it, but it's not legitimate in this
context".

I don't think our exact split is correct, but I don't think moving all
#UD checking into x86_decode() is correct either.

~Andrew

Re: [PATCH v10 1/9] x86emul: address x86_insn_is_mem_{access, write}() omissions
Posted by Jan Beulich 3 years, 11 months ago
On 29.05.2020 17:03, Andrew Cooper wrote:
> On 29/05/2020 14:29, Jan Beulich wrote:
>> On 29.05.2020 14:18, Andrew Cooper wrote:
>>> On 25/05/2020 15:26, Jan Beulich wrote:
>>>> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
>>>> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
>>>> @@ -11474,25 +11474,87 @@ x86_insn_operand_ea(const struct x86_emu
>>>>      return state->ea.mem.off;
>>>>  }
>>>>  
>>>> +/*
>>>> + * This function means to return 'true' for all supported insns with explicit
>>>> + * accesses to memory.  This means also insns which don't have an explicit
>>>> + * memory operand (like POP), but it does not mean e.g. segment selector
>>>> + * loads, where the descriptor table access is considered an implicit one.
>>>> + */
>>>>  bool
>>>>  x86_insn_is_mem_access(const struct x86_emulate_state *state,
>>>>                         const struct x86_emulate_ctxt *ctxt)
>>>>  {
>>>> +    if ( mode_64bit() && state->not_64bit )
>>>> +        return false;
>>> Is this path actually used?
>> Yes, it is. It's only x86_emulate() which has
>>
>>     generate_exception_if(state->not_64bit && mode_64bit(), EXC_UD);
>>
>> right now.
> 
> Oh.  That is a bit awkward.
> 
>>> state->not_64bit ought to fail instruction
>>> decode, at which point we wouldn't have a valid state to be used here.
>> x86_decode() currently doesn't have much raising of #UD at all, I
>> think. If it wasn't like this, the not_64bit field wouldn't be
>> needed - it's used only to communicate from decode to execute.
>> We're not entirely consistent with this though, seeing in
>> x86_decode_onebyte(), a few lines below the block of case labels
>> setting that field,
>>
>>     case 0x9a: /* call (far, absolute) */
>>     case 0xea: /* jmp (far, absolute) */
>>         generate_exception_if(mode_64bit(), EXC_UD);
> 
> This is because there is no legitimate way to determine the end of the
> instruction.
> 
> Most of the not_64bit instructions do have a well-defined end, even if
> they aren't permitted for use.

I wouldn't bet on that. Just look at the re-use of opcode D6
(salc in 32-bit) by L1OM for an extreme example. There's nothing
we can say on how any of the reserved opcodes may get re-used.
Prior to AVX / AVX512 we'd have estimated C4, C5, and 62 wrongly
as well (but that's true independent of execution mode).

>> We could certainly drop the field and raise #UD during decode
>> already, but don't you think we then should do so for all
>> encodings that ultimately lead to #UD, e.g. also for AVX insns
>> without AVX available to the guest? This would make
>> x86_decode() quite a bit larger, as it would then also need to
>> have a giant switch() (or something else that's suitable to
>> cover all cases).
> 
> I think there is a semantic difference between "we can't parse the
> instruction", and "we can parse it, but it's not legitimate in this
> context".
> 
> I don't think our exact split is correct, but I don't think moving all
> #UD checking into x86_decode() is correct either.

Do you have any clear, sufficiently simple rule in mind for where
we could draw the boundary?

Jan

[PATCH v10 2/9] x86emul: rework CMP and TEST emulation
Posted by Jan Beulich 3 years, 11 months ago
Unlike similarly encoded insns these don't write their memory operands,
and hence x86_insn_is_mem_write() should return false for them. However,
rather than adding special logic there, rework how their emulation gets
done, by making decoding attributes properly describe the r/o nature of
their memory operands.

Note how this also allows dropping custom LOCK prefix checks.
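
To illustrate (abridged from the hunks below): for opcodes 0x38 and 0x39
the table entries themselves change from DstMem|SrcReg to DstReg|SrcMem,
while for the Grp1 / Grp3 sub-cases the attributes get overridden during
decode, e.g.

    case 0x80: case 0x81: case 0x83: /* Grp1 */
        if ( (modrm_reg & 7) == 7 ) /* cmp */
            state->desc = (state->desc & ByteOp) | DstNone | SrcMem;
        break;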

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v10: New.

--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -84,7 +84,7 @@ static const opcode_desc_t opcode_table[
     ByteOp|DstReg|SrcMem|ModRM, DstReg|SrcMem|ModRM,
     ByteOp|DstEax|SrcImm, DstEax|SrcImm, 0, ImplicitOps,
     /* 0x38 - 0x3F */
-    ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM,
+    ByteOp|DstReg|SrcMem|ModRM, DstReg|SrcMem|ModRM,
     ByteOp|DstReg|SrcMem|ModRM, DstReg|SrcMem|ModRM,
     ByteOp|DstEax|SrcImm, DstEax|SrcImm, 0, ImplicitOps,
     /* 0x40 - 0x4F */
@@ -2481,7 +2481,6 @@ x86_decode_onebyte(
     case 0x60: /* pusha */
     case 0x61: /* popa */
     case 0x62: /* bound */
-    case 0x82: /* Grp1 (x86/32 only) */
     case 0xc4: /* les */
     case 0xc5: /* lds */
     case 0xce: /* into */
@@ -2491,6 +2490,14 @@ x86_decode_onebyte(
         state->not_64bit = true;
         break;
 
+    case 0x82: /* Grp1 (x86/32 only) */
+        state->not_64bit = true;
+        /* fall through */
+    case 0x80: case 0x81: case 0x83: /* Grp1 */
+        if ( (modrm_reg & 7) == 7 ) /* cmp */
+            state->desc = (state->desc & ByteOp) | DstNone | SrcMem;
+        break;
+
     case 0x90: /* nop / pause */
         if ( repe_prefix() )
             ctxt->opcode |= X86EMUL_OPC_F3(0, 0);
@@ -2521,6 +2528,11 @@ x86_decode_onebyte(
         imm2 = insn_fetch_type(uint8_t);
         break;
 
+    case 0xf6: case 0xf7: /* Grp3 */
+        if ( !(modrm_reg & 6) ) /* test */
+            state->desc = (state->desc & ByteOp) | DstNone | SrcMem;
+        break;
+
     case 0xff: /* Grp5 */
         switch ( modrm_reg & 7 )
         {
@@ -3928,13 +3940,11 @@ x86_emulate(
         break;
 
     case 0x38: case 0x39: cmp: /* cmp reg,mem */
-        if ( ops->rmw && dst.type == OP_MEM &&
-             (rc = read_ulong(dst.mem.seg, dst.mem.off, &dst.val,
-                              dst.bytes, ctxt, ops)) != X86EMUL_OKAY )
-            goto done;
-        /* fall through */
+        emulate_2op_SrcV("cmp", dst, src, _regs.eflags);
+        dst.type = OP_NONE;
+        break;
+
     case 0x3a ... 0x3d: /* cmp */
-        generate_exception_if(lock_prefix, EXC_UD);
         emulate_2op_SrcV("cmp", src, dst, _regs.eflags);
         dst.type = OP_NONE;
         break;
@@ -4239,7 +4249,9 @@ x86_emulate(
         case 4: goto and;
         case 5: goto sub;
         case 6: goto xor;
-        case 7: goto cmp;
+        case 7:
+            dst.val = imm1;
+            goto cmp;
         }
         break;
 
@@ -5233,11 +5245,8 @@ x86_emulate(
             unsigned long u[2], v;
 
         case 0 ... 1: /* test */
-            generate_exception_if(lock_prefix, EXC_UD);
-            if ( ops->rmw && dst.type == OP_MEM &&
-                 (rc = read_ulong(dst.mem.seg, dst.mem.off, &dst.val,
-                                  dst.bytes, ctxt, ops)) != X86EMUL_OKAY )
-                goto done;
+            dst.val = imm1;
+            dst.bytes = src.bytes;
             goto test;
         case 2: /* not */
             if ( ops->rmw && dst.type == OP_MEM )


Re: [PATCH v10 2/9] x86emul: rework CMP and TEST emulation
Posted by Andrew Cooper 3 years, 11 months ago
On 25/05/2020 15:26, Jan Beulich wrote:
> Unlike similarly encoded insns these don't write their memory operands,

"write to their".

> and hence x86_insn_is_mem_write() should return false for them. However,
> rather than adding special logic there, rework how their emulation gets
> done, by making decoding attributes properly describe the r/o nature of
> their memory operands.

Describe how?  I see you've change the order of operands encoding, but
then override it back?

~Andrew

> Note how this also allows dropping custom LOCK prefix checks.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> v10: New.
>
> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
> @@ -84,7 +84,7 @@ static const opcode_desc_t opcode_table[
>      ByteOp|DstReg|SrcMem|ModRM, DstReg|SrcMem|ModRM,
>      ByteOp|DstEax|SrcImm, DstEax|SrcImm, 0, ImplicitOps,
>      /* 0x38 - 0x3F */
> -    ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM,
> +    ByteOp|DstReg|SrcMem|ModRM, DstReg|SrcMem|ModRM,
>      ByteOp|DstReg|SrcMem|ModRM, DstReg|SrcMem|ModRM,
>      ByteOp|DstEax|SrcImm, DstEax|SrcImm, 0, ImplicitOps,
>      /* 0x40 - 0x4F */
> @@ -2481,7 +2481,6 @@ x86_decode_onebyte(
>      case 0x60: /* pusha */
>      case 0x61: /* popa */
>      case 0x62: /* bound */
> -    case 0x82: /* Grp1 (x86/32 only) */
>      case 0xc4: /* les */
>      case 0xc5: /* lds */
>      case 0xce: /* into */
> @@ -2491,6 +2490,14 @@ x86_decode_onebyte(
>          state->not_64bit = true;
>          break;
>  
> +    case 0x82: /* Grp1 (x86/32 only) */
> +        state->not_64bit = true;
> +        /* fall through */
> +    case 0x80: case 0x81: case 0x83: /* Grp1 */
> +        if ( (modrm_reg & 7) == 7 ) /* cmp */
> +            state->desc = (state->desc & ByteOp) | DstNone | SrcMem;
> +        break;
> +
>      case 0x90: /* nop / pause */
>          if ( repe_prefix() )
>              ctxt->opcode |= X86EMUL_OPC_F3(0, 0);
> @@ -2521,6 +2528,11 @@ x86_decode_onebyte(
>          imm2 = insn_fetch_type(uint8_t);
>          break;
>  
> +    case 0xf6: case 0xf7: /* Grp3 */
> +        if ( !(modrm_reg & 6) ) /* test */
> +            state->desc = (state->desc & ByteOp) | DstNone | SrcMem;
> +        break;
> +
>      case 0xff: /* Grp5 */
>          switch ( modrm_reg & 7 )
>          {
> @@ -3928,13 +3940,11 @@ x86_emulate(
>          break;
>  
>      case 0x38: case 0x39: cmp: /* cmp reg,mem */
> -        if ( ops->rmw && dst.type == OP_MEM &&
> -             (rc = read_ulong(dst.mem.seg, dst.mem.off, &dst.val,
> -                              dst.bytes, ctxt, ops)) != X86EMUL_OKAY )
> -            goto done;
> -        /* fall through */
> +        emulate_2op_SrcV("cmp", dst, src, _regs.eflags);
> +        dst.type = OP_NONE;
> +        break;
> +
>      case 0x3a ... 0x3d: /* cmp */
> -        generate_exception_if(lock_prefix, EXC_UD);
>          emulate_2op_SrcV("cmp", src, dst, _regs.eflags);
>          dst.type = OP_NONE;
>          break;
> @@ -4239,7 +4249,9 @@ x86_emulate(
>          case 4: goto and;
>          case 5: goto sub;
>          case 6: goto xor;
> -        case 7: goto cmp;
> +        case 7:
> +            dst.val = imm1;
> +            goto cmp;
>          }
>          break;
>  
> @@ -5233,11 +5245,8 @@ x86_emulate(
>              unsigned long u[2], v;
>  
>          case 0 ... 1: /* test */
> -            generate_exception_if(lock_prefix, EXC_UD);
> -            if ( ops->rmw && dst.type == OP_MEM &&
> -                 (rc = read_ulong(dst.mem.seg, dst.mem.off, &dst.val,
> -                                  dst.bytes, ctxt, ops)) != X86EMUL_OKAY )
> -                goto done;
> +            dst.val = imm1;
> +            dst.bytes = src.bytes;
>              goto test;
>          case 2: /* not */
>              if ( ops->rmw && dst.type == OP_MEM )
>


Re: [PATCH v10 2/9] x86emul: rework CMP and TEST emulation
Posted by Jan Beulich 3 years, 11 months ago
On 29.05.2020 14:24, Andrew Cooper wrote:
> On 25/05/2020 15:26, Jan Beulich wrote:
>> Unlike similarly encoded insns these don't write their memory operands,
> 
> "write to their".
> 
>> and hence x86_insn_is_mem_write() should return false for them. However,
>> rather than adding special logic there, rework how their emulation gets
>> done, by making decoding attributes properly describe the r/o nature of
>> their memory operands.
> 
Describe how?  I see you've changed the order of operands encoding, but
> then override it back?

There's no overriding back, I don't think: I change the table entries
for opcodes 0x38 and 0x39, with no other adjustments to the attributes
later on. For the other opcodes I leave the table entries as they are,
and override the attributes for the specific sub-cases (identified by
ModRM.reg).

For opcodes 0x38 and 0x39 the change of the table entries implies
changing the order of operands as passed to emulate_2op_SrcV(), hence
the splitting of the cases in the main switch().
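
I.e., as visible in the patch,

    case 0x38: case 0x39: cmp: /* cmp reg,mem */
        emulate_2op_SrcV("cmp", dst, src, _regs.eflags);
        ...
    case 0x3a ... 0x3d: /* cmp */
        emulate_2op_SrcV("cmp", src, dst, _regs.eflags);

with the memory operand now arriving in src for the 0x38 / 0x39 forms,
the arguments are passed in swapped order there, keeping the computed
flags unaffected.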

I didn't think this was necessary to spell out in the commit message,
but of course I can re-use most of the text above and add it into
there, if you think that would help.

Jan

Re: [PATCH v10 2/9] x86emul: rework CMP and TEST emulation
Posted by Andrew Cooper 3 years, 11 months ago
On 29/05/2020 14:41, Jan Beulich wrote:
> On 29.05.2020 14:24, Andrew Cooper wrote:
>> On 25/05/2020 15:26, Jan Beulich wrote:
>>> Unlike similarly encoded insns these don't write their memory operands,
>> "write to their".
>>
>>> and hence x86_insn_is_mem_write() should return false for them. However,
>>> rather than adding special logic there, rework how their emulation gets
>>> done, by making decoding attributes properly describe the r/o nature of
>>> their memory operands.
>> Describe how?  I see you've changed the order of operands encoding, but
>> then override it back?
> There's no overriding back, I don't think: I change the table entries
> for opcodes 0x38 and 0x39, with no other adjustments to the attributes
> later on. For the other opcodes I leave the table entries as they are,
> and override the attributes for the specific sub-cases (identified by
> ModRM.reg).
>
> For opcodes 0x38 and 0x39 the change of the table entries implies
> changing the order of operands as passed to emulate_2op_SrcV(), hence
> the splitting of the cases in the main switch().
> I didn't think this was necessary to spell out in the commit message,
> but of course I can re-use most of the text above and add it into
> there, if you think that would help.

Yes please.  With something suitable, Acked-by: Andrew Cooper
<andrew.cooper3@citrix.com>

~Andrew

[PATCH v10 3/9] x86emul: also test decoding and mem access / write logic
Posted by Jan Beulich 3 years, 11 months ago
x86_insn_is_mem_{access,write}() (and their interaction with
x86_decode()) have become sufficiently complex that we should have a way
to test this logic. Start by covering legacy encoded GPR insns, with the
exception of a few the main emulator doesn't support yet (left as
comments in the respective tables, or about to be added by subsequent
patches). This has already helped spot a few flaws in said logic,
addressed by (revised) earlier patches.
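
As an example of how to read the new tables: the first entry,

    { { 0x00 }, { 2, 2 }, T, W }, /* add */

declares that opcode 0x00 (add r/m8,reg8) is two bytes long in both 32-
and 64-bit mode, that its register-only ModRM form should be tested as
well (expecting neither a memory access nor a write to be reported), and
that the memory form is expected to report a write.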

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v10: New.

--- a/tools/tests/x86_emulator/Makefile
+++ b/tools/tests/x86_emulator/Makefile
@@ -250,7 +250,7 @@ xop.h avx512f.h: simd-fma.c
 
 endif # 32-bit override
 
-$(TARGET): x86-emulate.o cpuid.o test_x86_emulator.o evex-disp8.o wrappers.o
+$(TARGET): x86-emulate.o cpuid.o test_x86_emulator.o evex-disp8.o predicates.o wrappers.o
 	$(HOSTCC) $(HOSTCFLAGS) -o $@ $^
 
 .PHONY: clean
@@ -289,7 +289,7 @@ x86.h := $(addprefix $(XEN_ROOT)/tools/i
                      cpuid.h cpuid-autogen.h)
 x86_emulate.h := x86-emulate.h x86_emulate/x86_emulate.h $(x86.h)
 
-x86-emulate.o cpuid.o test_x86_emulator.o evex-disp8.o wrappers.o: %.o: %.c $(x86_emulate.h)
+x86-emulate.o cpuid.o test_x86_emulator.o evex-disp8.o predicates.o wrappers.o: %.o: %.c $(x86_emulate.h)
 	$(HOSTCC) $(HOSTCFLAGS) -c -g -o $@ $<
 
 x86-emulate.o: x86_emulate/x86_emulate.c
--- /dev/null
+++ b/tools/tests/x86_emulator/predicates.c
@@ -0,0 +1,671 @@
+#include "x86-emulate.h"
+
+#include <stdio.h>
+
+enum mem_access { mem_none, mem_read, mem_write };
+enum pfx { pfx_none, pfx_66, pfx_f3, pfx_f2 };
+static const uint8_t prefixes[] = { 0x66, 0xf3, 0xf2 };
+
+#define F false
+#define T true
+
+#define N mem_none
+#define R mem_read
+#define W mem_write
+
+/*
+ * ModR/M bytes and immediates don't need spelling out in the opcodes,
+ * unless the implied zeros aren't good enough.
+ */
+static const struct {
+    uint8_t opc[8];
+    uint8_t len[2]; /* 32- and 64-bit mode */
+    bool modrm:1; /* Should register form (also) be tested? */
+    unsigned int mem:2;
+    unsigned int pfx:2;
+#define REG(opc, more...) \
+    { { (opc) | 0 }, more }, /* %?ax */ \
+    { { (opc) | 1 }, more }, /* %?cx */ \
+    { { (opc) | 2 }, more }, /* %?dx */ \
+    { { (opc) | 3 }, more }, /* %?bx */ \
+    { { (opc) | 4 }, more }, /* %?sp */ \
+    { { (opc) | 5 }, more }, /* %?bp */ \
+    { { (opc) | 6 }, more }, /* %?si */ \
+    { { (opc) | 7 }, more }  /* %?di */
+#define CND(opc, more...) \
+    { { (opc) | 0x0 }, more }, /* ..o */ \
+    { { (opc) | 0x1 }, more }, /* ..no */ \
+    { { (opc) | 0x2 }, more }, /* ..c / ..b */ \
+    { { (opc) | 0x3 }, more }, /* ..nc / ..nb */ \
+    { { (opc) | 0x4 }, more }, /* ..z / ..e */ \
+    { { (opc) | 0x5 }, more }, /* ..nz / ..ne */ \
+    { { (opc) | 0x6 }, more }, /* ..be / ..na */ \
+    { { (opc) | 0x7 }, more }, /* ..a / ..nbe */ \
+    { { (opc) | 0x8 }, more }, /* ..s */ \
+    { { (opc) | 0x9 }, more }, /* ..ns */ \
+    { { (opc) | 0xa }, more }, /* ..pe / ..p */ \
+    { { (opc) | 0xb }, more }, /* ..po / ..np */ \
+    { { (opc) | 0xc }, more }, /* ..l / ..nge */ \
+    { { (opc) | 0xd }, more }, /* ..ge / ..nl */ \
+    { { (opc) | 0xe }, more }, /* ..le / ..ng */ \
+    { { (opc) | 0xf }, more }  /* ..g / .. nle */
+} legacy[] = {
+    { { 0x00 }, { 2, 2 }, T, W }, /* add */
+    { { 0x01 }, { 2, 2 }, T, W }, /* add */
+    { { 0x02 }, { 2, 2 }, T, R }, /* add */
+    { { 0x03 }, { 2, 2 }, T, R }, /* add */
+    { { 0x04 }, { 2, 2 }, F, N }, /* add */
+    { { 0x05 }, { 5, 5 }, F, N }, /* add */
+    { { 0x06 }, { 1, 0 }, F, W }, /* push %es */
+    { { 0x07 }, { 1, 0 }, F, R }, /* pop %es */
+    { { 0x08 }, { 2, 2 }, T, W }, /* or */
+    { { 0x09 }, { 2, 2 }, T, W }, /* or */
+    { { 0x0a }, { 2, 2 }, T, R }, /* or */
+    { { 0x0b }, { 2, 2 }, T, R }, /* or */
+    { { 0x0c }, { 2, 2 }, F, N }, /* or */
+    { { 0x0d }, { 5, 5 }, F, N }, /* or */
+    { { 0x0e }, { 1, 0 }, F, W }, /* push %cs */
+    { { 0x10 }, { 2, 2 }, T, W }, /* adc */
+    { { 0x11 }, { 2, 2 }, T, W }, /* adc */
+    { { 0x12 }, { 2, 2 }, T, R }, /* adc */
+    { { 0x13 }, { 2, 2 }, T, R }, /* adc */
+    { { 0x14 }, { 2, 2 }, F, N }, /* adc */
+    { { 0x15 }, { 5, 5 }, F, N }, /* adc */
+    { { 0x16 }, { 1, 0 }, F, W }, /* push %ss */
+    { { 0x17 }, { 1, 0 }, F, R }, /* pop %ss */
+    { { 0x18 }, { 2, 2 }, T, W }, /* sbb */
+    { { 0x19 }, { 2, 2 }, T, W }, /* sbb */
+    { { 0x1a }, { 2, 2 }, T, R }, /* sbb */
+    { { 0x1b }, { 2, 2 }, T, R }, /* sbb */
+    { { 0x1c }, { 2, 2 }, F, N }, /* sbb */
+    { { 0x1d }, { 5, 5 }, F, N }, /* sbb */
+    { { 0x1e }, { 1, 0 }, F, W }, /* push %ds */
+    { { 0x1f }, { 1, 0 }, F, R }, /* pop %ds */
+    { { 0x20 }, { 2, 2 }, T, W }, /* and */
+    { { 0x21 }, { 2, 2 }, T, W }, /* and */
+    { { 0x22 }, { 2, 2 }, T, R }, /* and */
+    { { 0x23 }, { 2, 2 }, T, R }, /* and */
+    { { 0x24 }, { 2, 2 }, F, N }, /* and */
+    { { 0x25 }, { 5, 5 }, F, N }, /* and */
+    { { 0x27 }, { 1, 0 }, F, N }, /* daa */
+    { { 0x28 }, { 2, 2 }, T, W }, /* sub */
+    { { 0x29 }, { 2, 2 }, T, W }, /* sub */
+    { { 0x2a }, { 2, 2 }, T, R }, /* sub */
+    { { 0x2b }, { 2, 2 }, T, R }, /* sub */
+    { { 0x2c }, { 2, 2 }, F, N }, /* sub */
+    { { 0x2d }, { 5, 5 }, F, N }, /* sub */
+    { { 0x2f }, { 1, 0 }, F, N }, /* das */
+    { { 0x30 }, { 2, 2 }, T, W }, /* xor */
+    { { 0x31 }, { 2, 2 }, T, W }, /* xor */
+    { { 0x32 }, { 2, 2 }, T, R }, /* xor */
+    { { 0x33 }, { 2, 2 }, T, R }, /* xor */
+    { { 0x34 }, { 2, 2 }, F, N }, /* xor */
+    { { 0x35 }, { 5, 5 }, F, N }, /* xor */
+    { { 0x37 }, { 1, 0 }, F, N }, /* aaa */
+    { { 0x38 }, { 2, 2 }, T, R }, /* cmp */
+    { { 0x39 }, { 2, 2 }, T, R }, /* cmp */
+    { { 0x3a }, { 2, 2 }, T, R }, /* cmp */
+    { { 0x3b }, { 2, 2 }, T, R }, /* cmp */
+    { { 0x3c }, { 2, 2 }, F, N }, /* cmp */
+    { { 0x3d }, { 5, 5 }, F, N }, /* cmp */
+    { { 0x3f }, { 1, 0 }, F, N }, /* aas */
+    REG(0x40,   { 1, 0 }, F, N ), /* inc */
+    REG(0x48,   { 1, 0 }, F, N ), /* dec */
+    REG(0x50,   { 1, 0 }, F, W ), /* push */
+    REG(0x58,   { 1, 0 }, F, R ), /* pop */
+    { { 0x60 }, { 1, 0 }, F, W }, /* pusha */
+    { { 0x61 }, { 1, 0 }, F, R }, /* popa */
+    { { 0x62 }, { 2, 0 }, F, R }, /* bound */
+    { { 0x63 }, { 2, 0 }, F, W }, /* arpl */
+    { { 0x63 }, { 0, 2 }, F, R }, /* movsxd */
+    { { 0x68 }, { 5, 5 }, F, W }, /* push */
+    { { 0x69 }, { 6, 6 }, T, R }, /* imul */
+    { { 0x6a }, { 2, 2 }, F, W }, /* push */
+    { { 0x6b }, { 3, 3 }, T, R }, /* imul */
+    { { 0x6c }, { 1, 1 }, F, W }, /* ins */
+    { { 0x6d }, { 1, 1 }, F, W }, /* ins */
+    { { 0x6e }, { 1, 1 }, F, R }, /* outs */
+    { { 0x6f }, { 1, 1 }, F, R }, /* outs */
+    CND(0x70,   { 2, 2 }, F, N ), /* j<cc> */
+    { { 0x80, 0x00 }, { 3, 3 }, T, W }, /* add */
+    { { 0x80, 0x08 }, { 3, 3 }, T, W }, /* or */
+    { { 0x80, 0x10 }, { 3, 3 }, T, W }, /* adc */
+    { { 0x80, 0x18 }, { 3, 3 }, T, W }, /* sbb */
+    { { 0x80, 0x20 }, { 3, 3 }, T, W }, /* and */
+    { { 0x80, 0x28 }, { 3, 3 }, T, W }, /* sub */
+    { { 0x80, 0x30 }, { 3, 3 }, T, W }, /* xor */
+    { { 0x80, 0x38 }, { 3, 3 }, T, R }, /* cmp */
+    { { 0x81, 0x00 }, { 6, 6 }, T, W }, /* add */
+    { { 0x81, 0x08 }, { 6, 6 }, T, W }, /* or */
+    { { 0x81, 0x10 }, { 6, 6 }, T, W }, /* adc */
+    { { 0x81, 0x18 }, { 6, 6 }, T, W }, /* sbb */
+    { { 0x81, 0x20 }, { 6, 6 }, T, W }, /* and */
+    { { 0x81, 0x28 }, { 6, 6 }, T, W }, /* sub */
+    { { 0x81, 0x30 }, { 6, 6 }, T, W }, /* xor */
+    { { 0x81, 0x38 }, { 6, 6 }, T, R }, /* cmp */
+    { { 0x82, 0x00 }, { 3, 0 }, T, W }, /* add */
+    { { 0x82, 0x08 }, { 3, 0 }, T, W }, /* or */
+    { { 0x82, 0x10 }, { 3, 0 }, T, W }, /* adc */
+    { { 0x82, 0x18 }, { 3, 0 }, T, W }, /* sbb */
+    { { 0x82, 0x20 }, { 3, 0 }, T, W }, /* and */
+    { { 0x82, 0x28 }, { 3, 0 }, T, W }, /* sub */
+    { { 0x82, 0x30 }, { 3, 0 }, T, W }, /* xor */
+    { { 0x82, 0x38 }, { 3, 0 }, T, R }, /* cmp */
+    { { 0x83, 0x00 }, { 3, 3 }, T, W }, /* add */
+    { { 0x83, 0x08 }, { 3, 3 }, T, W }, /* or */
+    { { 0x83, 0x10 }, { 3, 3 }, T, W }, /* adc */
+    { { 0x83, 0x18 }, { 3, 3 }, T, W }, /* sbb */
+    { { 0x83, 0x20 }, { 3, 3 }, T, W }, /* and */
+    { { 0x83, 0x28 }, { 3, 3 }, T, W }, /* sub */
+    { { 0x83, 0x30 }, { 3, 3 }, T, W }, /* xor */
+    { { 0x83, 0x38 }, { 3, 3 }, T, R }, /* cmp */
+    { { 0x84 }, { 2, 2 }, T, R }, /* test */
+    { { 0x85 }, { 2, 2 }, T, R }, /* test */
+    { { 0x86 }, { 2, 2 }, T, W }, /* xchg */
+    { { 0x87 }, { 2, 2 }, T, W }, /* xchg */
+    { { 0x88 }, { 2, 2 }, T, W }, /* mov */
+    { { 0x89 }, { 2, 2 }, T, W }, /* mov */
+    { { 0x8a }, { 2, 2 }, T, R }, /* mov */
+    { { 0x8b }, { 2, 2 }, T, R }, /* mov */
+    { { 0x8c }, { 2, 2 }, T, W }, /* mov */
+    { { 0x8d }, { 2, 2 }, F, N }, /* lea */
+    { { 0x8e }, { 2, 2 }, T, R }, /* mov */
+    { { 0x8f, 0x00 }, { 2, 2 }, F, W }, /* pop */
+    { { 0x8f, 0xc0 }, { 2, 2 }, F, R }, /* pop */
+    REG(0x90,   { 1, 0 }, F, N ), /* xchg */
+    { { 0x98 }, { 1, 1 }, F, N }, /* cbw */
+    { { 0x99 }, { 1, 1 }, F, N }, /* cwd */
+    { { 0x9a }, { 7, 0 }, F, W }, /* lcall */
+    { { 0x9b }, { 1, 1 }, F, N }, /* wait */
+    { { 0x9c }, { 1, 1 }, F, W }, /* pushf */
+    { { 0x9d }, { 1, 1 }, F, R }, /* popf */
+    { { 0x9e }, { 1, 1 }, F, N }, /* sahf */
+    { { 0x9f }, { 1, 1 }, F, N }, /* lahf */
+    { { 0xa0 }, { 5, 9 }, F, R }, /* mov */
+    { { 0xa1 }, { 5, 9 }, F, R }, /* mov */
+    { { 0xa2 }, { 5, 9 }, F, W }, /* mov */
+    { { 0xa3 }, { 5, 9 }, F, W }, /* mov */
+    { { 0xa4 }, { 1, 1 }, F, W }, /* movs */
+    { { 0xa5 }, { 1, 1 }, F, W }, /* movs */
+    { { 0xa6 }, { 1, 1 }, F, R }, /* cmps */
+    { { 0xa7 }, { 1, 1 }, F, R }, /* cmps */
+    { { 0xa8 }, { 2, 2 }, F, N }, /* test */
+    { { 0xa9 }, { 5, 5 }, F, N }, /* test */
+    { { 0xaa }, { 1, 1 }, F, W }, /* stos */
+    { { 0xab }, { 1, 1 }, F, W }, /* stos */
+    { { 0xac }, { 1, 1 }, F, R }, /* lods */
+    { { 0xad }, { 1, 1 }, F, R }, /* lods */
+    { { 0xae }, { 1, 1 }, F, R }, /* scas */
+    { { 0xaf }, { 1, 1 }, F, R }, /* scas */
+    REG(0xb0,   { 2, 2 }, F, N ), /* mov */
+    REG(0xb8,   { 5, 5 }, F, N ), /* mov */
+    { { 0xc0, 0x00 }, { 3, 3 }, T, W }, /* rol */
+    { { 0xc0, 0x08 }, { 3, 3 }, T, W }, /* ror */
+    { { 0xc0, 0x10 }, { 3, 3 }, T, W }, /* rcl */
+    { { 0xc0, 0x18 }, { 3, 3 }, T, W }, /* rcr */
+    { { 0xc0, 0x20 }, { 3, 3 }, T, W }, /* shl */
+    { { 0xc0, 0x28 }, { 3, 3 }, T, W }, /* shr */
+    { { 0xc0, 0x30 }, { 3, 3 }, T, W }, /* sal */
+    { { 0xc0, 0x38 }, { 3, 3 }, T, W }, /* sar */
+    { { 0xc1, 0x00 }, { 3, 3 }, T, W }, /* rol */
+    { { 0xc1, 0x08 }, { 3, 3 }, T, W }, /* ror */
+    { { 0xc1, 0x10 }, { 3, 3 }, T, W }, /* rcl */
+    { { 0xc1, 0x18 }, { 3, 3 }, T, W }, /* rcr */
+    { { 0xc1, 0x20 }, { 3, 3 }, T, W }, /* shl */
+    { { 0xc1, 0x28 }, { 3, 3 }, T, W }, /* shr */
+    { { 0xc1, 0x30 }, { 3, 3 }, T, W }, /* sal */
+    { { 0xc1, 0x38 }, { 3, 3 }, T, W }, /* sar */
+    { { 0xc2 }, { 3, 3 }, F, R }, /* ret */
+    { { 0xc3 }, { 1, 1 }, F, R }, /* ret */
+    { { 0xc4 }, { 2, 0 }, F, R }, /* les */
+    { { 0xc5 }, { 2, 0 }, F, R }, /* lds */
+    { { 0xc6, 0x00 }, { 3, 3 }, T, W }, /* mov */
+    { { 0xc6, 0xf8 }, { 3, 3 }, F, N }, /* xabort */
+    { { 0xc7, 0x00 }, { 6, 6 }, T, W }, /* mov */
+    { { 0xc7, 0xf8 }, { 6, 6 }, F, N }, /* xbegin */
+    { { 0xc8 }, { 4, 4 }, F, W }, /* enter */
+    { { 0xc9 }, { 1, 1 }, F, R }, /* leave */
+    { { 0xca }, { 3, 3 }, F, R }, /* lret */
+    { { 0xcb }, { 1, 1 }, F, R }, /* lret */
+    { { 0xcc }, { 1, 1 }, F, N }, /* int3 */
+    { { 0xcd }, { 2, 2 }, F, N }, /* int */
+    { { 0xce }, { 1, 0 }, F, N }, /* into */
+    { { 0xcf }, { 1, 1 }, F, N }, /* iret */
+    { { 0xd0, 0x00 }, { 2, 2 }, T, W }, /* rol */
+    { { 0xd0, 0x08 }, { 2, 2 }, T, W }, /* ror */
+    { { 0xd0, 0x10 }, { 2, 2 }, T, W }, /* rcl */
+    { { 0xd0, 0x18 }, { 2, 2 }, T, W }, /* rcr */
+    { { 0xd0, 0x20 }, { 2, 2 }, T, W }, /* shl */
+    { { 0xd0, 0x28 }, { 2, 2 }, T, W }, /* shr */
+    { { 0xd0, 0x30 }, { 2, 2 }, T, W }, /* sal */
+    { { 0xd0, 0x38 }, { 2, 2 }, T, W }, /* sar */
+    { { 0xd1, 0x00 }, { 2, 2 }, T, W }, /* rol */
+    { { 0xd1, 0x08 }, { 2, 2 }, T, W }, /* ror */
+    { { 0xd1, 0x10 }, { 2, 2 }, T, W }, /* rcl */
+    { { 0xd1, 0x18 }, { 2, 2 }, T, W }, /* rcr */
+    { { 0xd1, 0x20 }, { 2, 2 }, T, W }, /* shl */
+    { { 0xd1, 0x28 }, { 2, 2 }, T, W }, /* shr */
+    { { 0xd1, 0x30 }, { 2, 2 }, T, W }, /* sal */
+    { { 0xd1, 0x38 }, { 2, 2 }, T, W }, /* sar */
+    { { 0xd2, 0x00 }, { 2, 2 }, T, W }, /* rol */
+    { { 0xd2, 0x08 }, { 2, 2 }, T, W }, /* ror */
+    { { 0xd2, 0x10 }, { 2, 2 }, T, W }, /* rcl */
+    { { 0xd2, 0x18 }, { 2, 2 }, T, W }, /* rcr */
+    { { 0xd2, 0x20 }, { 2, 2 }, T, W }, /* shl */
+    { { 0xd2, 0x28 }, { 2, 2 }, T, W }, /* shr */
+    { { 0xd2, 0x30 }, { 2, 2 }, T, W }, /* sal */
+    { { 0xd2, 0x38 }, { 2, 2 }, T, W }, /* sar */
+    { { 0xd3, 0x00 }, { 2, 2 }, T, W }, /* rol */
+    { { 0xd3, 0x08 }, { 2, 2 }, T, W }, /* ror */
+    { { 0xd3, 0x10 }, { 2, 2 }, T, W }, /* rcl */
+    { { 0xd3, 0x18 }, { 2, 2 }, T, W }, /* rcr */
+    { { 0xd3, 0x20 }, { 2, 2 }, T, W }, /* shl */
+    { { 0xd3, 0x28 }, { 2, 2 }, T, W }, /* shr */
+    { { 0xd3, 0x30 }, { 2, 2 }, T, W }, /* sal */
+    { { 0xd3, 0x38 }, { 2, 2 }, T, W }, /* sar */
+    { { 0xd4 }, { 2, 0 }, F, N }, /* aam */
+    { { 0xd5 }, { 2, 0 }, F, N }, /* aad */
+    { { 0xd6 }, { 1, 0 }, F, N }, /* salc */
+    { { 0xd7 }, { 1, 1 }, F, R }, /* xlat */
+    { { 0xe0 }, { 2, 2 }, F, N }, /* loopne */
+    { { 0xe1 }, { 2, 2 }, F, N }, /* loope */
+    { { 0xe2 }, { 2, 2 }, F, N }, /* loop */
+    { { 0xe3 }, { 2, 2 }, F, N }, /* j?cxz */
+    { { 0xe4 }, { 2, 2 }, F, N }, /* in */
+    { { 0xe5 }, { 2, 2 }, F, N }, /* in */
+    { { 0xe6 }, { 2, 2 }, F, N }, /* out */
+    { { 0xe7 }, { 2, 2 }, F, N }, /* out */
+    { { 0xe8 }, { 5, 5 }, F, W }, /* call */
+    { { 0xe9 }, { 5, 5 }, F, N }, /* jmp */
+    { { 0xea }, { 7, 0 }, F, N }, /* ljmp */
+    { { 0xeb }, { 2, 2 }, F, N }, /* jmp */
+    { { 0xec }, { 1, 1 }, F, N }, /* in */
+    { { 0xed }, { 1, 1 }, F, N }, /* in */
+    { { 0xee }, { 1, 1 }, F, N }, /* out */
+    { { 0xef }, { 1, 1 }, F, N }, /* out */
+    { { 0xf1 }, { 1, 1 }, F, N }, /* icebp */
+    { { 0xf4 }, { 1, 1 }, F, N }, /* hlt */
+    { { 0xf5 }, { 1, 1 }, F, N }, /* cmc */
+    { { 0xf6, 0x00 }, { 3, 3 }, T, R }, /* test */
+    { { 0xf6, 0x08 }, { 3, 3 }, T, R }, /* test */
+    { { 0xf6, 0x10 }, { 2, 2 }, T, W }, /* not */
+    { { 0xf6, 0x18 }, { 2, 2 }, T, W }, /* neg */
+    { { 0xf6, 0x20 }, { 2, 2 }, T, R }, /* mul */
+    { { 0xf6, 0x28 }, { 2, 2 }, T, R }, /* imul */
+    { { 0xf6, 0x30 }, { 2, 2 }, T, R }, /* div */
+    { { 0xf6, 0x38 }, { 2, 2 }, T, R }, /* idiv */
+    { { 0xf7, 0x00 }, { 6, 6 }, T, R }, /* test */
+    { { 0xf7, 0x08 }, { 6, 6 }, T, R }, /* test */
+    { { 0xf7, 0x10 }, { 2, 2 }, T, W }, /* not */
+    { { 0xf7, 0x18 }, { 2, 2 }, T, W }, /* neg */
+    { { 0xf7, 0x20 }, { 2, 2 }, T, R }, /* mul */
+    { { 0xf7, 0x28 }, { 2, 2 }, T, R }, /* imul */
+    { { 0xf7, 0x30 }, { 2, 2 }, T, R }, /* div */
+    { { 0xf7, 0x38 }, { 2, 2 }, T, R }, /* idiv */
+    { { 0xf8 }, { 1, 1 }, F, N }, /* clc */
+    { { 0xf9 }, { 1, 1 }, F, N }, /* stc */
+    { { 0xfa }, { 1, 1 }, F, N }, /* cli */
+    { { 0xfb }, { 1, 1 }, F, N }, /* sti */
+    { { 0xfc }, { 1, 1 }, F, N }, /* cld */
+    { { 0xfd }, { 1, 1 }, F, N }, /* std */
+    { { 0xfe, 0x00 }, { 2, 2 }, T, W }, /* inc */
+    { { 0xfe, 0x08 }, { 2, 2 }, T, W }, /* dec */
+    { { 0xff, 0x00 }, { 2, 2 }, T, W }, /* inc */
+    { { 0xff, 0x08 }, { 2, 2 }, T, W }, /* dec */
+    { { 0xff, 0x10 }, { 2, 2 }, F, W }, /* call */
+    { { 0xff, 0x18 }, { 2, 2 }, F, W }, /* lcall */
+    { { 0xff, 0x20 }, { 2, 2 }, T, R }, /* jmp */
+    { { 0xff, 0x28 }, { 2, 2 }, F, R }, /* ljmp */
+    { { 0xff, 0x30 }, { 2, 2 }, F, W }, /* push */
+    { { 0xff, 0xd0 }, { 2, 2 }, F, W }, /* call */
+    { { 0xff, 0xf0 }, { 2, 2 }, F, W }, /* push */
+}, legacy_0f[] = {
+    { { 0x00, 0x00 }, { 2, 2 }, T, W }, /* sldt */
+    { { 0x00, 0x08 }, { 2, 2 }, T, W }, /* str */
+    { { 0x00, 0x10 }, { 2, 2 }, T, R }, /* lldt */
+    { { 0x00, 0x18 }, { 2, 2 }, T, R }, /* ltr */
+    { { 0x00, 0x20 }, { 2, 2 }, T, R }, /* verr */
+    { { 0x00, 0x28 }, { 2, 2 }, T, R }, /* verw */
+    { { 0x01, 0x00 }, { 2, 2 }, F, W }, /* sgdt */
+    { { 0x01, 0x08 }, { 2, 2 }, F, W }, /* sidt */
+    { { 0x01, 0x10 }, { 2, 2 }, F, R }, /* lgdt */
+    { { 0x01, 0x18 }, { 2, 2 }, F, R }, /* lidt */
+    { { 0x01, 0x20 }, { 2, 2 }, T, W }, /* smsw */
+    /*{ 0x01, 0x28 }, { 2, 2 }, F, W, pfx_f3 }, rstorssp */
+    { { 0x01, 0x30 }, { 2, 2 }, T, R }, /* lmsw */
+    { { 0x01, 0x38 }, { 2, 2 }, F, N }, /* invlpg */
+    { { 0x01, 0xc0 }, { 2, 2 }, T, N }, /* enclv */
+    { { 0x01, 0xc1 }, { 2, 2 }, T, N }, /* vmcall */
+    /*{ 0x01, 0xc2 }, { 2, 2 }, F, R }, vmlaunch */
+    /*{ 0x01, 0xc3 }, { 2, 2 }, F, R }, vmresume */
+    { { 0x01, 0xc4 }, { 2, 2 }, T, N }, /* vmxoff */
+    { { 0x01, 0xc5 }, { 2, 2 }, T, N }, /* pconfig */
+    { { 0x01, 0xc8 }, { 2, 2 }, T, N }, /* monitor */
+    { { 0x01, 0xc9 }, { 2, 2 }, T, N }, /* mwait */
+    { { 0x01, 0xca }, { 2, 2 }, T, N }, /* clac */
+    { { 0x01, 0xcb }, { 2, 2 }, T, N }, /* stac */
+    { { 0x01, 0xcf }, { 2, 2 }, T, N }, /* encls */
+    { { 0x01, 0xd0 }, { 2, 2 }, T, N }, /* xgetbv */
+    { { 0x01, 0xd1 }, { 2, 2 }, T, N }, /* xsetbv */
+    { { 0x01, 0xd4 }, { 2, 2 }, T, N }, /* vmfunc */
+    { { 0x01, 0xd5 }, { 2, 2 }, T, N }, /* xend */
+    { { 0x01, 0xd6 }, { 2, 2 }, T, N }, /* xtest */
+    { { 0x01, 0xd7 }, { 2, 2 }, T, N }, /* enclu */
+    /*{ 0x01, 0xd8 }, { 2, 2 }, F, R }, vmrun */
+    { { 0x01, 0xd9 }, { 2, 2 }, T, N }, /* vmmcall */
+    { { 0x01, 0xd9 }, { 2, 2 }, T, N, pfx_f3 }, /* vmgexit */
+    { { 0x01, 0xd9 }, { 2, 2 }, T, N, pfx_f2 }, /* vmgexit */
+    /*{ 0x01, 0xda }, { 2, 2 }, F, R }, vmload */
+    /*{ 0x01, 0xdb }, { 2, 2 }, F, W }, vmsave */
+    { { 0x01, 0xdc }, { 2, 2 }, T, N }, /* stgi */
+    { { 0x01, 0xdd }, { 2, 2 }, T, N }, /* clgi */
+    /*{ 0x01, 0xde }, { 2, 2 }, F, R }, skinit */
+    { { 0x01, 0xdf }, { 2, 2 }, T, N }, /* invlpga */
+    { { 0x01, 0xe8 }, { 2, 2 }, T, N }, /* serialize */
+    /*{ 0x01, 0xe8 }, { 2, 2 }, F, W, pfx_f3 }, setssbsy */
+    { { 0x01, 0xe8 }, { 2, 2 }, T, N, pfx_f2 }, /* xsusldtrk */
+    { { 0x01, 0xe9 }, { 2, 2 }, T, N, pfx_f2 }, /* xresldtrk */
+    /*{ 0x01, 0xea }, { 2, 2 }, F, W, pfx_f3 }, saveprevssp */
+    { { 0x01, 0xee }, { 2, 2 }, T, N }, /* rdpkru */
+    { { 0x01, 0xef }, { 2, 2 }, T, N }, /* wrpkru */
+    { { 0x01, 0xf8 }, { 0, 2 }, T, N }, /* swapgs */
+    { { 0x01, 0xf9 }, { 2, 2 }, T, N }, /* rdtscp */
+    { { 0x01, 0xfa }, { 2, 2 }, T, N }, /* monitorx */
+    { { 0x01, 0xfa }, { 2, 2 }, T, N, pfx_f3 }, /* mcommit */
+    { { 0x01, 0xfb }, { 2, 2 }, T, N }, /* mwaitx */
+    { { 0x01, 0xfc }, { 2, 2 }, F, W }, /* clzero */
+    { { 0x01, 0xfd }, { 2, 2 }, T, N }, /* rdpru */
+    { { 0x01, 0xfe }, { 2, 2 }, T, N }, /* invlpgb */
+    { { 0x01, 0xfe }, { 0, 2 }, T, N, pfx_f3 }, /* rmpadjust */
+    { { 0x01, 0xfe }, { 0, 2 }, T, N, pfx_f2 }, /* rmpupdate */
+    { { 0x01, 0xff }, { 2, 2 }, T, N }, /* tlbsync */
+    { { 0x01, 0xff }, { 0, 2 }, T, N, pfx_f3 }, /* psmash */
+    { { 0x01, 0xff }, { 0, 2 }, T, N, pfx_f2 }, /* pvalidate */
+    { { 0x02 }, { 2, 2 }, T, R }, /* lar */
+    { { 0x03 }, { 2, 2 }, T, R }, /* lsl */
+    { { 0x05 }, { 1, 1 }, F, N }, /* syscall */
+    { { 0x06 }, { 1, 1 }, F, N }, /* clts */
+    { { 0x07 }, { 1, 1 }, F, N }, /* sysret */
+    { { 0x08 }, { 1, 1 }, F, N }, /* invd */
+    { { 0x09 }, { 1, 1 }, F, N }, /* wbinvd */
+    { { 0x09 }, { 1, 1 }, F, N, pfx_f3 }, /* wbnoinvd */
+    { { 0x0b }, { 1, 1 }, F, N }, /* ud2 */
+    { { 0x0d, 0x00 }, { 2, 2 }, F, N }, /* prefetch */
+    { { 0x0d, 0x08 }, { 2, 2 }, F, N }, /* prefetchw */
+    { { 0x0e }, { 1, 1 }, F, N }, /* femms */
+    { { 0x18, 0x00 }, { 2, 2 }, F, N }, /* prefetchnta */
+    { { 0x18, 0x08 }, { 2, 2 }, F, N }, /* prefetcht0 */
+    { { 0x18, 0x10 }, { 2, 2 }, F, N }, /* prefetcht1 */
+    { { 0x18, 0x18 }, { 2, 2 }, F, N }, /* prefetcht2 */
+    /*{ 0x1a }, { 2, 2 }, F, R }, bndldx */
+    /*{ 0x1a }, { 2, 2 }, T, R, pfx_66 }, bndmov */
+    { { 0x1a }, { 2, 2 }, T, N, pfx_f3 }, /* bndcl */
+    { { 0x1a }, { 2, 2 }, T, N, pfx_f2 }, /* bndcu */
+    /*{ 0x1b }, { 2, 2 }, F, W }, bndstx */
+    /*{ 0x1b }, { 2, 2 }, T, W, pfx_66 }, bndmov */
+    { { 0x1b }, { 2, 2 }, F, N, pfx_f3 }, /* bndmk */
+    { { 0x1b }, { 2, 2 }, T, N, pfx_f2 }, /* bndcn */
+    { { 0x1c, 0x00 }, { 2, 2 }, F, N }, /* cldemote */
+    { { 0x1e, 0xc8 }, { 2, 2 }, F, N, pfx_f3 }, /* rdssp */
+    { { 0x1e, 0xfa }, { 2, 2 }, F, N, pfx_f3 }, /* endbr64 */
+    { { 0x1e, 0xfb }, { 2, 2 }, F, N, pfx_f3 }, /* endbr32 */
+    { { 0x1f, 0x00 }, { 2, 2 }, T, N }, /* nop */
+    { { 0x20 }, { 2, 2 }, T, N }, /* mov */
+    { { 0x21 }, { 2, 2 }, T, N }, /* mov */
+    { { 0x22 }, { 2, 2 }, T, N }, /* mov */
+    { { 0x23 }, { 2, 2 }, T, N }, /* mov */
+    { { 0x30 }, { 1, 1 }, F, N }, /* wrmsr */
+    { { 0x31 }, { 1, 1 }, F, N }, /* rdtsc */
+    { { 0x32 }, { 1, 1 }, F, N }, /* rdmsr */
+    { { 0x33 }, { 1, 1 }, F, N }, /* rdpmc */
+    { { 0x34 }, { 1, 1 }, F, N }, /* sysenter */
+    { { 0x35 }, { 1, 1 }, F, N }, /* sysexit */
+    CND(0x40,   { 2, 2 }, T, R ), /* cmov<cc> */
+    /*{ 0x78 }, { 2, 2 }, T, W }, vmread */
+    { { 0x79 }, { 2, 2 }, T, R }, /* vmwrite */
+    CND(0x80,   { 5, 5 }, F, N ), /* j<cc> */
+    CND(0x90,   { 2, 2 }, T, W ), /* set<cc> */
+    { { 0xa0 }, { 1, 1 }, F, W }, /* push %fs */
+    { { 0xa1 }, { 1, 1 }, F, R }, /* pop %fs */
+    { { 0xa2 }, { 1, 1 }, F, N }, /* cpuid */
+    { { 0xa3 }, { 2, 2 }, T, R }, /* bt */
+    { { 0xa4 }, { 3, 3 }, T, W }, /* shld */
+    { { 0xa5 }, { 2, 2 }, T, W }, /* shld */
+    { { 0xa8 }, { 1, 1 }, F, W }, /* push %gs */
+    { { 0xa9 }, { 1, 1 }, F, R }, /* pop %gs */
+    { { 0xaa }, { 1, 1 }, F, N }, /* rsm */
+    { { 0xab }, { 2, 2 }, T, W }, /* bts */
+    { { 0xac }, { 3, 3 }, T, W }, /* shrd */
+    { { 0xad }, { 2, 2 }, T, W }, /* shrd */
+    { { 0xae, 0x00 }, { 2, 2 }, F, W }, /* fxsave */
+    { { 0xae, 0x08 }, { 2, 2 }, F, R }, /* fxrstor */
+    { { 0xae, 0x10 }, { 2, 2 }, F, R }, /* ldmxcsr */
+    { { 0xae, 0x18 }, { 2, 2 }, F, W }, /* stmxcsr */
+    { { 0xae, 0x20 }, { 2, 2 }, F, W }, /* xsave */
+    { { 0xae, 0x20 }, { 2, 2 }, F, R, pfx_f3 }, /* ptwrite */
+    { { 0xae, 0x28 }, { 2, 2 }, F, R }, /* xrstor */
+    { { 0xae, 0x30 }, { 2, 2 }, F, W }, /* xsaveopt */
+    { { 0xae, 0x30 }, { 2, 2 }, F, N, pfx_66 }, /* clwb */
+    /*{ 0xae, 0x30 }, { 2, 2 }, F, W, pfx_f3 }, clrssbsy */
+    { { 0xae, 0x38 }, { 2, 2 }, F, N }, /* clflush */
+    { { 0xae, 0x38 }, { 2, 2 }, F, N, pfx_66 }, /* clflushopt */
+    { { 0xae, 0xc0 }, { 0, 2 }, F, N, pfx_f3 }, /* rdfsbase */
+    { { 0xae, 0xc8 }, { 0, 2 }, F, N, pfx_f3 }, /* rdgsbase */
+    { { 0xae, 0xd0 }, { 0, 2 }, F, N, pfx_f3 }, /* wrfsbase */
+    { { 0xae, 0xd8 }, { 0, 2 }, F, N, pfx_f3 }, /* wrgsbase */
+    { { 0xae, 0xe8 }, { 2, 2 }, F, N }, /* lfence */
+    /*{ 0xae, 0xe8 }, { 2, 2 }, F, R, pfx_f3 }, incssp */
+    { { 0xae, 0xf0 }, { 2, 2 }, F, N }, /* mfence */
+    { { 0xae, 0xf0 }, { 2, 2 }, F, N, pfx_66 }, /* tpause */
+    { { 0xae, 0xf0 }, { 2, 2 }, F, N, pfx_f3 }, /* umonitor */
+    { { 0xae, 0xf0 }, { 2, 2 }, F, N, pfx_f2 }, /* umwait */
+    { { 0xae, 0xf8 }, { 2, 2 }, F, N }, /* sfence */
+    { { 0xaf }, { 2, 2 }, T, R }, /* imul */
+    { { 0xb0 }, { 2, 2 }, F, W }, /* cmpxchg */
+    { { 0xb1 }, { 2, 2 }, F, W }, /* cmpxchg */
+    { { 0xb2 }, { 2, 2 }, F, R }, /* lss */
+    { { 0xb3 }, { 2, 2 }, T, W }, /* btr */
+    { { 0xb4 }, { 2, 2 }, F, R }, /* lfs */
+    { { 0xb5 }, { 2, 2 }, F, R }, /* lgs */
+    { { 0xb6 }, { 2, 2 }, F, R }, /* movzx */
+    { { 0xb7 }, { 2, 2 }, F, R }, /* movzx */
+    { { 0xb8 }, { 2, 2 }, F, R }, /* popcnt */
+    { { 0xb9 }, { 2, 2 }, F, N }, /* ud1 */
+    { { 0xba, 0x20 }, { 3, 3 }, T, R }, /* bt */
+    { { 0xba, 0x28 }, { 3, 3 }, T, W }, /* bts */
+    { { 0xba, 0x30 }, { 3, 3 }, T, W }, /* btr */
+    { { 0xba, 0x38 }, { 3, 3 }, T, W }, /* btc */
+    { { 0xbb }, { 2, 2 }, T, W }, /* btc */
+    { { 0xbc }, { 2, 2 }, T, R }, /* bsf */
+    { { 0xbc }, { 2, 2 }, T, R, pfx_f3 }, /* tzcnt */
+    { { 0xbd }, { 2, 2 }, T, R }, /* bsr */
+    { { 0xbd }, { 2, 2 }, T, R, pfx_f3 }, /* lzcnt */
+    { { 0xbe }, { 2, 2 }, F, R }, /* movsx */
+    { { 0xbf }, { 2, 2 }, F, R }, /* movsx */
+    { { 0xc0 }, { 2, 2 }, F, W }, /* xadd */
+    { { 0xc1 }, { 2, 2 }, F, W }, /* xadd */
+    { { 0xc3 }, { 2, 2 }, F, W }, /* movnti */
+    { { 0xc7, 0x08 }, { 2, 2 }, F, W }, /* cmpxchg8b */
+    { { 0xc7, 0x18 }, { 2, 2 }, F, R }, /* xrstors */
+    { { 0xc7, 0x20 }, { 2, 2 }, F, W }, /* xsavec */
+    { { 0xc7, 0x28 }, { 2, 2 }, F, W }, /* xsaves */
+    { { 0xc7, 0x30 }, { 2, 2 }, F, R }, /* vmptrld */
+    { { 0xc7, 0x30 }, { 2, 2 }, F, R, pfx_66 }, /* vmclear */
+    { { 0xc7, 0x30 }, { 2, 2 }, F, R, pfx_f3 }, /* vmxon */
+    { { 0xc7, 0x38 }, { 2, 2 }, F, R }, /* vmptrst */
+    { { 0xc7, 0xf0 }, { 2, 2 }, F, N }, /* rdrand */
+    { { 0xc7, 0xf8 }, { 2, 2 }, F, N }, /* rdseed */
+    { { 0xc7, 0xf8 }, { 2, 2 }, F, N, pfx_f3 }, /* rdpid */
+    REG(0xc8,   { 1, 1 }, F, N ), /* bswap */
+    { { 0xff }, { 2, 2 }, F, N }, /* ud0 */
+}, legacy_0f38[] = {
+    { { 0x80 }, { 2, 2 }, T, R, pfx_66 }, /* invept */
+    { { 0x81 }, { 2, 2 }, T, R, pfx_66 }, /* invvpid */
+    { { 0x82 }, { 2, 2 }, T, R, pfx_66 }, /* invpcid */
+    { { 0xf0 }, { 2, 2 }, T, R }, /* movbe */
+    { { 0xf0 }, { 2, 2 }, T, R, pfx_f2 }, /* crc32 */
+    { { 0xf1 }, { 2, 2 }, T, W }, /* movbe */
+    { { 0xf1 }, { 2, 2 }, T, R, pfx_f2 }, /* crc32 */
+    /*{ 0xf5 }, { 2, 2 }, F, W, pfx_66 }, wruss */
+    /*{ 0xf6 }, { 2, 2 }, F, W }, wrss */
+    { { 0xf6 }, { 2, 2 }, T, R, pfx_66 }, /* adcx */
+    { { 0xf6 }, { 2, 2 }, T, R, pfx_f3 }, /* adox */
+};
+#undef CND
+#undef REG
+#undef F
+#undef N
+#undef R
+#undef T
+#undef W
+
+static unsigned int errors;
+
+static void print_insn(const uint8_t *instr, unsigned int len)
+{
+    if ( !errors++ )
+        puts("");
+    while ( len-- )
+        printf("%02x%c", *instr++, len ? ' ' : ':');
+}
+
+void do_test(uint8_t *instr, unsigned int len, unsigned int modrm,
+             enum mem_access mem, struct x86_emulate_ctxt *ctxt,
+             int (*fetch)(enum x86_segment seg,
+                          unsigned long offset,
+                          void *p_data,
+                          unsigned int bytes,
+                          struct x86_emulate_ctxt *ctxt))
+{
+    struct x86_emulate_state *s;
+
+    if ( !modrm || mem != mem_none )
+    {
+        s = x86_decode_insn(ctxt, fetch);
+
+        if ( x86_insn_length(s, ctxt) != len )
+        {
+            print_insn(instr, len);
+            printf(" length %u (expected %u)\n", x86_insn_length(s, ctxt), len);
+        }
+
+        if ( x86_insn_is_mem_access(s, ctxt) != (mem != mem_none) )
+        {
+            print_insn(instr, len);
+            printf(" mem access %d (expected %d)\n",
+                   x86_insn_is_mem_access(s, ctxt), mem != mem_none);
+        }
+
+        if ( x86_insn_is_mem_write(s, ctxt) != (mem == mem_write) )
+        {
+            print_insn(instr, len);
+            printf(" mem write %d (expected %d)\n",
+                   x86_insn_is_mem_write(s, ctxt), mem == mem_write);
+        }
+
+        x86_emulate_free_state(s);
+    }
+
+    if ( modrm )
+    {
+        instr[modrm] |= 0xc0;
+
+        s = x86_decode_insn(ctxt, fetch);
+
+        if ( x86_insn_length(s, ctxt) != len )
+        {
+            print_insn(instr, len);
+            printf(" length %u (expected %u)\n", x86_insn_length(s, ctxt), len);
+        }
+
+        if ( x86_insn_is_mem_access(s, ctxt) ||
+             x86_insn_is_mem_write(s, ctxt) )
+        {
+            print_insn(instr, len);
+            printf(" mem access %d / write %d unexpected\n",
+                   x86_insn_is_mem_access(s, ctxt),
+                   x86_insn_is_mem_write(s, ctxt));
+        }
+
+        x86_emulate_free_state(s);
+    }
+}
+
+void predicates_test(void *instr, struct x86_emulate_ctxt *ctxt,
+                     int (*fetch)(enum x86_segment seg,
+                                  unsigned long offset,
+                                  void *p_data,
+                                  unsigned int bytes,
+                                  struct x86_emulate_ctxt *ctxt))
+{
+    unsigned int m;
+
+    ctxt->regs->eip = (unsigned long)instr;
+
+    for ( m = 0; m < sizeof(long) / sizeof(int); ++m )
+    {
+        unsigned int t;
+
+        ctxt->addr_size = 32 << m;
+        ctxt->sp_size = 32 << m;
+        ctxt->lma = ctxt->sp_size == 64;
+
+        printf("Testing %u-bit decoding / predicates...", ctxt->sp_size);
+
+        for ( t = 0; t < ARRAY_SIZE(legacy); ++t )
+        {
+            if ( !legacy[t].len[m] )
+                continue;
+
+            assert(!legacy[t].pfx);
+
+            memset(instr + 1, 0xcc, 14);
+            memcpy(instr, legacy[t].opc, legacy[t].len[m]);
+
+            do_test(instr, legacy[t].len[m], legacy[t].modrm, legacy[t].mem,
+                    ctxt, fetch);
+        }
+
+        for ( t = 0; t < ARRAY_SIZE(legacy_0f); ++t )
+        {
+            uint8_t *ptr = instr;
+
+            if ( !legacy_0f[t].len[m] )
+                continue;
+
+            memset(instr + 2, 0xcc, 13);
+            if ( legacy_0f[t].pfx )
+                *ptr++ = prefixes[legacy_0f[t].pfx - 1];
+            *ptr++ = 0x0f;
+            memcpy(ptr, legacy_0f[t].opc, legacy_0f[t].len[m]);
+
+            do_test(instr, legacy_0f[t].len[m] + ((void *)ptr - instr),
+                    legacy_0f[t].modrm ? (void *)ptr - instr + 1 : 0,
+                    legacy_0f[t].mem, ctxt, fetch);
+        }
+
+        for ( t = 0; t < ARRAY_SIZE(legacy_0f38); ++t )
+        {
+            uint8_t *ptr = instr;
+
+            if ( !legacy_0f38[t].len[m] )
+                continue;
+
+            memset(instr + 2, 0xcc, 13);
+            if ( legacy_0f38[t].pfx )
+                *ptr++ = prefixes[legacy_0f38[t].pfx - 1];
+            *ptr++ = 0x0f;
+            *ptr++ = 0x38;
+            memcpy(ptr, legacy_0f38[t].opc, legacy_0f38[t].len[m]);
+
+            do_test(instr, legacy_0f38[t].len[m] + ((void *)ptr - instr),
+                    legacy_0f38[t].modrm ? (void *)ptr - instr + 1 : 0,
+                    legacy_0f38[t].mem, ctxt, fetch);
+        }
+
+        if ( errors )
+            exit(1);
+
+        puts(" okay");
+    }
+}
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -4810,6 +4810,8 @@ int main(int argc, char **argv)
     if ( stack_exec )
         evex_disp8_test(instr, &ctxt, &emulops);
 
+    predicates_test(instr, &ctxt, fetch);
+
     for ( j = 0; j < ARRAY_SIZE(blobs); j++ )
     {
         if ( blobs[j].check_cpu && !blobs[j].check_cpu() )
--- a/tools/tests/x86_emulator/x86-emulate.c
+++ b/tools/tests/x86_emulator/x86-emulate.c
@@ -1,7 +1,13 @@
 #include "x86-emulate.h"
 
+#include <errno.h>
 #include <sys/mman.h>
 
+#define DEFINE_PER_CPU(type, var) type per_cpu_##var
+#define this_cpu(var) per_cpu_##var
+
+#define ERR_PTR(val) NULL
+
 #define cpu_has_amd_erratum(nr) 0
 #define cpu_has_mpx false
 #define read_bndcfgu() 0
--- a/tools/tests/x86_emulator/x86-emulate.h
+++ b/tools/tests/x86_emulator/x86-emulate.h
@@ -101,6 +101,12 @@ WRAP(puts);
 
 void evex_disp8_test(void *instr, struct x86_emulate_ctxt *ctxt,
                      const struct x86_emulate_ops *ops);
+void predicates_test(void *instr, struct x86_emulate_ctxt *ctxt,
+                     int (*fetch)(enum x86_segment seg,
+                                  unsigned long offset,
+                                  void *p_data,
+                                  unsigned int bytes,
+                                  struct x86_emulate_ctxt *ctxt));
 
 static inline uint64_t xgetbv(uint32_t xcr)
 {
--- a/xen/arch/x86/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate.c
@@ -10,6 +10,7 @@
  */
 
 #include <xen/domain_page.h>
+#include <xen/err.h>
 #include <xen/event.h>
 #include <asm/x86_emulate.h>
 #include <asm/processor.h> /* current_cpu_info */
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -11382,10 +11382,6 @@ int x86_emulate_wrapper(
 }
 #endif
 
-#ifdef __XEN__
-
-#include <xen/err.h>
-
 struct x86_emulate_state *
 x86_decode_insn(
     struct x86_emulate_ctxt *ctxt,
@@ -11408,7 +11404,7 @@ x86_decode_insn(
     if ( unlikely(rc != X86EMUL_OKAY) )
         return ERR_PTR(-rc);
 
-#ifndef NDEBUG
+#if defined(__XEN__) && !defined(NDEBUG)
     /*
      * While we avoid memory allocation (by use of per-CPU data) above,
      * nevertheless make sure callers properly release the state structure
@@ -11428,12 +11424,12 @@ x86_decode_insn(
 
 static inline void check_state(const struct x86_emulate_state *state)
 {
-#ifndef NDEBUG
+#if defined(__XEN__) && !defined(NDEBUG)
     ASSERT(state->caller);
 #endif
 }
 
-#ifndef NDEBUG
+#if defined(__XEN__) && !defined(NDEBUG)
 void x86_emulate_free_state(struct x86_emulate_state *state)
 {
     check_state(state);
@@ -11806,5 +11802,3 @@ x86_insn_length(const struct x86_emulate
 
     return state->ip - ctxt->regs->r(ip);
 }
-
-#endif
--- a/xen/arch/x86/x86_emulate/x86_emulate.h
+++ b/xen/arch/x86/x86_emulate/x86_emulate.h
@@ -730,8 +730,6 @@ x86emul_unhandleable_rw(
     unsigned int bytes,
     struct x86_emulate_ctxt *ctxt);
 
-#ifdef __XEN__
-
 struct x86_emulate_state *
 x86_decode_insn(
     struct x86_emulate_ctxt *ctxt,
@@ -767,12 +765,14 @@ bool
 x86_insn_is_cr_access(const struct x86_emulate_state *state,
                       const struct x86_emulate_ctxt *ctxt);
 
-#ifdef NDEBUG
+#if !defined(__XEN__) || defined(NDEBUG)
 static inline void x86_emulate_free_state(struct x86_emulate_state *state) {}
 #else
 void x86_emulate_free_state(struct x86_emulate_state *state);
 #endif
 
+#ifdef __XEN__
+
 int x86emul_read_xcr(unsigned int reg, uint64_t *val,
                      struct x86_emulate_ctxt *ctxt);
 int x86emul_write_xcr(unsigned int reg, uint64_t val,


Re: [PATCH v10 3/9] x86emul: also test decoding and mem access / write logic
Posted by Andrew Cooper 3 years, 11 months ago
On 25/05/2020 15:27, Jan Beulich wrote:
> x86emul_is_mem_{access,write}() (and their interaction with
> x86_decode()) have become sufficiently complex that we should have a way
> to test this logic. Start by covering legacy encoded GPR insns, with the
> exception of a few the main emulator doesn't support yet (left as
> comments in the respective tables, or about to be added by subsequent
> patches). This has already helped spot a few flaws in said logic,
> addressed by (revised) earlier patches.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>

Extra testing is always good.
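
A note for readers skimming the (partial) predicate tables in this patch:
as far as can be read from the harness code, each entry pairs an opcode
suffix with its expected length in 32- and 64-bit mode (counted from the
byte after any prefix/escape, ModRM included), whether a ModRM byte is
present (T/F), and the expected memory behavior (R/W/N). Taking the
table's own BTC entry as an example:

    { { 0xbb }, { 2, 2 }, T, W }, /* btc */

i.e. 0f bb is expected to decode to two further bytes (opcode plus a
zeroed ModRM, yielding a memory operand) in both modes, and to be
classified as a memory write.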

[PATCH v10 4/9] x86emul: disable FPU/MMX/SIMD insn emulation when !HVM
Posted by Jan Beulich 3 years, 11 months ago
In a pure PV environment (the PV shim in particular) we don't really
need emulation of all these. To limit #ifdef-ary utilize some of the
CASE_*() macros we have, by providing variants expanding to
(effectively) nothing (really a label, which in turn requires passing
-Wno-unused-label to the compiler when building such configurations).

Due to the mixture of macro and #ifdef use, the placement of some of
the #ifdef-s is a little arbitrary.

The resulting object file's .text is less than half the size of the
original, and it also looks to compile a little more quickly.

This is meant as a first step; more parts can likely be disabled down
the road.

Suggested-by: Andrew Cooper <andrew.cooper3@citrix.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v7: Integrate into this series. Re-base.
---
I'll be happy to take suggestions allowing to avoid -Wno-unused-label.
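
To illustrate the label trick with a minimal standalone sketch (DEMO_NO_INT,
CASE_DEMO(), and demo() are made-up names, not the emulator's macros): with
the feature class compiled out, the case labels degenerate into a plain
label, keeping the switch arm syntactically valid while leaving the label
unused - hence the warning that -Wno-unused-label suppresses.

    #ifdef DEMO_NO_INT
    # define CASE_DEMO(opc) C_##opc        /* expands to a plain label */
    #else
    # define CASE_DEMO(opc) case (opc)     /* expands to a case label */
    #endif

    int demo(unsigned int b)
    {
        switch ( b )
        {
        CASE_DEMO(0x60):   /* with DEMO_NO_INT: "C_0x60:", never reached */
            return 1;
        }
        return 0;
    }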

--- a/xen/arch/x86/Makefile
+++ b/xen/arch/x86/Makefile
@@ -73,6 +73,9 @@ obj-y += vm_event.o
 obj-y += xstate.o
 extra-y += asm-macros.i
 
+ifneq ($(CONFIG_HVM),y)
+x86_emulate.o: CFLAGS-y += -Wno-unused-label
+endif
 x86_emulate.o: x86_emulate/x86_emulate.c x86_emulate/x86_emulate.h
 
 efi-y := $(shell if [ ! -r $(BASEDIR)/include/xen/compile.h -o \
--- a/xen/arch/x86/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate.c
@@ -43,6 +43,12 @@
     }                                                      \
 })
 
+#ifndef CONFIG_HVM
+# define X86EMUL_NO_FPU
+# define X86EMUL_NO_MMX
+# define X86EMUL_NO_SIMD
+#endif
+
 #include "x86_emulate/x86_emulate.c"
 
 int x86emul_read_xcr(unsigned int reg, uint64_t *val,
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -3506,6 +3506,7 @@ x86_decode(
             op_bytes = 4;
         break;
 
+#ifndef X86EMUL_NO_SIMD
     case simd_packed_int:
         switch ( vex.pfx )
         {
@@ -3571,6 +3572,7 @@ x86_decode(
     case simd_256:
         op_bytes = 32;
         break;
+#endif /* !X86EMUL_NO_SIMD */
 
     default:
         op_bytes = 0;
@@ -3725,6 +3727,7 @@ x86_emulate(
         break;
     }
 
+#ifndef X86EMUL_NO_SIMD
     /* With a memory operand, fetch the mask register in use (if any). */
     if ( ea.type == OP_MEM && evex.opmsk &&
          _get_fpu(fpu_type = X86EMUL_FPU_opmask, ctxt, ops) == X86EMUL_OKAY )
@@ -3755,6 +3758,7 @@ x86_emulate(
         put_fpu(X86EMUL_FPU_opmask, false, state, ctxt, ops);
         fpu_type = X86EMUL_FPU_none;
     }
+#endif /* !X86EMUL_NO_SIMD */
 
     /* Decode (but don't fetch) the destination operand: register or memory. */
     switch ( d & DstMask )
@@ -4400,11 +4404,13 @@ x86_emulate(
         singlestep = _regs.eflags & X86_EFLAGS_TF;
         break;
 
+#ifndef X86EMUL_NO_FPU
     case 0x9b:  /* wait/fwait */
         host_and_vcpu_must_have(fpu);
         get_fpu(X86EMUL_FPU_wait);
         emulate_fpu_insn_stub(b);
         break;
+#endif
 
     case 0x9c: /* pushf */
         if ( (_regs.eflags & X86_EFLAGS_VM) &&
@@ -4814,6 +4820,7 @@ x86_emulate(
         break;
     }
 
+#ifndef X86EMUL_NO_FPU
     case 0xd8: /* FPU 0xd8 */
         host_and_vcpu_must_have(fpu);
         get_fpu(X86EMUL_FPU_fpu);
@@ -5148,6 +5155,7 @@ x86_emulate(
             }
         }
         break;
+#endif /* !X86EMUL_NO_FPU */
 
     case 0xe0 ... 0xe2: /* loop{,z,nz} */ {
         unsigned long count = get_loop_count(&_regs, ad_bytes);
@@ -6124,6 +6132,8 @@ x86_emulate(
     case X86EMUL_OPC(0x0f, 0x19) ... X86EMUL_OPC(0x0f, 0x1f): /* nop */
         break;
 
+#ifndef X86EMUL_NO_MMX
+
     case X86EMUL_OPC(0x0f, 0x0e): /* femms */
         host_and_vcpu_must_have(3dnow);
         asm volatile ( "femms" );
@@ -6144,39 +6154,71 @@ x86_emulate(
         state->simd_size = simd_other;
         goto simd_0f_imm8;
 
-#define CASE_SIMD_PACKED_INT(pfx, opc)       \
+#endif /* !X86EMUL_NO_MMX */
+
+#if !defined(X86EMUL_NO_SIMD) && !defined(X86EMUL_NO_MMX)
+# define CASE_SIMD_PACKED_INT(pfx, opc)      \
     case X86EMUL_OPC(pfx, opc):              \
     case X86EMUL_OPC_66(pfx, opc)
-#define CASE_SIMD_PACKED_INT_VEX(pfx, opc)   \
+#elif !defined(X86EMUL_NO_SIMD)
+# define CASE_SIMD_PACKED_INT(pfx, opc)      \
+    case X86EMUL_OPC_66(pfx, opc)
+#elif !defined(X86EMUL_NO_MMX)
+# define CASE_SIMD_PACKED_INT(pfx, opc)      \
+    case X86EMUL_OPC(pfx, opc)
+#else
+# define CASE_SIMD_PACKED_INT(pfx, opc) C##pfx##_##opc
+#endif
+
+#ifndef X86EMUL_NO_SIMD
+
+# define CASE_SIMD_PACKED_INT_VEX(pfx, opc)  \
     CASE_SIMD_PACKED_INT(pfx, opc):          \
     case X86EMUL_OPC_VEX_66(pfx, opc)
 
-#define CASE_SIMD_ALL_FP(kind, pfx, opc)     \
+# define CASE_SIMD_ALL_FP(kind, pfx, opc)    \
     CASE_SIMD_PACKED_FP(kind, pfx, opc):     \
     CASE_SIMD_SCALAR_FP(kind, pfx, opc)
-#define CASE_SIMD_PACKED_FP(kind, pfx, opc)  \
+# define CASE_SIMD_PACKED_FP(kind, pfx, opc) \
     case X86EMUL_OPC##kind(pfx, opc):        \
     case X86EMUL_OPC##kind##_66(pfx, opc)
-#define CASE_SIMD_SCALAR_FP(kind, pfx, opc)  \
+# define CASE_SIMD_SCALAR_FP(kind, pfx, opc) \
     case X86EMUL_OPC##kind##_F3(pfx, opc):   \
     case X86EMUL_OPC##kind##_F2(pfx, opc)
-#define CASE_SIMD_SINGLE_FP(kind, pfx, opc)  \
+# define CASE_SIMD_SINGLE_FP(kind, pfx, opc) \
     case X86EMUL_OPC##kind(pfx, opc):        \
     case X86EMUL_OPC##kind##_F3(pfx, opc)
 
-#define CASE_SIMD_ALL_FP_VEX(pfx, opc)       \
+# define CASE_SIMD_ALL_FP_VEX(pfx, opc)      \
     CASE_SIMD_ALL_FP(, pfx, opc):            \
     CASE_SIMD_ALL_FP(_VEX, pfx, opc)
-#define CASE_SIMD_PACKED_FP_VEX(pfx, opc)    \
+# define CASE_SIMD_PACKED_FP_VEX(pfx, opc)   \
     CASE_SIMD_PACKED_FP(, pfx, opc):         \
     CASE_SIMD_PACKED_FP(_VEX, pfx, opc)
-#define CASE_SIMD_SCALAR_FP_VEX(pfx, opc)    \
+# define CASE_SIMD_SCALAR_FP_VEX(pfx, opc)   \
     CASE_SIMD_SCALAR_FP(, pfx, opc):         \
     CASE_SIMD_SCALAR_FP(_VEX, pfx, opc)
-#define CASE_SIMD_SINGLE_FP_VEX(pfx, opc)    \
+# define CASE_SIMD_SINGLE_FP_VEX(pfx, opc)   \
     CASE_SIMD_SINGLE_FP(, pfx, opc):         \
     CASE_SIMD_SINGLE_FP(_VEX, pfx, opc)
 
+#else
+
+# define CASE_SIMD_PACKED_INT_VEX(pfx, opc)  \
+    CASE_SIMD_PACKED_INT(pfx, opc)
+
+# define CASE_SIMD_ALL_FP(kind, pfx, opc)    C##kind##pfx##_##opc
+# define CASE_SIMD_PACKED_FP(kind, pfx, opc) Cp##kind##pfx##_##opc
+# define CASE_SIMD_SCALAR_FP(kind, pfx, opc) Cs##kind##pfx##_##opc
+# define CASE_SIMD_SINGLE_FP(kind, pfx, opc) C##kind##pfx##_##opc
+
+# define CASE_SIMD_ALL_FP_VEX(pfx, opc)    CASE_SIMD_ALL_FP(, pfx, opc)
+# define CASE_SIMD_PACKED_FP_VEX(pfx, opc) CASE_SIMD_PACKED_FP(, pfx, opc)
+# define CASE_SIMD_SCALAR_FP_VEX(pfx, opc) CASE_SIMD_SCALAR_FP(, pfx, opc)
+# define CASE_SIMD_SINGLE_FP_VEX(pfx, opc) CASE_SIMD_SINGLE_FP(, pfx, opc)
+
+#endif
+
     CASE_SIMD_SCALAR_FP(, 0x0f, 0x2b):     /* movnts{s,d} xmm,mem */
         host_and_vcpu_must_have(sse4a);
         /* fall through */
@@ -6314,6 +6356,8 @@ x86_emulate(
         insn_bytes = EVEX_PFX_BYTES + 2;
         break;
 
+#ifndef X86EMUL_NO_SIMD
+
     case X86EMUL_OPC_66(0x0f, 0x12):       /* movlpd m64,xmm */
     case X86EMUL_OPC_VEX_66(0x0f, 0x12):   /* vmovlpd m64,xmm,xmm */
     CASE_SIMD_PACKED_FP_VEX(0x0f, 0x13):   /* movlp{s,d} xmm,m64 */
@@ -6420,6 +6464,8 @@ x86_emulate(
         avx512_vlen_check(false);
         goto simd_zmm;
 
+#endif /* !X86EMUL_NO_SIMD */
+
     case X86EMUL_OPC(0x0f, 0x20): /* mov cr,reg */
     case X86EMUL_OPC(0x0f, 0x21): /* mov dr,reg */
     case X86EMUL_OPC(0x0f, 0x22): /* mov reg,cr */
@@ -6446,6 +6492,8 @@ x86_emulate(
             goto done;
         break;
 
+#if !defined(X86EMUL_NO_MMX) && !defined(X86EMUL_NO_SIMD)
+
     case X86EMUL_OPC_66(0x0f, 0x2a):       /* cvtpi2pd mm/m64,xmm */
         if ( ea.type == OP_REG )
         {
@@ -6457,6 +6505,8 @@ x86_emulate(
         op_bytes = (b & 4) && (vex.pfx & VEX_PREFIX_DOUBLE_MASK) ? 16 : 8;
         goto simd_0f_fp;
 
+#endif /* !X86EMUL_NO_MMX && !X86EMUL_NO_SIMD */
+
     CASE_SIMD_SCALAR_FP_VEX(0x0f, 0x2a):   /* {,v}cvtsi2s{s,d} r/m,xmm */
         if ( vex.opcx == vex_none )
         {
@@ -6803,6 +6853,8 @@ x86_emulate(
             dst.val = src.val;
         break;
 
+#ifndef X86EMUL_NO_SIMD
+
     case X86EMUL_OPC_VEX(0x0f, 0x4a):    /* kadd{w,q} k,k,k */
         if ( !vex.w )
             host_and_vcpu_must_have(avx512dq);
@@ -6857,6 +6909,8 @@ x86_emulate(
         generate_exception_if(!vex.l || vex.w, EXC_UD);
         goto opmask_common;
 
+#endif /* X86EMUL_NO_SIMD */
+
     CASE_SIMD_PACKED_FP_VEX(0x0f, 0x50):   /* movmskp{s,d} xmm,reg */
                                            /* vmovmskp{s,d} {x,y}mm,reg */
     CASE_SIMD_PACKED_INT_VEX(0x0f, 0xd7):  /* pmovmskb {,x}mm,reg */
@@ -6940,6 +6994,8 @@ x86_emulate(
                          evex.w);
         goto avx512f_all_fp;
 
+#ifndef X86EMUL_NO_SIMD
+
     CASE_SIMD_PACKED_FP_VEX(0x0f, 0x5b):   /* cvt{ps,dq}2{dq,ps} xmm/mem,xmm */
                                            /* vcvt{ps,dq}2{dq,ps} {x,y}mm/mem,{x,y}mm */
     case X86EMUL_OPC_F3(0x0f, 0x5b):       /* cvttps2dq xmm/mem,xmm */
@@ -6970,6 +7026,8 @@ x86_emulate(
         op_bytes = 16 << evex.lr;
         goto simd_zmm;
 
+#endif /* !X86EMUL_NO_SIMD */
+
     CASE_SIMD_PACKED_INT_VEX(0x0f, 0x60): /* punpcklbw {,x}mm/mem,{,x}mm */
                                           /* vpunpcklbw {x,y}mm/mem,{x,y}mm,{x,y}mm */
     CASE_SIMD_PACKED_INT_VEX(0x0f, 0x61): /* punpcklwd {,x}mm/mem,{,x}mm */
@@ -6996,6 +7054,7 @@ x86_emulate(
                                           /* vpackusbw {x,y}mm/mem,{x,y}mm,{x,y}mm */
     CASE_SIMD_PACKED_INT_VEX(0x0f, 0x6b): /* packsswd {,x}mm/mem,{,x}mm */
                                           /* vpacksswd {x,y}mm/mem,{x,y}mm,{x,y}mm */
+#ifndef X86EMUL_NO_SIMD
     case X86EMUL_OPC_66(0x0f, 0x6c):     /* punpcklqdq xmm/m128,xmm */
     case X86EMUL_OPC_VEX_66(0x0f, 0x6c): /* vpunpcklqdq {x,y}mm/mem,{x,y}mm,{x,y}mm */
     case X86EMUL_OPC_66(0x0f, 0x6d):     /* punpckhqdq xmm/m128,xmm */
@@ -7080,6 +7139,7 @@ x86_emulate(
                                           /* vpsubd {x,y}mm/mem,{x,y}mm,{x,y}mm */
     case X86EMUL_OPC_66(0x0f, 0xfb):     /* psubq xmm/m128,xmm */
     case X86EMUL_OPC_VEX_66(0x0f, 0xfb): /* vpsubq {x,y}mm/mem,{x,y}mm,{x,y}mm */
+#endif /* !X86EMUL_NO_SIMD */
     CASE_SIMD_PACKED_INT_VEX(0x0f, 0xfc): /* paddb {,x}mm/mem,{,x}mm */
                                           /* vpaddb {x,y}mm/mem,{x,y}mm,{x,y}mm */
     CASE_SIMD_PACKED_INT_VEX(0x0f, 0xfd): /* paddw {,x}mm/mem,{,x}mm */
@@ -7087,6 +7147,7 @@ x86_emulate(
     CASE_SIMD_PACKED_INT_VEX(0x0f, 0xfe): /* paddd {,x}mm/mem,{,x}mm */
                                           /* vpaddd {x,y}mm/mem,{x,y}mm,{x,y}mm */
     simd_0f_int:
+#ifndef X86EMUL_NO_SIMD
         if ( vex.opcx != vex_none )
         {
     case X86EMUL_OPC_VEX_66(0x0f38, 0x00): /* vpshufb {x,y}mm/mem,{x,y}mm,{x,y}mm */
@@ -7128,11 +7189,14 @@ x86_emulate(
         }
         if ( vex.pfx )
             goto simd_0f_sse2;
+#endif /* !X86EMUL_NO_SIMD */
     simd_0f_mmx:
         host_and_vcpu_must_have(mmx);
         get_fpu(X86EMUL_FPU_mmx);
         goto simd_0f_common;
 
+#ifndef X86EMUL_NO_SIMD
+
     case X86EMUL_OPC_EVEX_66(0x0f, 0xf6): /* vpsadbw [xyz]mm/mem,[xyz]mm,[xyz]mm */
         generate_exception_if(evex.opmsk, EXC_UD);
         /* fall through */
@@ -7226,6 +7290,8 @@ x86_emulate(
         generate_exception_if(!evex.w, EXC_UD);
         goto avx512f_no_sae;
 
+#endif /* X86EMUL_NO_SIMD */
+
     CASE_SIMD_PACKED_INT_VEX(0x0f, 0x6e): /* mov{d,q} r/m,{,x}mm */
                                           /* vmov{d,q} r/m,xmm */
     CASE_SIMD_PACKED_INT_VEX(0x0f, 0x7e): /* mov{d,q} {,x}mm,r/m */
@@ -7267,6 +7333,8 @@ x86_emulate(
         ASSERT(!state->simd_size);
         break;
 
+#ifndef X86EMUL_NO_SIMD
+
     case X86EMUL_OPC_EVEX_66(0x0f, 0x6e): /* vmov{d,q} r/m,xmm */
     case X86EMUL_OPC_EVEX_66(0x0f, 0x7e): /* vmov{d,q} xmm,r/m */
         generate_exception_if((evex.lr || evex.opmsk || evex.brs ||
@@ -7339,11 +7407,15 @@ x86_emulate(
         d |= TwoOp;
         /* fall through */
     case X86EMUL_OPC_66(0x0f, 0xd6):     /* movq xmm,xmm/m64 */
+#endif /* !X86EMUL_NO_SIMD */
+#ifndef X86EMUL_NO_MMX
     case X86EMUL_OPC(0x0f, 0x6f):        /* movq mm/m64,mm */
     case X86EMUL_OPC(0x0f, 0x7f):        /* movq mm,mm/m64 */
+#endif
         op_bytes = 8;
         goto simd_0f_int;
 
+#ifndef X86EMUL_NO_SIMD
     CASE_SIMD_PACKED_INT_VEX(0x0f, 0x70):/* pshuf{w,d} $imm8,{,x}mm/mem,{,x}mm */
                                          /* vpshufd $imm8,{x,y}mm/mem,{x,y}mm */
     case X86EMUL_OPC_F3(0x0f, 0x70):     /* pshufhw $imm8,xmm/m128,xmm */
@@ -7352,12 +7424,15 @@ x86_emulate(
     case X86EMUL_OPC_VEX_F2(0x0f, 0x70): /* vpshuflw $imm8,{x,y}mm/mem,{x,y}mm */
         d = (d & ~SrcMask) | SrcMem | TwoOp;
         op_bytes = vex.pfx ? 16 << vex.l : 8;
+#endif
     simd_0f_int_imm8:
         if ( vex.opcx != vex_none )
         {
+#ifndef X86EMUL_NO_SIMD
     case X86EMUL_OPC_VEX_66(0x0f3a, 0x0e): /* vpblendw $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f3a, 0x0f): /* vpalignr $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f3a, 0x42): /* vmpsadbw $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+#endif
             if ( vex.l )
             {
     simd_0f_imm8_avx2:
@@ -7365,6 +7440,7 @@ x86_emulate(
             }
             else
             {
+#ifndef X86EMUL_NO_SIMD
     case X86EMUL_OPC_VEX_66(0x0f3a, 0x08): /* vroundps $imm8,{x,y}mm/mem,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f3a, 0x09): /* vroundpd $imm8,{x,y}mm/mem,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f3a, 0x0a): /* vroundss $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
@@ -7372,6 +7448,7 @@ x86_emulate(
     case X86EMUL_OPC_VEX_66(0x0f3a, 0x0c): /* vblendps $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f3a, 0x0d): /* vblendpd $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f3a, 0x40): /* vdpps $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+#endif
     simd_0f_imm8_avx:
                 host_and_vcpu_must_have(avx);
             }
@@ -7405,6 +7482,8 @@ x86_emulate(
         insn_bytes = PFX_BYTES + 3;
         break;
 
+#ifndef X86EMUL_NO_SIMD
+
     case X86EMUL_OPC_EVEX_66(0x0f, 0x70): /* vpshufd $imm8,[xyz]mm/mem,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_F3(0x0f, 0x70): /* vpshufhw $imm8,[xyz]mm/mem,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_F2(0x0f, 0x70): /* vpshuflw $imm8,[xyz]mm/mem,[xyz]mm{k} */
@@ -7463,6 +7542,9 @@ x86_emulate(
         opc[1] = modrm;
         opc[2] = imm1;
         insn_bytes = PFX_BYTES + 3;
+
+#endif /* X86EMUL_NO_SIMD */
+
     simd_0f_reg_only:
         opc[insn_bytes - PFX_BYTES] = 0xc3;
 
@@ -7473,6 +7555,8 @@ x86_emulate(
         ASSERT(!state->simd_size);
         break;
 
+#ifndef X86EMUL_NO_SIMD
+
     case X86EMUL_OPC_EVEX_66(0x0f, 0x71): /* Grp12 */
         switch ( modrm_reg & 7 )
         {
@@ -7504,6 +7588,9 @@ x86_emulate(
         }
         goto unrecognized_insn;
 
+#endif /* !X86EMUL_NO_SIMD */
+#ifndef X86EMUL_NO_MMX
+
     case X86EMUL_OPC(0x0f, 0x73):        /* Grp14 */
         switch ( modrm_reg & 7 )
         {
@@ -7513,6 +7600,9 @@ x86_emulate(
         }
         goto unrecognized_insn;
 
+#endif /* !X86EMUL_NO_MMX */
+#ifndef X86EMUL_NO_SIMD
+
     case X86EMUL_OPC_66(0x0f, 0x73):
     case X86EMUL_OPC_VEX_66(0x0f, 0x73):
         switch ( modrm_reg & 7 )
@@ -7543,7 +7633,12 @@ x86_emulate(
         }
         goto unrecognized_insn;
 
+#endif /* !X86EMUL_NO_SIMD */
+
+#ifndef X86EMUL_NO_MMX
     case X86EMUL_OPC(0x0f, 0x77):        /* emms */
+#endif
+#ifndef X86EMUL_NO_SIMD
     case X86EMUL_OPC_VEX(0x0f, 0x77):    /* vzero{all,upper} */
         if ( vex.opcx != vex_none )
         {
@@ -7589,6 +7684,7 @@ x86_emulate(
 #endif
         }
         else
+#endif /* !X86EMUL_NO_SIMD */
         {
             host_and_vcpu_must_have(mmx);
             get_fpu(X86EMUL_FPU_mmx);
@@ -7602,6 +7698,8 @@ x86_emulate(
         insn_bytes = PFX_BYTES + 1;
         goto simd_0f_reg_only;
 
+#ifndef X86EMUL_NO_SIMD
+
     case X86EMUL_OPC_66(0x0f, 0x78):     /* Grp17 */
         switch ( modrm_reg & 7 )
         {
@@ -7699,6 +7797,8 @@ x86_emulate(
         op_bytes = 8;
         goto simd_zmm;
 
+#endif /* !X86EMUL_NO_SIMD */
+
     case X86EMUL_OPC(0x0f, 0x80) ... X86EMUL_OPC(0x0f, 0x8f): /* jcc (near) */
         if ( test_cc(b, _regs.eflags) )
             jmp_rel((int32_t)src.val);
@@ -7709,6 +7809,8 @@ x86_emulate(
         dst.val = test_cc(b, _regs.eflags);
         break;
 
+#ifndef X86EMUL_NO_SIMD
+
     case X86EMUL_OPC_VEX(0x0f, 0x91):    /* kmov{w,q} k,mem */
     case X86EMUL_OPC_VEX_66(0x0f, 0x91): /* kmov{b,d} k,mem */
         generate_exception_if(ea.type != OP_MEM, EXC_UD);
@@ -7857,6 +7959,8 @@ x86_emulate(
         dst.type = OP_NONE;
         break;
 
+#endif /* !X86EMUL_NO_SIMD */
+
     case X86EMUL_OPC(0x0f, 0xa2): /* cpuid */
         msr_val = 0;
         fail_if(ops->cpuid == NULL);
@@ -7953,6 +8057,7 @@ x86_emulate(
     case X86EMUL_OPC(0x0f, 0xae): case X86EMUL_OPC_66(0x0f, 0xae): /* Grp15 */
         switch ( modrm_reg & 7 )
         {
+#ifndef X86EMUL_NO_SIMD
         case 2: /* ldmxcsr */
             generate_exception_if(vex.pfx, EXC_UD);
             vcpu_must_have(sse);
@@ -7971,6 +8076,7 @@ x86_emulate(
             get_fpu(vex.opcx ? X86EMUL_FPU_ymm : X86EMUL_FPU_xmm);
             asm volatile ( "stmxcsr %0" : "=m" (dst.val) );
             break;
+#endif /* X86EMUL_NO_SIMD */
 
         case 5: /* lfence */
             fail_if(modrm_mod != 3);
@@ -8019,6 +8125,8 @@ x86_emulate(
         }
         break;
 
+#ifndef X86EMUL_NO_SIMD
+
     case X86EMUL_OPC_VEX(0x0f, 0xae): /* Grp15 */
         switch ( modrm_reg & 7 )
         {
@@ -8033,6 +8141,8 @@ x86_emulate(
         }
         goto unrecognized_insn;
 
+#endif /* !X86EMUL_NO_SIMD */
+
     case X86EMUL_OPC_F3(0x0f, 0xae): /* Grp15 */
         fail_if(modrm_mod != 3);
         generate_exception_if((modrm_reg & 4) || !mode_64bit(), EXC_UD);
@@ -8272,6 +8382,8 @@ x86_emulate(
         }
         goto simd_0f_imm8_avx;
 
+#ifndef X86EMUL_NO_SIMD
+
     CASE_SIMD_ALL_FP(_EVEX, 0x0f, 0xc2): /* vcmp{p,s}{s,d} $imm8,[xyz]mm/mem,[xyz]mm,k{k} */
         generate_exception_if((evex.w != (evex.pfx & VEX_PREFIX_DOUBLE_MASK) ||
                                (ea.type != OP_REG && evex.brs &&
@@ -8298,6 +8410,8 @@ x86_emulate(
         insn_bytes = EVEX_PFX_BYTES + 3;
         break;
 
+#endif /* !X86EMUL_NO_SIMD */
+
     case X86EMUL_OPC(0x0f, 0xc3): /* movnti */
         /* Ignore the non-temporal hint for now. */
         vcpu_must_have(sse2);
@@ -8312,6 +8426,8 @@ x86_emulate(
         ea.type = OP_MEM;
         goto simd_0f_int_imm8;
 
+#ifndef X86EMUL_NO_SIMD
+
     case X86EMUL_OPC_EVEX_66(0x0f, 0xc4):   /* vpinsrw $imm8,r32/m16,xmm,xmm */
     case X86EMUL_OPC_EVEX_66(0x0f3a, 0x20): /* vpinsrb $imm8,r32/m8,xmm,xmm */
     case X86EMUL_OPC_EVEX_66(0x0f3a, 0x22): /* vpinsr{d,q} $imm8,r/m,xmm,xmm */
@@ -8329,6 +8445,8 @@ x86_emulate(
         state->simd_size = simd_other;
         goto avx512f_imm8_no_sae;
 
+#endif /* !X86EMUL_NO_SIMD */
+
     CASE_SIMD_PACKED_INT_VEX(0x0f, 0xc5):  /* pextrw $imm8,{,x}mm,reg */
                                            /* vpextrw $imm8,xmm,reg */
         generate_exception_if(vex.l, EXC_UD);
@@ -8344,6 +8462,8 @@ x86_emulate(
         insn_bytes = PFX_BYTES + 3;
         goto simd_0f_to_gpr;
 
+#ifndef X86EMUL_NO_SIMD
+
     CASE_SIMD_PACKED_FP(_EVEX, 0x0f, 0xc6): /* vshufp{s,d} $imm8,[xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
         generate_exception_if(evex.w != (evex.pfx & VEX_PREFIX_DOUBLE_MASK),
                               EXC_UD);
@@ -8358,6 +8478,8 @@ x86_emulate(
         avx512_vlen_check(false);
         goto simd_imm8_zmm;
 
+#endif /* X86EMUL_NO_SIMD */
+
     case X86EMUL_OPC(0x0f, 0xc7): /* Grp9 */
     {
         union {
@@ -8548,6 +8670,8 @@ x86_emulate(
         }
         break;
 
+#ifndef X86EMUL_NO_SIMD
+
     case X86EMUL_OPC_EVEX_66(0x0f, 0xd2): /* vpsrld xmm/m128,[xyz]mm,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f, 0xd3): /* vpsrlq xmm/m128,[xyz]mm,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f, 0xe2): /* vpsra{d,q} xmm/m128,[xyz]mm,[xyz]mm{k} */
@@ -8569,12 +8693,18 @@ x86_emulate(
         generate_exception_if(evex.w != (b & 1), EXC_UD);
         goto avx512f_no_sae;
 
+#endif /* !X86EMUL_NO_SIMD */
+#ifndef X86EMUL_NO_MMX
+
     case X86EMUL_OPC(0x0f, 0xd4):        /* paddq mm/m64,mm */
     case X86EMUL_OPC(0x0f, 0xf4):        /* pmuludq mm/m64,mm */
     case X86EMUL_OPC(0x0f, 0xfb):        /* psubq mm/m64,mm */
         vcpu_must_have(sse2);
         goto simd_0f_mmx;
 
+#endif /* !X86EMUL_NO_MMX */
+#if !defined(X86EMUL_NO_MMX) && !defined(X86EMUL_NO_SIMD)
+
     case X86EMUL_OPC_F3(0x0f, 0xd6):     /* movq2dq mm,xmm */
     case X86EMUL_OPC_F2(0x0f, 0xd6):     /* movdq2q xmm,mm */
         generate_exception_if(ea.type != OP_REG, EXC_UD);
@@ -8582,6 +8712,9 @@ x86_emulate(
         host_and_vcpu_must_have(mmx);
         goto simd_0f_int;
 
+#endif /* !X86EMUL_NO_MMX && !X86EMUL_NO_SIMD */
+#ifndef X86EMUL_NO_MMX
+
     case X86EMUL_OPC(0x0f, 0xe7):        /* movntq mm,m64 */
         generate_exception_if(ea.type != OP_MEM, EXC_UD);
         sfence = true;
@@ -8597,6 +8730,9 @@ x86_emulate(
         vcpu_must_have(mmxext);
         goto simd_0f_mmx;
 
+#endif /* !X86EMUL_NO_MMX */
+#ifndef X86EMUL_NO_SIMD
+
     case X86EMUL_OPC_EVEX_66(0x0f, 0xda): /* vpminub [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f, 0xde): /* vpmaxub [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f, 0xe4): /* vpmulhuw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
@@ -8617,6 +8753,8 @@ x86_emulate(
         op_bytes = 8 << (!!(vex.pfx & VEX_PREFIX_DOUBLE_MASK) + vex.l);
         goto simd_0f_cvt;
 
+#endif /* !X86EMUL_NO_SIMD */
+
     CASE_SIMD_PACKED_INT_VEX(0x0f, 0xf7): /* {,v}maskmov{q,dqu} {,x}mm,{,x}mm */
         generate_exception_if(ea.type != OP_REG, EXC_UD);
         if ( vex.opcx != vex_none )
@@ -8720,6 +8858,8 @@ x86_emulate(
         insn_bytes = PFX_BYTES + 3;
         break;
 
+#ifndef X86EMUL_NO_SIMD
+
     case X86EMUL_OPC_VEX_66(0x0f38, 0x19): /* vbroadcastsd xmm/m64,ymm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0x1a): /* vbroadcastf128 m128,ymm */
         generate_exception_if(!vex.l, EXC_UD);
@@ -9302,6 +9442,8 @@ x86_emulate(
         ASSERT(!state->simd_size);
         break;
 
+#endif /* !X86EMUL_NO_SIMD */
+
     case X86EMUL_OPC_66(0x0f38, 0x82): /* invpcid reg,m128 */
         vcpu_must_have(invpcid);
         generate_exception_if(ea.type != OP_MEM, EXC_UD);
@@ -9344,6 +9486,8 @@ x86_emulate(
         state->simd_size = simd_none;
         break;
 
+#ifndef X86EMUL_NO_SIMD
+
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x83): /* vpmultishiftqb [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
         generate_exception_if(!evex.w, EXC_UD);
         host_and_vcpu_must_have(avx512_vbmi);
@@ -9907,6 +10051,8 @@ x86_emulate(
         generate_exception_if(evex.brs || evex.opmsk, EXC_UD);
         goto avx512f_no_sae;
 
+#endif /* !X86EMUL_NO_SIMD */
+
     case X86EMUL_OPC(0x0f38, 0xf0): /* movbe m,r */
     case X86EMUL_OPC(0x0f38, 0xf1): /* movbe r,m */
         vcpu_must_have(movbe);
@@ -10072,6 +10218,8 @@ x86_emulate(
                             : "0" ((uint32_t)src.val), "rm" (_regs.edx) );
         break;
 
+#ifndef X86EMUL_NO_SIMD
+
     case X86EMUL_OPC_VEX_66(0x0f3a, 0x00): /* vpermq $imm8,ymm/m256,ymm */
     case X86EMUL_OPC_VEX_66(0x0f3a, 0x01): /* vpermpd $imm8,ymm/m256,ymm */
         generate_exception_if(!vex.l || !vex.w, EXC_UD);
@@ -10132,6 +10280,8 @@ x86_emulate(
         avx512_vlen_check(b & 2);
         goto simd_imm8_zmm;
 
+#endif /* X86EMUL_NO_SIMD */
+
     CASE_SIMD_PACKED_INT(0x0f3a, 0x0f): /* palignr $imm8,{,x}mm/mem,{,x}mm */
         host_and_vcpu_must_have(ssse3);
         if ( vex.pfx )
@@ -10159,6 +10309,8 @@ x86_emulate(
         insn_bytes = PFX_BYTES + 4;
         break;
 
+#ifndef X86EMUL_NO_SIMD
+
     case X86EMUL_OPC_EVEX_66(0x0f3a, 0x42): /* vdbpsadbw $imm8,[xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
         generate_exception_if(evex.w, EXC_UD);
         /* fall through */
@@ -10657,6 +10809,8 @@ x86_emulate(
         generate_exception_if(vex.l, EXC_UD);
         goto simd_0f_imm8_avx;
 
+#endif /* X86EMUL_NO_SIMD */
+
     case X86EMUL_OPC_VEX_F2(0x0f3a, 0xf0): /* rorx imm,r/m,r */
         vcpu_must_have(bmi2);
         generate_exception_if(vex.l || vex.reg != 0xf, EXC_UD);
@@ -10671,6 +10825,8 @@ x86_emulate(
             asm ( "rorl %b1,%k0" : "=g" (dst.val) : "c" (imm1), "0" (src.val) );
         break;
 
+#ifndef X86EMUL_NO_SIMD
+
     case X86EMUL_OPC_XOP(08, 0x85): /* vpmacssww xmm,xmm/m128,xmm,xmm */
     case X86EMUL_OPC_XOP(08, 0x86): /* vpmacsswd xmm,xmm/m128,xmm,xmm */
     case X86EMUL_OPC_XOP(08, 0x87): /* vpmacssdql xmm,xmm/m128,xmm,xmm */
@@ -10706,6 +10862,8 @@ x86_emulate(
         host_and_vcpu_must_have(xop);
         goto simd_0f_imm8_ymm;
 
+#endif /* X86EMUL_NO_SIMD */
+
     case X86EMUL_OPC_XOP(09, 0x01): /* XOP Grp1 */
         switch ( modrm_reg & 7 )
         {
@@ -10765,6 +10923,8 @@ x86_emulate(
         }
         goto unrecognized_insn;
 
+#ifndef X86EMUL_NO_SIMD
+
     case X86EMUL_OPC_XOP(09, 0x82): /* vfrczss xmm/m128,xmm */
     case X86EMUL_OPC_XOP(09, 0x83): /* vfrczsd xmm/m128,xmm */
         generate_exception_if(vex.l, EXC_UD);
@@ -10820,6 +10980,8 @@ x86_emulate(
         host_and_vcpu_must_have(xop);
         goto simd_0f_ymm;
 
+#endif /* X86EMUL_NO_SIMD */
+
     case X86EMUL_OPC_XOP(0a, 0x10): /* bextr imm,r/m,r */
     {
         uint8_t *buf = get_stub(stub);


Re: [PATCH v10 4/9] x86emul: disable FPU/MMX/SIMD insn emulation when !HVM
Posted by Andrew Cooper 3 years, 11 months ago
On 25/05/2020 15:27, Jan Beulich wrote:
> In a pure PV environment (the PV shim in particular) we don't really
> need emulation of all these. To limit #ifdef-ary utilize some of the
> CASE_*() macros we have, by providing variants expanding to
> (effectively) nothing (really a label, which in turn requires passing
> -Wno-unused-label to the compiler when building such configurations).
>
> Due to the mixture of macro and #ifdef use, the placement of some of
> the #ifdef-s is a little arbitrary.
>
> The resulting object file's .text is less than half the size of the
> original, and it also looks to compile a little more quickly.
>
> This is meant as a first step; more parts can likely be disabled down
> the road.
>
> Suggested-by: Andrew Cooper <andrew.cooper3@citrix.com>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> v7: Integrate into this series. Re-base.
> ---
> I'll be happy to take suggestions allowing to avoid -Wno-unused-label.

I really would prefer the simplified version, which doesn't need
-Wno-unused-label at all.

I specifically don't see a need for these to be selected individually,
and a consequence of that is a vastly simplified patch.

However, to avoid this stalemate, Begrudgingly-acked-by: Andrew Cooper
<andrew.cooper3@citrix.com> if you still insist on taking this route.

[PATCH v10 5/9] x86emul: support MOVDIR{I,64B} insns
Posted by Jan Beulich 3 years, 11 months ago
Introduce a new blk() hook, paralleling the rmw() one in a certain way,
but being intended for larger data sizes, and hence its HVM intermediate
handling function doesn't fall back to splitting the operation if the
requested virtual address can't be mapped.

Note that SDM revision 071 doesn't specify exception behavior for
ModRM.mod == 0b11; assuming #UD here.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Paul Durrant <paul@xen.org>
---
v10: Re-base over changes earlier in the series as well as later parts
     committed already.
v9: Fold in "x86/HVM: make hvmemul_blk() capable of handling r/o
    operations". Also adjust x86_insn_is_mem_write().
v7: Add blk_NONE. Move the harness's setting of .blk. Correct indentation.
    Re-base.
v6: Fold MOVDIRI and MOVDIR64B changes again. Use blk() for both. All
    tags dropped.
v5: Introduce/use ->blk() hook. Correct asm() operands.
v4: Split MOVDIRI and MOVDIR64B and move this one ahead. Re-base.
v3: Update description.
---
(SDE: -tnt)
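
As a sketch of what a minimal ->blk() hook can look like for a caller whose
guest addresses are directly dereferencable (flat_map() / flat_unmap() are
hypothetical helpers; real implementations, like hvmemul_blk() below, also
derive PFEC bits and handle mapping failures): the hook's main job is to
provide a suitably mapped pointer - the actual access is then carried out
by x86_emul_blk() itself, using the real insn where available.

    static int my_blk(enum x86_segment seg, unsigned long offset,
                      void *p_data, unsigned int bytes, uint32_t *eflags,
                      struct x86_emulate_state *state,
                      struct x86_emulate_ctxt *ctxt)
    {
        void *mapping = flat_map(seg, offset, bytes);  /* hypothetical */
        int rc;

        if ( !mapping )
            return X86EMUL_UNHANDLEABLE;

        /* The emulator core performs the actual (e.g. 64-byte) access. */
        rc = x86_emul_blk(mapping, p_data, bytes, eflags, state, ctxt);

        flat_unmap(mapping, bytes);                    /* hypothetical */
        return rc;
    }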

--- a/tools/tests/x86_emulator/predicates.c
+++ b/tools/tests/x86_emulator/predicates.c
@@ -510,6 +510,8 @@ static const struct {
     /*{ 0xf6 }, { 2, 2 }, F, W }, wrss */
     { { 0xf6 }, { 2, 2 }, T, R, pfx_66 }, /* adcx */
     { { 0xf6 }, { 2, 2 }, T, R, pfx_f3 }, /* adox */
+    { { 0xf8 }, { 2, 2 }, F, W, pfx_66 }, /* movdir64b */
+    { { 0xf9 }, { 2, 2 }, F, W }, /* movdiri */
 };
 #undef CND
 #undef REG
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -652,6 +652,18 @@ static int cmpxchg(
     return X86EMUL_OKAY;
 }
 
+static int blk(
+    enum x86_segment seg,
+    unsigned long offset,
+    void *p_data,
+    unsigned int bytes,
+    uint32_t *eflags,
+    struct x86_emulate_state *state,
+    struct x86_emulate_ctxt *ctxt)
+{
+    return x86_emul_blk((void *)offset, p_data, bytes, eflags, state, ctxt);
+}
+
 static int read_segment(
     enum x86_segment seg,
     struct segment_register *reg,
@@ -721,6 +733,7 @@ static struct x86_emulate_ops emulops =
     .insn_fetch = fetch,
     .write      = write,
     .cmpxchg    = cmpxchg,
+    .blk        = blk,
     .read_segment = read_segment,
     .cpuid      = emul_test_cpuid,
     .read_cr    = emul_test_read_cr,
@@ -2339,6 +2352,50 @@ int main(int argc, char **argv)
         goto fail;
     printf("okay\n");
 
+    printf("%-40s", "Testing movdiri %edx,(%ecx)...");
+    if ( stack_exec && cpu_has_movdiri )
+    {
+        instr[0] = 0x0f; instr[1] = 0x38; instr[2] = 0xf9; instr[3] = 0x11;
+
+        regs.eip = (unsigned long)&instr[0];
+        regs.ecx = (unsigned long)memset(res, -1, 16);
+        regs.edx = 0x44332211;
+
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( (rc != X86EMUL_OKAY) ||
+             (regs.eip != (unsigned long)&instr[4]) ||
+             res[0] != 0x44332211 || ~res[1] )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
+    printf("%-40s", "Testing movdir64b 144(%edx),%ecx...");
+    if ( stack_exec && cpu_has_movdir64b )
+    {
+        instr[0] = 0x66; instr[1] = 0x0f; instr[2] = 0x38; instr[3] = 0xf8;
+        instr[4] = 0x8a; instr[5] = 0x90; instr[8] = instr[7] = instr[6] = 0;
+
+        regs.eip = (unsigned long)&instr[0];
+        for ( i = 0; i < 64; ++i )
+            res[i] = i - 20;
+        regs.edx = (unsigned long)res;
+        regs.ecx = (unsigned long)(res + 16);
+
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( (rc != X86EMUL_OKAY) ||
+             (regs.eip != (unsigned long)&instr[9]) ||
+             res[15] != -5 || res[32] != 12 )
+            goto fail;
+        for ( i = 16; i < 32; ++i )
+            if ( res[i] != i )
+                goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
     printf("%-40s", "Testing movq %mm3,(%ecx)...");
     if ( stack_exec && cpu_has_mmx )
     {
--- a/tools/tests/x86_emulator/x86-emulate.h
+++ b/tools/tests/x86_emulator/x86-emulate.h
@@ -160,6 +160,8 @@ static inline bool xcr0_mask(uint64_t ma
 #define cpu_has_avx512_vnni (cp.feat.avx512_vnni && xcr0_mask(0xe6))
 #define cpu_has_avx512_bitalg (cp.feat.avx512_bitalg && xcr0_mask(0xe6))
 #define cpu_has_avx512_vpopcntdq (cp.feat.avx512_vpopcntdq && xcr0_mask(0xe6))
+#define cpu_has_movdiri    cp.feat.movdiri
+#define cpu_has_movdir64b  cp.feat.movdir64b
 #define cpu_has_avx512_4vnniw (cp.feat.avx512_4vnniw && xcr0_mask(0xe6))
 #define cpu_has_avx512_4fmaps (cp.feat.avx512_4fmaps && xcr0_mask(0xe6))
 #define cpu_has_serialize  cp.feat.serialize
--- a/xen/arch/x86/arch.mk
+++ b/xen/arch/x86/arch.mk
@@ -47,6 +47,7 @@ $(call as-option-add,CFLAGS,CC,"rdseed %
 $(call as-option-add,CFLAGS,CC,"clwb (%rax)",-DHAVE_AS_CLWB)
 $(call as-option-add,CFLAGS,CC,".equ \"x\"$$(comma)1",-DHAVE_AS_QUOTED_SYM)
 $(call as-option-add,CFLAGS,CC,"invpcid (%rax)$$(comma)%rax",-DHAVE_AS_INVPCID)
+$(call as-option-add,CFLAGS,CC,"movdiri %rax$$(comma)(%rax)",-DHAVE_AS_MOVDIR)
 
 # GAS's idea of true is -1.  Clang's idea is 1
 $(call as-option-add,CFLAGS,CC,\
--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -1441,6 +1441,47 @@ static int hvmemul_rmw(
     return rc;
 }
 
+static int hvmemul_blk(
+    enum x86_segment seg,
+    unsigned long offset,
+    void *p_data,
+    unsigned int bytes,
+    uint32_t *eflags,
+    struct x86_emulate_state *state,
+    struct x86_emulate_ctxt *ctxt)
+{
+    struct hvm_emulate_ctxt *hvmemul_ctxt =
+        container_of(ctxt, struct hvm_emulate_ctxt, ctxt);
+    unsigned long addr;
+    uint32_t pfec = PFEC_page_present;
+    int rc;
+    void *mapping = NULL;
+
+    rc = hvmemul_virtual_to_linear(
+        seg, offset, bytes, NULL, hvm_access_write, hvmemul_ctxt, &addr);
+    if ( rc != X86EMUL_OKAY || !bytes )
+        return rc;
+
+    if ( x86_insn_is_mem_write(state, ctxt) )
+        pfec |= PFEC_write_access;
+
+    if ( is_x86_system_segment(seg) )
+        pfec |= PFEC_implicit;
+    else if ( hvmemul_ctxt->seg_reg[x86_seg_ss].dpl == 3 )
+        pfec |= PFEC_user_mode;
+
+    mapping = hvmemul_map_linear_addr(addr, bytes, pfec, hvmemul_ctxt);
+    if ( IS_ERR(mapping) )
+        return ~PTR_ERR(mapping);
+    if ( !mapping )
+        return X86EMUL_UNHANDLEABLE;
+
+    rc = x86_emul_blk(mapping, p_data, bytes, eflags, state, ctxt);
+    hvmemul_unmap_linear_addr(mapping, addr, bytes, hvmemul_ctxt);
+
+    return rc;
+}
+
 static int hvmemul_write_discard(
     enum x86_segment seg,
     unsigned long offset,
@@ -2518,6 +2559,7 @@ static const struct x86_emulate_ops hvm_
     .write         = hvmemul_write,
     .rmw           = hvmemul_rmw,
     .cmpxchg       = hvmemul_cmpxchg,
+    .blk           = hvmemul_blk,
     .validate      = hvmemul_validate,
     .rep_ins       = hvmemul_rep_ins,
     .rep_outs      = hvmemul_rep_outs,
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -548,6 +548,8 @@ static const struct ext0f38_table {
     [0xf1] = { .to_mem = 1, .two_op = 1 },
     [0xf2 ... 0xf3] = {},
     [0xf5 ... 0xf7] = {},
+    [0xf8] = { .simd_size = simd_other },
+    [0xf9] = { .to_mem = 1, .two_op = 1 /* Mov */ },
 };
 
 /* Shift values between src and dst sizes of pmov{s,z}x{b,w,d}{w,d,q}. */
@@ -851,6 +853,10 @@ struct x86_emulate_state {
         rmw_xchg,
         rmw_xor,
     } rmw;
+    enum {
+        blk_NONE,
+        blk_movdir,
+    } blk;
     uint8_t modrm, modrm_mod, modrm_reg, modrm_rm;
     uint8_t sib_index, sib_scale;
     uint8_t rex_prefix;
@@ -1915,6 +1921,8 @@ amd_like(const struct x86_emulate_ctxt *
 #define vcpu_has_avx512_vpopcntdq() (ctxt->cpuid->feat.avx512_vpopcntdq)
 #define vcpu_has_tsxldtrk()    (ctxt->cpuid->feat.tsxldtrk)
 #define vcpu_has_rdpid()       (ctxt->cpuid->feat.rdpid)
+#define vcpu_has_movdiri()     (ctxt->cpuid->feat.movdiri)
+#define vcpu_has_movdir64b()   (ctxt->cpuid->feat.movdir64b)
 #define vcpu_has_avx512_4vnniw() (ctxt->cpuid->feat.avx512_4vnniw)
 #define vcpu_has_avx512_4fmaps() (ctxt->cpuid->feat.avx512_4fmaps)
 #define vcpu_has_serialize()   (ctxt->cpuid->feat.serialize)
@@ -2736,10 +2744,12 @@ x86_decode_0f38(
     {
     case 0x00 ... 0xef:
     case 0xf2 ... 0xf5:
-    case 0xf7 ... 0xff:
+    case 0xf7 ... 0xf8:
+    case 0xfa ... 0xff:
         op_bytes = 0;
         /* fall through */
     case 0xf6: /* adcx / adox */
+    case 0xf9: /* movdiri */
         ctxt->opcode |= MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
         break;
 
@@ -10218,6 +10228,34 @@ x86_emulate(
                             : "0" ((uint32_t)src.val), "rm" (_regs.edx) );
         break;
 
+    case X86EMUL_OPC_66(0x0f38, 0xf8): /* movdir64b r,m512 */
+        host_and_vcpu_must_have(movdir64b);
+        generate_exception_if(ea.type != OP_MEM, EXC_UD);
+        src.val = truncate_ea(*dst.reg);
+        generate_exception_if(!is_aligned(x86_seg_es, src.val, 64, ctxt, ops),
+                              EXC_GP, 0);
+        fail_if(!ops->blk);
+        state->blk = blk_movdir;
+        BUILD_BUG_ON(sizeof(*mmvalp) < 64);
+        if ( (rc = ops->read(ea.mem.seg, ea.mem.off, mmvalp, 64,
+                             ctxt)) != X86EMUL_OKAY ||
+             (rc = ops->blk(x86_seg_es, src.val, mmvalp, 64, &_regs.eflags,
+                            state, ctxt)) != X86EMUL_OKAY )
+            goto done;
+        state->simd_size = simd_none;
+        break;
+
+    case X86EMUL_OPC(0x0f38, 0xf9): /* movdiri mem,r */
+        host_and_vcpu_must_have(movdiri);
+        generate_exception_if(dst.type != OP_MEM, EXC_UD);
+        fail_if(!ops->blk);
+        state->blk = blk_movdir;
+        if ( (rc = ops->blk(dst.mem.seg, dst.mem.off, &src.val, op_bytes,
+                            &_regs.eflags, state, ctxt)) != X86EMUL_OKAY )
+            goto done;
+        dst.type = OP_NONE;
+        break;
+
 #ifndef X86EMUL_NO_SIMD
 
     case X86EMUL_OPC_VEX_66(0x0f3a, 0x00): /* vpermq $imm8,ymm/m256,ymm */
@@ -11477,6 +11515,77 @@ int x86_emul_rmw(
     return X86EMUL_OKAY;
 }
 
+int x86_emul_blk(
+    void *ptr,
+    void *data,
+    unsigned int bytes,
+    uint32_t *eflags,
+    struct x86_emulate_state *state,
+    struct x86_emulate_ctxt *ctxt)
+{
+    switch ( state->blk )
+    {
+        /*
+         * Throughout this switch(), memory clobbers are used to compensate
+         * for other operands possibly not expressing the (full) memory
+         * ranges covered.
+         */
+    case blk_movdir:
+        switch ( bytes )
+        {
+#ifdef __x86_64__
+        case sizeof(uint32_t):
+# ifdef HAVE_AS_MOVDIR
+            asm ( "movdiri %0, (%1)"
+                  :: "r" (*(uint32_t *)data), "r" (ptr) : "memory" );
+# else
+            /* movdiri %esi, (%rdi) */
+            asm ( ".byte 0x0f, 0x38, 0xf9, 0x37"
+                  :: "S" (*(uint32_t *)data), "D" (ptr) : "memory" );
+# endif
+            break;
+#endif
+
+        case sizeof(unsigned long):
+#ifdef HAVE_AS_MOVDIR
+            asm ( "movdiri %0, (%1)"
+                  :: "r" (*(unsigned long *)data), "r" (ptr) : "memory" );
+#else
+            /* movdiri %rsi, (%rdi) */
+            asm ( ".byte 0x48, 0x0f, 0x38, 0xf9, 0x37"
+                  :: "S" (*(unsigned long *)data), "D" (ptr) : "memory" );
+#endif
+            break;
+
+        case 64:
+            if ( ((unsigned long)ptr & 0x3f) )
+            {
+                ASSERT_UNREACHABLE();
+                return X86EMUL_UNHANDLEABLE;
+            }
+#ifdef HAVE_AS_MOVDIR
+            asm ( "movdir64b (%0), %1" :: "r" (data), "r" (ptr) : "memory" );
+#else
+            /* movdir64b (%rsi), %rdi */
+            asm ( ".byte 0x66, 0x0f, 0x38, 0xf8, 0x3e"
+                  :: "S" (data), "D" (ptr) : "memory" );
+#endif
+            break;
+
+        default:
+            ASSERT_UNREACHABLE();
+            return X86EMUL_UNHANDLEABLE;
+        }
+        break;
+
+    default:
+        ASSERT_UNREACHABLE();
+        return X86EMUL_UNHANDLEABLE;
+    }
+
+    return X86EMUL_OKAY;
+}
+
 static void __init __maybe_unused build_assertions(void)
 {
     /* Check the values against SReg3 encoding in opcode/ModRM bytes. */
@@ -11759,6 +11868,9 @@ x86_insn_is_mem_write(const struct x86_e
         {
         case 0x63:                         /* ARPL */
             return !mode_64bit();
+
+        case X86EMUL_OPC_66(0x0f38, 0xf8): /* MOVDIR64B */
+            return true;
         }
 
         return false;
--- a/xen/arch/x86/x86_emulate/x86_emulate.h
+++ b/xen/arch/x86/x86_emulate/x86_emulate.h
@@ -310,6 +310,22 @@ struct x86_emulate_ops
         struct x86_emulate_ctxt *ctxt);
 
     /*
+     * blk: Emulate a large (block) memory access.
+     * @p_data: [IN/OUT] (optional) Pointer to source/destination buffer.
+     * @eflags: [IN/OUT] Pointer to EFLAGS to be updated according to
+     *                   instruction effects.
+     * @state:  [IN/OUT] Pointer to (opaque) emulator state.
+     */
+    int (*blk)(
+        enum x86_segment seg,
+        unsigned long offset,
+        void *p_data,
+        unsigned int bytes,
+        uint32_t *eflags,
+        struct x86_emulate_state *state,
+        struct x86_emulate_ctxt *ctxt);
+
+    /*
      * validate: Post-decode, pre-emulate hook to allow caller controlled
      * filtering.
      */
@@ -793,6 +809,14 @@ x86_emul_rmw(
     unsigned int bytes,
     uint32_t *eflags,
     struct x86_emulate_state *state,
+    struct x86_emulate_ctxt *ctxt);
+int
+x86_emul_blk(
+    void *ptr,
+    void *data,
+    unsigned int bytes,
+    uint32_t *eflags,
+    struct x86_emulate_state *state,
     struct x86_emulate_ctxt *ctxt);
 
 static inline void x86_emul_hw_exception(
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -118,6 +118,8 @@
 #define cpu_has_avx512_bitalg   boot_cpu_has(X86_FEATURE_AVX512_BITALG)
 #define cpu_has_avx512_vpopcntdq boot_cpu_has(X86_FEATURE_AVX512_VPOPCNTDQ)
 #define cpu_has_rdpid           boot_cpu_has(X86_FEATURE_RDPID)
+#define cpu_has_movdiri         boot_cpu_has(X86_FEATURE_MOVDIRI)
+#define cpu_has_movdir64b       boot_cpu_has(X86_FEATURE_MOVDIR64B)
 
 /* CPUID level 0x80000007.edx */
 #define cpu_has_itsc            boot_cpu_has(X86_FEATURE_ITSC)
--- a/xen/include/public/arch-x86/cpufeatureset.h
+++ b/xen/include/public/arch-x86/cpufeatureset.h
@@ -241,6 +241,8 @@ XEN_CPUFEATURE(AVX512_VPOPCNTDQ, 6*32+14
 XEN_CPUFEATURE(TSXLDTRK,      6*32+16) /*a  TSX load tracking suspend/resume insns */
 XEN_CPUFEATURE(RDPID,         6*32+22) /*A  RDPID instruction */
 XEN_CPUFEATURE(CLDEMOTE,      6*32+25) /*A  CLDEMOTE instruction */
+XEN_CPUFEATURE(MOVDIRI,       6*32+27) /*A  MOVDIRI instruction */
+XEN_CPUFEATURE(MOVDIR64B,     6*32+28) /*A  MOVDIR64B instruction */
 
 /* AMD-defined CPU features, CPUID level 0x80000007.edx, word 7 */
 XEN_CPUFEATURE(ITSC,          7*32+ 8) /*   Invariant TSC */


Re: [PATCH v10 5/9] x86emul: support MOVDIR{I,64B} insns
Posted by Andrew Cooper 3 years, 11 months ago
On 25/05/2020 15:28, Jan Beulich wrote:
> Introduce a new blk() hook, paralleling the rmw() one in a certain way,
> but being intended for larger data sizes, and hence its HVM intermediate
> handling function doesn't fall back to splitting the operation if the
> requested virtual address can't be mapped.
>
> Note that SDM revision 071 doesn't specify exception behavior for
> ModRM.mod == 0b11; assuming #UD here.

Once again - I don't think this wants calling out.  That encoding space
will be used for a new Grp at some point in the future, and be a
different instruction.

>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> Reviewed-by: Paul Durrant <paul@xen.org>

Acked-by: Andrew Cooper <andrew.cooper@citrix.com>, although with one
recommendation...

> --- a/xen/include/public/arch-x86/cpufeatureset.h
> +++ b/xen/include/public/arch-x86/cpufeatureset.h
> @@ -241,6 +241,8 @@ XEN_CPUFEATURE(AVX512_VPOPCNTDQ, 6*32+14
>  XEN_CPUFEATURE(TSXLDTRK,      6*32+16) /*a  TSX load tracking suspend/resume insns */
>  XEN_CPUFEATURE(RDPID,         6*32+22) /*A  RDPID instruction */
>  XEN_CPUFEATURE(CLDEMOTE,      6*32+25) /*A  CLDEMOTE instruction */
> +XEN_CPUFEATURE(MOVDIRI,       6*32+27) /*A  MOVDIRI instruction */
> +XEN_CPUFEATURE(MOVDIR64B,     6*32+28) /*A  MOVDIR64B instruction */

I'd be tempted to leave these as 'a' for now, seeing as we have the ability.

These instructions aren't actually of any use for domains without PCI
devices, and a "default" will be more migratable as a consequence.

We're going to need further toolstack changes to make CXL-passthrough
viable, so instruction adjustments can be part of that work.

~Andrew

Re: [PATCH v10 5/9] x86emul: support MOVDIR{I,64B} insns
Posted by Jan Beulich 3 years, 11 months ago
On 29.05.2020 15:55, Andrew Cooper wrote:
> On 25/05/2020 15:28, Jan Beulich wrote:
>> Introduce a new blk() hook, paralleling the rmw() one in a certain way,
>> but being intended for larger data sizes, and hence its HVM intermediate
>> handling function doesn't fall back to splitting the operation if the
>> requested virtual address can't be mapped.
>>
>> Note that SDM revision 071 doesn't specify exception behavior for
>> ModRM.mod == 0b11; assuming #UD here.
> 
> Once again - I don't think this wants calling out.  That encoding space
> will be used for a new Grp at some point in the future, and be a
> different instruction.

Possible; without it being spelled out, one may also think (at least
for MOVDIRI) that the register-only encoding could be a
re-encoding of plain MOV. I'd prefer to keep it.

>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>> Reviewed-by: Paul Durrant <paul@xen.org>
> 
> Acked-by: Andrew Cooper <andrew.cooper@citrix.com>, although with one
> recommendation...

Thanks and ...

>> --- a/xen/include/public/arch-x86/cpufeatureset.h
>> +++ b/xen/include/public/arch-x86/cpufeatureset.h
>> @@ -241,6 +241,8 @@ XEN_CPUFEATURE(AVX512_VPOPCNTDQ, 6*32+14
>>  XEN_CPUFEATURE(TSXLDTRK,      6*32+16) /*a  TSX load tracking suspend/resume insns */
>>  XEN_CPUFEATURE(RDPID,         6*32+22) /*A  RDPID instruction */
>>  XEN_CPUFEATURE(CLDEMOTE,      6*32+25) /*A  CLDEMOTE instruction */
>> +XEN_CPUFEATURE(MOVDIRI,       6*32+27) /*A  MOVDIRI instruction */
>> +XEN_CPUFEATURE(MOVDIR64B,     6*32+28) /*A  MOVDIR64B instruction */
> 
> I'd be tempted to leave these as 'a' for now, seeing as we have the ability.
> 
> These instructions aren't actually of any use for domains without PCI
> devices, and a "default" will be more migrateable as a consequence.

... okay, done.

Jan

[PATCH v10 6/9] x86emul: support ENQCMD insns
Posted by Jan Beulich 3 years, 11 months ago
Note that the ISA extensions document revision 038 doesn't specify
exception behavior for ModRM.mod == 0b11; assuming #UD here.

No tests are being added to the harness - this would be quite hard,
as we can't just issue the insns against RAM. Their similarity with
MOVDIR64B should make the test case there good enough to cover any
fundamental flaws.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
TBD: This doesn't (can't) consult PASID translation tables yet, as we
     have no VMX code for this so far. I guess for this we will want to
     replace the direct ->read_msr(MSR_PASID, ...) with a new
     ->read_pasid() hook.
---
v10: Re-base over changes earlier in the series as well as later parts
     committed already.
v9: Consistently use named asm() operands. Also adjust
    x86_insn_is_mem_write(). A -> a in public header. Move
    asm-x86/msr-index.h addition and drop _IA32 from their names.
    Introduce _AC() into the emulator harness as a result.
v7: Re-base.
v6: Re-base.
v5: New.
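
Condensed, the fixup of the command's first dword ahead of the actual store
(a restatement of the logic in the x86_emulate() hunk below; msr_val holds
the MSR_PASID contents): ENQCMD substitutes PASID[19:0] from the MSR (after
checking its valid bit), while ENQCMDS (ring 0 only) passes the
guest-supplied dword through; both forms have the reserved bits 30:20
cleared.

    if ( vex.pfx == vex_f2 )                  /* enqcmd */
        mmvalp->data32[0] = MASK_EXTR(msr_val, PASID_PASID_MASK);
    mmvalp->data32[0] &= ~0x7ff00000;         /* reserved bits 30:20 */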

--- a/tools/tests/x86_emulator/predicates.c
+++ b/tools/tests/x86_emulator/predicates.c
@@ -511,6 +511,8 @@ static const struct {
     { { 0xf6 }, { 2, 2 }, T, R, pfx_66 }, /* adcx */
     { { 0xf6 }, { 2, 2 }, T, R, pfx_f3 }, /* adox */
     { { 0xf8 }, { 2, 2 }, F, W, pfx_66 }, /* movdir64b */
+    { { 0xf8 }, { 2, 2 }, F, W, pfx_f3 }, /* enqcmds */
+    { { 0xf8 }, { 2, 2 }, F, W, pfx_f2 }, /* enqcmd */
     { { 0xf9 }, { 2, 2 }, F, W }, /* movdiri */
 };
 #undef CND
--- a/tools/tests/x86_emulator/x86-emulate.h
+++ b/tools/tests/x86_emulator/x86-emulate.h
@@ -59,6 +59,9 @@
     (type *)((char *)mptr__ - offsetof(type, member)); \
 })
 
+#define AC_(n,t) (n##t)
+#define _AC(n,t) AC_(n,t)
+
 #define hweight32 __builtin_popcount
 #define hweight64 __builtin_popcountll
 
--- a/xen/arch/x86/arch.mk
+++ b/xen/arch/x86/arch.mk
@@ -48,6 +48,7 @@ $(call as-option-add,CFLAGS,CC,"clwb (%r
 $(call as-option-add,CFLAGS,CC,".equ \"x\"$$(comma)1",-DHAVE_AS_QUOTED_SYM)
 $(call as-option-add,CFLAGS,CC,"invpcid (%rax)$$(comma)%rax",-DHAVE_AS_INVPCID)
 $(call as-option-add,CFLAGS,CC,"movdiri %rax$$(comma)(%rax)",-DHAVE_AS_MOVDIR)
+$(call as-option-add,CFLAGS,CC,"enqcmd (%rax)$$(comma)%rax",-DHAVE_AS_ENQCMD)
 
 # GAS's idea of true is -1.  Clang's idea is 1
 $(call as-option-add,CFLAGS,CC,\
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -855,6 +855,7 @@ struct x86_emulate_state {
     } rmw;
     enum {
         blk_NONE,
+        blk_enqcmd,
         blk_movdir,
     } blk;
     uint8_t modrm, modrm_mod, modrm_reg, modrm_rm;
@@ -901,6 +902,7 @@ typedef union {
     uint64_t __attribute__ ((aligned(16))) xmm[2];
     uint64_t __attribute__ ((aligned(32))) ymm[4];
     uint64_t __attribute__ ((aligned(64))) zmm[8];
+    uint32_t data32[16];
 } mmval_t;
 
 /*
@@ -1923,6 +1925,7 @@ amd_like(const struct x86_emulate_ctxt *
 #define vcpu_has_rdpid()       (ctxt->cpuid->feat.rdpid)
 #define vcpu_has_movdiri()     (ctxt->cpuid->feat.movdiri)
 #define vcpu_has_movdir64b()   (ctxt->cpuid->feat.movdir64b)
+#define vcpu_has_enqcmd()      (ctxt->cpuid->feat.enqcmd)
 #define vcpu_has_avx512_4vnniw() (ctxt->cpuid->feat.avx512_4vnniw)
 #define vcpu_has_avx512_4fmaps() (ctxt->cpuid->feat.avx512_4fmaps)
 #define vcpu_has_serialize()   (ctxt->cpuid->feat.serialize)
@@ -10245,6 +10248,36 @@ x86_emulate(
         state->simd_size = simd_none;
         break;
 
+    case X86EMUL_OPC_F2(0x0f38, 0xf8): /* enqcmd r,m512 */
+    case X86EMUL_OPC_F3(0x0f38, 0xf8): /* enqcmds r,m512 */
+        host_and_vcpu_must_have(enqcmd);
+        generate_exception_if(ea.type != OP_MEM, EXC_UD);
+        generate_exception_if(vex.pfx != vex_f2 && !mode_ring0(), EXC_GP, 0);
+        src.val = truncate_ea(*dst.reg);
+        generate_exception_if(!is_aligned(x86_seg_es, src.val, 64, ctxt, ops),
+                              EXC_GP, 0);
+        fail_if(!ops->blk);
+        BUILD_BUG_ON(sizeof(*mmvalp) < 64);
+        if ( (rc = ops->read(ea.mem.seg, ea.mem.off, mmvalp, 64,
+                             ctxt)) != X86EMUL_OKAY )
+            goto done;
+        if ( vex.pfx == vex_f2 ) /* enqcmd */
+        {
+            fail_if(!ops->read_msr);
+            if ( (rc = ops->read_msr(MSR_PASID, &msr_val,
+                                     ctxt)) != X86EMUL_OKAY )
+                goto done;
+            generate_exception_if(!(msr_val & PASID_VALID), EXC_GP, 0);
+            mmvalp->data32[0] = MASK_EXTR(msr_val, PASID_PASID_MASK);
+        }
+        mmvalp->data32[0] &= ~0x7ff00000;
+        state->blk = blk_enqcmd;
+        if ( (rc = ops->blk(x86_seg_es, src.val, mmvalp, 64, &_regs.eflags,
+                            state, ctxt)) != X86EMUL_OKAY )
+            goto done;
+        state->simd_size = simd_none;
+        break;
+
     case X86EMUL_OPC(0x0f38, 0xf9): /* movdiri mem,r */
         host_and_vcpu_must_have(movdiri);
         generate_exception_if(dst.type != OP_MEM, EXC_UD);
@@ -11525,11 +11558,36 @@ int x86_emul_blk(
 {
     switch ( state->blk )
     {
+        bool zf;
+
         /*
          * Throughout this switch(), memory clobbers are used to compensate
          * that other operands may not properly express the (full) memory
          * ranges covered.
          */
+    case blk_enqcmd:
+        ASSERT(bytes == 64);
+        if ( ((unsigned long)ptr & 0x3f) )
+        {
+            ASSERT_UNREACHABLE();
+            return X86EMUL_UNHANDLEABLE;
+        }
+        *eflags &= ~EFLAGS_MASK;
+#ifdef HAVE_AS_ENQCMD
+        asm ( "enqcmds (%[src]), %[dst]" ASM_FLAG_OUT(, "; setz %[zf]")
+              : [zf] ASM_FLAG_OUT("=@ccz", "=qm") (zf)
+              : [src] "r" (data), [dst] "r" (ptr) : "memory" );
+#else
+        /* enqcmds (%rsi), %rdi */
+        asm ( ".byte 0xf3, 0x0f, 0x38, 0xf8, 0x3e"
+              ASM_FLAG_OUT(, "; setz %[zf]")
+              : [zf] ASM_FLAG_OUT("=@ccz", "=qm") (zf)
+              : "S" (data), "D" (ptr) : "memory" );
+#endif
+        if ( zf )
+            *eflags |= X86_EFLAGS_ZF;
+        break;
+
     case blk_movdir:
         switch ( bytes )
         {
@@ -11870,6 +11928,8 @@ x86_insn_is_mem_write(const struct x86_e
             return !mode_64bit();
 
         case X86EMUL_OPC_66(0x0f38, 0xf8): /* MOVDIR64B */
+        case X86EMUL_OPC_F2(0x0f38, 0xf8): /* ENQCMD */
+        case X86EMUL_OPC_F3(0x0f38, 0xf8): /* ENQCMDS */
             return true;
         }
 
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -120,6 +120,7 @@
 #define cpu_has_rdpid           boot_cpu_has(X86_FEATURE_RDPID)
 #define cpu_has_movdiri         boot_cpu_has(X86_FEATURE_MOVDIRI)
 #define cpu_has_movdir64b       boot_cpu_has(X86_FEATURE_MOVDIR64B)
+#define cpu_has_enqcmd          boot_cpu_has(X86_FEATURE_ENQCMD)
 
 /* CPUID level 0x80000007.edx */
 #define cpu_has_itsc            boot_cpu_has(X86_FEATURE_ITSC)
--- a/xen/include/asm-x86/msr-index.h
+++ b/xen/include/asm-x86/msr-index.h
@@ -74,6 +74,10 @@
 #define MSR_PL3_SSP                         0x000006a7
 #define MSR_INTERRUPT_SSP_TABLE             0x000006a8
 
+#define MSR_PASID                           0x00000d93
+#define  PASID_PASID_MASK                   0x000fffff
+#define  PASID_VALID                        (_AC(1, ULL) << 31)
+
 /*
  * Legacy MSR constants in need of cleanup.  No new MSRs below this comment.
  */
--- a/xen/include/public/arch-x86/cpufeatureset.h
+++ b/xen/include/public/arch-x86/cpufeatureset.h
@@ -243,6 +243,7 @@ XEN_CPUFEATURE(RDPID,         6*32+22) /
 XEN_CPUFEATURE(CLDEMOTE,      6*32+25) /*A  CLDEMOTE instruction */
 XEN_CPUFEATURE(MOVDIRI,       6*32+27) /*A  MOVDIRI instruction */
 XEN_CPUFEATURE(MOVDIR64B,     6*32+28) /*A  MOVDIR64B instruction */
+XEN_CPUFEATURE(ENQCMD,        6*32+29) /*   ENQCMD{,S} instructions */
 
 /* AMD-defined CPU features, CPUID level 0x80000007.edx, word 7 */
 XEN_CPUFEATURE(ITSC,          7*32+ 8) /*   Invariant TSC */


[PATCH v10 7/9] x86emul: support FNSTENV and FNSAVE
Posted by Jan Beulich 3 years, 11 months ago
To avoid introducing another boolean into emulator state, the
rex_prefix field gets (ab)used to convey the real/VM86 vs protected mode
info (affecting structure layout, albeit not size) to x86_emul_blk().
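
For reference, the real/VM86 variants of the layouts get populated by
the new blk_fst handling folding the protected mode FCS:FIP (and
FDS:FDP) pairs into linear addresses; with made-up example values:

    fip = 0x0010 + (0x1234 << 4) = 0x12350
    => fip_lo = 0x2350, fip_hi = 0x1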

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
TBD: The full 16-bit padding fields in the 32-bit structures get filled
     with all ones by modern CPUs (i.e. unlike what the comment says for
     FIP and FDP). We may want to mirror this as well (for the real mode
     variant), even if those fields' contents are unspecified.
---
v9: Fix !HVM build. Add /*state->*/ comments. Add memset(). Add/extend
    comments.
v7: New.

--- a/tools/tests/x86_emulator/x86-emulate.h
+++ b/tools/tests/x86_emulator/x86-emulate.h
@@ -129,6 +129,7 @@ static inline bool xcr0_mask(uint64_t ma
 }
 
 #define cache_line_size() (cp.basic.clflush_size * 8)
+#define cpu_has_fpu        cp.basic.fpu
 #define cpu_has_mmx        cp.basic.mmx
 #define cpu_has_fxsr       cp.basic.fxsr
 #define cpu_has_sse        cp.basic.sse
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -748,6 +748,25 @@ static struct x86_emulate_ops emulops =
 
 #define MMAP_ADDR 0x100000
 
+/*
+ * 64-bit OSes may not (be able to) properly restore the two selectors in
+ * the FPU environment. Zap them so that memcmp() on two saved images will
+ * work regardless of whether a context switch occurred in the middle.
+ */
+static void zap_fpsel(unsigned int *env, bool is_32bit)
+{
+    if ( is_32bit )
+    {
+        env[4] &= ~0xffff;
+        env[6] &= ~0xffff;
+    }
+    else
+    {
+        env[2] &= ~0xffff;
+        env[3] &= ~0xffff;
+    }
+}
+
 #ifdef __x86_64__
 # define STKVAL_DISP 64
 static const struct {
@@ -2394,6 +2413,62 @@ int main(int argc, char **argv)
         printf("okay\n");
     }
     else
+        printf("skipped\n");
+
+    printf("%-40s", "Testing fnstenv 4(%ecx)...");
+    if ( stack_exec && cpu_has_fpu )
+    {
+        const uint16_t three = 3;
+
+        asm volatile ( "fninit\n\t"
+                       "fld1\n\t"
+                       "fidivs %1\n\t"
+                       "fstenv %0"
+                       : "=m" (res[9]) : "m" (three) : "memory" );
+        zap_fpsel(&res[9], true);
+        instr[0] = 0xd9; instr[1] = 0x71; instr[2] = 0x04;
+        regs.eip = (unsigned long)&instr[0];
+        regs.ecx = (unsigned long)res;
+        res[8] = 0xaa55aa55;
+        rc = x86_emulate(&ctxt, &emulops);
+        zap_fpsel(&res[1], true);
+        if ( (rc != X86EMUL_OKAY) ||
+             memcmp(res + 1, res + 9, 28) ||
+             res[8] != 0xaa55aa55 ||
+             (regs.eip != (unsigned long)&instr[3]) )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
+    printf("%-40s", "Testing 16-bit fnsave (%ecx)...");
+    if ( stack_exec && cpu_has_fpu )
+    {
+        const uint16_t five = 5;
+
+        asm volatile ( "fninit\n\t"
+                       "fld1\n\t"
+                       "fidivs %1\n\t"
+                       "fsaves %0"
+                       : "=m" (res[25]) : "m" (five) : "memory" );
+        zap_fpsel(&res[25], false);
+        asm volatile ( "frstors %0" :: "m" (res[25]) : "memory" );
+        instr[0] = 0x66; instr[1] = 0xdd; instr[2] = 0x31;
+        regs.eip = (unsigned long)&instr[0];
+        regs.ecx = (unsigned long)res;
+        res[23] = 0xaa55aa55;
+        res[24] = 0xaa55aa55;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( (rc != X86EMUL_OKAY) ||
+             memcmp(res, res + 25, 94) ||
+             (res[23] >> 16) != 0xaa55 ||
+             res[24] != 0xaa55aa55 ||
+             (regs.eip != (unsigned long)&instr[3]) )
+            goto fail;
+        printf("okay\n");
+    }
+    else
         printf("skipped\n");
 
     printf("%-40s", "Testing movq %mm3,(%ecx)...");
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -856,6 +856,9 @@ struct x86_emulate_state {
     enum {
         blk_NONE,
         blk_enqcmd,
+#ifndef X86EMUL_NO_FPU
+        blk_fst, /* FNSTENV, FNSAVE */
+#endif
         blk_movdir,
     } blk;
     uint8_t modrm, modrm_mod, modrm_reg, modrm_rm;
@@ -897,6 +900,50 @@ struct x86_emulate_state {
 #define PTR_POISON NULL /* 32-bit builds are for user-space, so NULL is OK. */
 #endif
 
+#ifndef X86EMUL_NO_FPU
+struct x87_env16 {
+    uint16_t fcw;
+    uint16_t fsw;
+    uint16_t ftw;
+    union {
+        struct {
+            uint16_t fip_lo;
+            uint16_t fop:11, :1, fip_hi:4;
+            uint16_t fdp_lo;
+            uint16_t :12, fdp_hi:4;
+        } real;
+        struct {
+            uint16_t fip;
+            uint16_t fcs;
+            uint16_t fdp;
+            uint16_t fds;
+        } prot;
+    } mode;
+};
+
+struct x87_env32 {
+    uint32_t fcw:16, :16;
+    uint32_t fsw:16, :16;
+    uint32_t ftw:16, :16;
+    union {
+        struct {
+            /* some CPUs/FPUs also store the full FIP here */
+            uint32_t fip_lo:16, :16;
+            uint32_t fop:11, :1, fip_hi:16, :4;
+            /* some CPUs/FPUs also store the full FDP here */
+            uint32_t fdp_lo:16, :16;
+            uint32_t :12, fdp_hi:16, :4;
+        } real;
+        struct {
+            uint32_t fip;
+            uint32_t fcs:16, fop:11, :5;
+            uint32_t fdp;
+            uint32_t fds:16, :16;
+        } prot;
+    } mode;
+};
+#endif
+
 typedef union {
     uint64_t mmx;
     uint64_t __attribute__ ((aligned(16))) xmm[2];
@@ -4924,9 +4971,22 @@ x86_emulate(
                     goto done;
                 emulate_fpu_insn_memsrc(b, modrm_reg & 7, src.val);
                 break;
-            case 6: /* fnstenv - TODO */
+            case 6: /* fnstenv */
+                fail_if(!ops->blk);
+                state->blk = blk_fst;
+                /*
+                 * REX is meaningless for this insn by this point - (ab)use
+                 * the field to communicate real vs protected mode to ->blk().
+                 */
+                /*state->*/rex_prefix = in_protmode(ctxt, ops);
+                if ( (rc = ops->blk(ea.mem.seg, ea.mem.off, NULL,
+                                    op_bytes > 2 ? sizeof(struct x87_env32)
+                                                 : sizeof(struct x87_env16),
+                                    &_regs.eflags,
+                                    state, ctxt)) != X86EMUL_OKAY )
+                    goto done;
                 state->fpu_ctrl = true;
-                goto unimplemented_insn;
+                break;
             case 7: /* fnstcw m2byte */
                 state->fpu_ctrl = true;
             fpu_memdst16:
@@ -5080,9 +5140,24 @@ x86_emulate(
                 emulate_fpu_insn_memdst(b, modrm_reg & 7, dst.val);
                 break;
             case 4: /* frstor - TODO */
-            case 6: /* fnsave - TODO */
                 state->fpu_ctrl = true;
                 goto unimplemented_insn;
+            case 6: /* fnsave */
+                fail_if(!ops->blk);
+                state->blk = blk_fst;
+                /*
+                 * REX is meaningless for this insn by this point - (ab)use
+                 * the field to communicate real vs protected mode to ->blk().
+                 */
+                /*state->*/rex_prefix = in_protmode(ctxt, ops);
+                if ( (rc = ops->blk(ea.mem.seg, ea.mem.off, NULL,
+                                    op_bytes > 2 ? sizeof(struct x87_env32) + 80
+                                                 : sizeof(struct x87_env16) + 80,
+                                    &_regs.eflags,
+                                    state, ctxt)) != X86EMUL_OKAY )
+                    goto done;
+                state->fpu_ctrl = true;
+                break;
             case 7: /* fnstsw m2byte */
                 state->fpu_ctrl = true;
                 goto fpu_memdst16;
@@ -11559,6 +11634,14 @@ int x86_emul_blk(
     switch ( state->blk )
     {
         bool zf;
+#ifndef X86EMUL_NO_FPU
+        struct {
+            struct x87_env32 env;
+            struct {
+               uint8_t bytes[10];
+            } freg[8];
+        } fpstate;
+#endif
 
         /*
          * Throughout this switch(), memory clobbers are used to compensate
@@ -11588,6 +11671,98 @@ int x86_emul_blk(
             *eflags |= X86_EFLAGS_ZF;
         break;
 
+#ifndef X86EMUL_NO_FPU
+
+    case blk_fst:
+        ASSERT(!data);
+
+        /* Don't chance consuming uninitialized data. */
+        memset(&fpstate, 0, sizeof(fpstate));
+        if ( bytes > sizeof(fpstate.env) )
+            asm ( "fnsave %0" : "+m" (fpstate) );
+        else
+            asm ( "fnstenv %0" : "+m" (fpstate.env) );
+
+        /* state->rex_prefix carries CR0.PE && !EFLAGS.VM setting */
+        switch ( bytes )
+        {
+        case sizeof(fpstate.env): /* 32-bit FNSTENV */
+        case sizeof(fpstate):     /* 32-bit FNSAVE */
+            if ( !state->rex_prefix )
+            {
+                /* Convert 32-bit prot to 32-bit real/vm86 format. */
+                unsigned int fip = fpstate.env.mode.prot.fip +
+                                   (fpstate.env.mode.prot.fcs << 4);
+                unsigned int fdp = fpstate.env.mode.prot.fdp +
+                                   (fpstate.env.mode.prot.fds << 4);
+                unsigned int fop = fpstate.env.mode.prot.fop;
+
+                memset(&fpstate.env.mode, 0, sizeof(fpstate.env.mode));
+                fpstate.env.mode.real.fip_lo = fip;
+                fpstate.env.mode.real.fip_hi = fip >> 16;
+                fpstate.env.mode.real.fop = fop;
+                fpstate.env.mode.real.fdp_lo = fdp;
+                fpstate.env.mode.real.fdp_hi = fdp >> 16;
+            }
+            memcpy(ptr, &fpstate.env, sizeof(fpstate.env));
+            if ( bytes == sizeof(fpstate.env) )
+                ptr = NULL;
+            else
+                ptr += sizeof(fpstate.env);
+            break;
+
+        case sizeof(struct x87_env16):                        /* 16-bit FNSTENV */
+        case sizeof(struct x87_env16) + sizeof(fpstate.freg): /* 16-bit FNSAVE */
+            if ( state->rex_prefix )
+            {
+                /* Convert 32-bit prot to 16-bit prot format. */
+                struct x87_env16 *env = ptr;
+
+                env->fcw = fpstate.env.fcw;
+                env->fsw = fpstate.env.fsw;
+                env->ftw = fpstate.env.ftw;
+                env->mode.prot.fip = fpstate.env.mode.prot.fip;
+                env->mode.prot.fcs = fpstate.env.mode.prot.fcs;
+                env->mode.prot.fdp = fpstate.env.mode.prot.fdp;
+                env->mode.prot.fds = fpstate.env.mode.prot.fds;
+            }
+            else
+            {
+                /* Convert 32-bit prot to 16-bit real/vm86 format. */
+                unsigned int fip = fpstate.env.mode.prot.fip +
+                                   (fpstate.env.mode.prot.fcs << 4);
+                unsigned int fdp = fpstate.env.mode.prot.fdp +
+                                   (fpstate.env.mode.prot.fds << 4);
+                struct x87_env16 env = {
+                    .fcw = fpstate.env.fcw,
+                    .fsw = fpstate.env.fsw,
+                    .ftw = fpstate.env.ftw,
+                    .mode.real.fip_lo = fip,
+                    .mode.real.fip_hi = fip >> 16,
+                    .mode.real.fop = fpstate.env.mode.prot.fop,
+                    .mode.real.fdp_lo = fdp,
+                    .mode.real.fdp_hi = fdp >> 16
+                };
+
+                memcpy(ptr, &env, sizeof(env));
+            }
+            if ( bytes == sizeof(struct x87_env16) )
+                ptr = NULL;
+            else
+                ptr += sizeof(struct x87_env16);
+            break;
+
+        default:
+            ASSERT_UNREACHABLE();
+            return X86EMUL_UNHANDLEABLE;
+        }
+
+        if ( ptr )
+            memcpy(ptr, fpstate.freg, sizeof(fpstate.freg));
+        break;
+
+#endif /* X86EMUL_NO_FPU */
+
     case blk_movdir:
         switch ( bytes )
         {


Re: [PATCH v10 7/9] x86emul: support FNSTENV and FNSAVE
Posted by Andrew Cooper 3 years, 11 months ago
On 25/05/2020 15:29, Jan Beulich wrote:
> To avoid introducing another boolean into emulator state, the
> rex_prefix field gets (ab)used to convey the real/VM86 vs protected mode
> info (affecting structure layout, albeit not size) to x86_emul_blk().
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>, with one suggestion.

> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
> @@ -856,6 +856,9 @@ struct x86_emulate_state {
>      enum {
>          blk_NONE,
>          blk_enqcmd,
> +#ifndef X86EMUL_NO_FPU
> +        blk_fst, /* FNSTENV, FNSAVE */
> +#endif
>          blk_movdir,
>      } blk;
>      uint8_t modrm, modrm_mod, modrm_reg, modrm_rm;
> @@ -897,6 +900,50 @@ struct x86_emulate_state {
>  #define PTR_POISON NULL /* 32-bit builds are for user-space, so NULL is OK. */
>  #endif
>  
> +#ifndef X86EMUL_NO_FPU
> +struct x87_env16 {
> +    uint16_t fcw;
> +    uint16_t fsw;
> +    uint16_t ftw;
> +    union {
> +        struct {
> +            uint16_t fip_lo;
> +            uint16_t fop:11, :1, fip_hi:4;
> +            uint16_t fdp_lo;
> +            uint16_t :12, fdp_hi:4;
> +        } real;
> +        struct {
> +            uint16_t fip;
> +            uint16_t fcs;
> +            uint16_t fdp;
> +            uint16_t fds;
> +        } prot;
> +    } mode;
> +};
> +
> +struct x87_env32 {
> +    uint32_t fcw:16, :16;
> +    uint32_t fsw:16, :16;
> +    uint32_t ftw:16, :16;

uint16_t fcw, :16;
uint16_t fsw, :16;
uint16_t ftw, :16;

?

~Andrew

Re: [PATCH v10 7/9] x86emul: support FNSTENV and FNSAVE
Posted by Jan Beulich 3 years, 11 months ago
On 29.05.2020 16:08, Andrew Cooper wrote:
> On 25/05/2020 15:29, Jan Beulich wrote:
>> To avoid introducing another boolean into emulator state, the
>> rex_prefix field gets (ab)used to convey the real/VM86 vs protected mode
>> info (affecting structure layout, albeit not size) to x86_emul_blk().
>>
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> 
> Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>,

Thanks.

> with one suggestion.
> 
>> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
>> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
>> @@ -856,6 +856,9 @@ struct x86_emulate_state {
>>      enum {
>>          blk_NONE,
>>          blk_enqcmd,
>> +#ifndef X86EMUL_NO_FPU
>> +        blk_fst, /* FNSTENV, FNSAVE */
>> +#endif
>>          blk_movdir,
>>      } blk;
>>      uint8_t modrm, modrm_mod, modrm_reg, modrm_rm;
>> @@ -897,6 +900,50 @@ struct x86_emulate_state {
>>  #define PTR_POISON NULL /* 32-bit builds are for user-space, so NULL is OK. */
>>  #endif
>>  
>> +#ifndef X86EMUL_NO_FPU
>> +struct x87_env16 {
>> +    uint16_t fcw;
>> +    uint16_t fsw;
>> +    uint16_t ftw;
>> +    union {
>> +        struct {
>> +            uint16_t fip_lo;
>> +            uint16_t fop:11, :1, fip_hi:4;
>> +            uint16_t fdp_lo;
>> +            uint16_t :12, fdp_hi:4;
>> +        } real;
>> +        struct {
>> +            uint16_t fip;
>> +            uint16_t fcs;
>> +            uint16_t fdp;
>> +            uint16_t fds;
>> +        } prot;
>> +    } mode;
>> +};
>> +
>> +struct x87_env32 {
>> +    uint32_t fcw:16, :16;
>> +    uint32_t fsw:16, :16;
>> +    uint32_t ftw:16, :16;
> 
> uint16_t fcw, :16;
> uint16_t fsw, :16;
> uint16_t ftw, :16;
> 
> ?

You had suggested this before, and I did reply that my intention
was for x87_env16 to use uint16_t throughout, and x87_env32
uint32_t, for all of their respective pieces. In the end it doesn't
make a difference, and hence this cosmetic aspect is what made me
pick one of the various possible options.

Jan

[PATCH v10 8/9] x86emul: support FLDENV and FRSTOR
Posted by Jan Beulich 3 years, 11 months ago
While the Intel SDM claims that FRSTOR itself may raise #MF upon
completion, this was confirmed by Intel to be a doc error which will be
corrected in due course; behavior is like FLDENV, and like old hard copy
manuals describe it.

Re-arrange a switch() statement's case label order to allow for
fall-through from FLDENV handling to FNSTENV's.
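
For reference, the new blk_fld handling performs the inverse of
blk_fst's real/VM86 conversion, splitting the linear FIP/FDP values
back up; with made-up example values:

    fip = 0x2350 + (0x1 << 16) = 0x12350
    => prot.fip = 0x12350 & 0xf = 0x0, prot.fcs = 0x12350 >> 4 = 0x1235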

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v9: Refine description. Re-base over changes to earlier patch. Add
    comments.
v7: New.

--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -2442,6 +2442,27 @@ int main(int argc, char **argv)
     else
         printf("skipped\n");
 
+    printf("%-40s", "Testing fldenv 8(%edx)...");
+    if ( stack_exec && cpu_has_fpu )
+    {
+        asm volatile ( "fnstenv %0\n\t"
+                       "fninit"
+                       : "=m" (res[2]) :: "memory" );
+        zap_fpsel(&res[2], true);
+        instr[0] = 0xd9; instr[1] = 0x62; instr[2] = 0x08;
+        regs.eip = (unsigned long)&instr[0];
+        regs.edx = (unsigned long)res;
+        rc = x86_emulate(&ctxt, &emulops);
+        asm volatile ( "fnstenv %0" : "=m" (res[9]) :: "memory" );
+        if ( (rc != X86EMUL_OKAY) ||
+             memcmp(res + 2, res + 9, 28) ||
+             (regs.eip != (unsigned long)&instr[3]) )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
     printf("%-40s", "Testing 16-bit fnsave (%ecx)...");
     if ( stack_exec && cpu_has_fpu )
     {
@@ -2468,6 +2489,31 @@ int main(int argc, char **argv)
             goto fail;
         printf("okay\n");
     }
+    else
+        printf("skipped\n");
+
+    printf("%-40s", "Testing frstor (%edx)...");
+    if ( stack_exec && cpu_has_fpu )
+    {
+        const uint16_t seven = 7;
+
+        asm volatile ( "fninit\n\t"
+                       "fld1\n\t"
+                       "fidivs %1\n\t"
+                       "fnsave %0\n\t"
+                       : "=&m" (res[0]) : "m" (seven) : "memory" );
+        zap_fpsel(&res[0], true);
+        instr[0] = 0xdd; instr[1] = 0x22;
+        regs.eip = (unsigned long)&instr[0];
+        regs.edx = (unsigned long)res;
+        rc = x86_emulate(&ctxt, &emulops);
+        asm volatile ( "fnsave %0" : "=m" (res[27]) :: "memory" );
+        if ( (rc != X86EMUL_OKAY) ||
+             memcmp(res, res + 27, 108) ||
+             (regs.eip != (unsigned long)&instr[2]) )
+            goto fail;
+        printf("okay\n");
+    }
     else
         printf("skipped\n");
 
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -857,6 +857,7 @@ struct x86_emulate_state {
         blk_NONE,
         blk_enqcmd,
 #ifndef X86EMUL_NO_FPU
+        blk_fld, /* FLDENV, FRSTOR */
         blk_fst, /* FNSTENV, FNSAVE */
 #endif
         blk_movdir,
@@ -4960,22 +4961,15 @@ x86_emulate(
                 dst.bytes = 4;
                 emulate_fpu_insn_memdst(b, modrm_reg & 7, dst.val);
                 break;
-            case 4: /* fldenv - TODO */
-                state->fpu_ctrl = true;
-                goto unimplemented_insn;
-            case 5: /* fldcw m2byte */
-                state->fpu_ctrl = true;
-            fpu_memsrc16:
-                if ( (rc = ops->read(ea.mem.seg, ea.mem.off, &src.val,
-                                     2, ctxt)) != X86EMUL_OKAY )
-                    goto done;
-                emulate_fpu_insn_memsrc(b, modrm_reg & 7, src.val);
-                break;
+            case 4: /* fldenv */
+                /* Raise #MF now if there are pending unmasked exceptions. */
+                emulate_fpu_insn_stub(0xd9, 0xd0 /* fnop */);
+                /* fall through */
             case 6: /* fnstenv */
                 fail_if(!ops->blk);
-                state->blk = blk_fst;
+                state->blk = modrm_reg & 2 ? blk_fst : blk_fld;
                 /*
-                 * REX is meaningless for this insn by this point - (ab)use
+                 * REX is meaningless for these insns by this point - (ab)use
                  * the field to communicate real vs protected mode to ->blk().
                  */
                 /*state->*/rex_prefix = in_protmode(ctxt, ops);
@@ -4987,6 +4981,14 @@ x86_emulate(
                     goto done;
                 state->fpu_ctrl = true;
                 break;
+            case 5: /* fldcw m2byte */
+                state->fpu_ctrl = true;
+            fpu_memsrc16:
+                if ( (rc = ops->read(ea.mem.seg, ea.mem.off, &src.val,
+                                     2, ctxt)) != X86EMUL_OKAY )
+                    goto done;
+                emulate_fpu_insn_memsrc(b, modrm_reg & 7, src.val);
+                break;
             case 7: /* fnstcw m2byte */
                 state->fpu_ctrl = true;
             fpu_memdst16:
@@ -5139,14 +5141,15 @@ x86_emulate(
                 dst.bytes = 8;
                 emulate_fpu_insn_memdst(b, modrm_reg & 7, dst.val);
                 break;
-            case 4: /* frstor - TODO */
-                state->fpu_ctrl = true;
-                goto unimplemented_insn;
+            case 4: /* frstor */
+                /* Raise #MF now if there are pending unmasked exceptions. */
+                emulate_fpu_insn_stub(0xd9, 0xd0 /* fnop */);
+                /* fall through */
             case 6: /* fnsave */
                 fail_if(!ops->blk);
-                state->blk = blk_fst;
+                state->blk = modrm_reg & 2 ? blk_fst : blk_fld;
                 /*
-                 * REX is meaningless for this insn by this point - (ab)use
+                 * REX is meaningless for these insns by this point - (ab)use
                  * the field to communicate real vs protected mode to ->blk().
                  */
                 /*state->*/rex_prefix = in_protmode(ctxt, ops);
@@ -11673,6 +11676,92 @@ int x86_emul_blk(
 
 #ifndef X86EMUL_NO_FPU
 
+    case blk_fld:
+        ASSERT(!data);
+
+        /* state->rex_prefix carries CR0.PE && !EFLAGS.VM setting */
+        switch ( bytes )
+        {
+        case sizeof(fpstate.env): /* 32-bit FLDENV */
+        case sizeof(fpstate):     /* 32-bit FRSTOR */
+            memcpy(&fpstate.env, ptr, sizeof(fpstate.env));
+            if ( !state->rex_prefix )
+            {
+                /* Convert 32-bit real/vm86 to 32-bit prot format. */
+                unsigned int fip = fpstate.env.mode.real.fip_lo +
+                                   (fpstate.env.mode.real.fip_hi << 16);
+                unsigned int fdp = fpstate.env.mode.real.fdp_lo +
+                                   (fpstate.env.mode.real.fdp_hi << 16);
+                unsigned int fop = fpstate.env.mode.real.fop;
+
+                fpstate.env.mode.prot.fip = fip & 0xf;
+                fpstate.env.mode.prot.fcs = fip >> 4;
+                fpstate.env.mode.prot.fop = fop;
+                fpstate.env.mode.prot.fdp = fdp & 0xf;
+                fpstate.env.mode.prot.fds = fdp >> 4;
+            }
+
+            if ( bytes == sizeof(fpstate.env) )
+                ptr = NULL;
+            else
+                ptr += sizeof(fpstate.env);
+            break;
+
+        case sizeof(struct x87_env16):                        /* 16-bit FLDENV */
+        case sizeof(struct x87_env16) + sizeof(fpstate.freg): /* 16-bit FRSTOR */
+        {
+            const struct x87_env16 *env = ptr;
+
+            fpstate.env.fcw = env->fcw;
+            fpstate.env.fsw = env->fsw;
+            fpstate.env.ftw = env->ftw;
+
+            if ( state->rex_prefix )
+            {
+                /* Convert 16-bit prot to 32-bit prot format. */
+                fpstate.env.mode.prot.fip = env->mode.prot.fip;
+                fpstate.env.mode.prot.fcs = env->mode.prot.fcs;
+                fpstate.env.mode.prot.fdp = env->mode.prot.fdp;
+                fpstate.env.mode.prot.fds = env->mode.prot.fds;
+                fpstate.env.mode.prot.fop = 0; /* unknown */
+            }
+            else
+            {
+                /* Convert 16-bit real/vm86 to 32-bit prot format. */
+                unsigned int fip = env->mode.real.fip_lo +
+                                   (env->mode.real.fip_hi << 16);
+                unsigned int fdp = env->mode.real.fdp_lo +
+                                   (env->mode.real.fdp_hi << 16);
+                unsigned int fop = env->mode.real.fop;
+
+                fpstate.env.mode.prot.fip = fip & 0xf;
+                fpstate.env.mode.prot.fcs = fip >> 4;
+                fpstate.env.mode.prot.fop = fop;
+                fpstate.env.mode.prot.fdp = fdp & 0xf;
+                fpstate.env.mode.prot.fds = fdp >> 4;
+            }
+
+            if ( bytes == sizeof(*env) )
+                ptr = NULL;
+            else
+                ptr += sizeof(*env);
+            break;
+        }
+
+        default:
+            ASSERT_UNREACHABLE();
+            return X86EMUL_UNHANDLEABLE;
+        }
+
+        if ( ptr )
+        {
+            memcpy(fpstate.freg, ptr, sizeof(fpstate.freg));
+            asm volatile ( "frstor %0" :: "m" (fpstate) );
+        }
+        else
+            asm volatile ( "fldenv %0" :: "m" (fpstate.env) );
+        break;
+
     case blk_fst:
         ASSERT(!data);
 


Re: [PATCH v10 8/9] x86emul: support FLDENV and FRSTOR
Posted by Andrew Cooper 3 years, 11 months ago
On 25/05/2020 15:29, Jan Beulich wrote:
> While the Intel SDM claims that FRSTOR itself may raise #MF upon
> completion, this was confirmed by Intel to be a doc error which will be
> corrected in due course; behavior is like FLDENV, and like old hard copy
> manuals describe it.
>
> Re-arrange a switch() statement's case label order to allow for
> fall-through from FLDENV handling to FNSTENV's.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>

[PATCH v10 9/9] x86emul: support FXSAVE/FXRSTOR
Posted by Jan Beulich 3 years, 11 months ago
Note that FPU selector handling as well as MXCSR mask saving for now
does not honor differences between host and guest visible featuresets.

While for Intel the operation of these insns with CR4.OSFXSR=0 is
implementation dependent, use the easiest solution there: Simply don't
look at the bit in the first place. For AMD and the like the behavior
is well defined, so it gets handled together with FFXSR.
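
For reference, the amount of image data thus copied (see the op_bytes
logic below) comes out as:

    sizeof(struct x86_fxsr)           = 512 (64-bit, full image)
    offsetof(struct x86_fxsr, xmm[8]) = 288 (32-bit, 8 XMM registers)
    offsetof(struct x86_fxsr, xmm[0]) = 160 (no XMM state at all)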

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v9: Change a few field types in struct x86_fxsr. Leave reserved fields
    either entirely unnamed, or named "rsvd". Set state->fpu_ctrl. Avoid
    memory clobbers. Add memset() to FXSAVE logic. Add comments.
v8: Respect EFER.FFXSE and CR4.OSFXSR. Correct wrong X86EMUL_NO_*
    dependencies. Reduce #ifdef-ary.
v7: New.

--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -767,6 +767,12 @@ static void zap_fpsel(unsigned int *env,
     }
 }
 
+static void zap_xfpsel(unsigned int *env)
+{
+    env[3] &= ~0xffff;
+    env[5] &= ~0xffff;
+}
+
 #ifdef __x86_64__
 # define STKVAL_DISP 64
 static const struct {
@@ -2517,6 +2523,91 @@ int main(int argc, char **argv)
     else
         printf("skipped\n");
 
+    printf("%-40s", "Testing fxsave 4(%ecx)...");
+    if ( stack_exec && cpu_has_fxsr )
+    {
+        const uint16_t nine = 9;
+
+        memset(res + 0x80, 0xcc, 0x400);
+        if ( cpu_has_sse2 )
+            asm volatile ( "pcmpeqd %xmm7, %xmm7\n\t"
+                           "pxor %xmm6, %xmm6\n\t"
+                           "psubw %xmm7, %xmm6" );
+        asm volatile ( "fninit\n\t"
+                       "fld1\n\t"
+                       "fidivs %1\n\t"
+                       "fxsave %0"
+                       : "=m" (res[0x100]) : "m" (nine) : "memory" );
+        zap_xfpsel(&res[0x100]);
+        instr[0] = 0x0f; instr[1] = 0xae; instr[2] = 0x41; instr[3] = 0x04;
+        regs.eip = (unsigned long)&instr[0];
+        regs.ecx = (unsigned long)(res + 0x7f);
+        memset(res + 0x100 + 0x74, 0x33, 0x30);
+        memset(res + 0x80 + 0x74, 0x33, 0x30);
+        rc = x86_emulate(&ctxt, &emulops);
+        zap_xfpsel(&res[0x80]);
+        if ( (rc != X86EMUL_OKAY) ||
+             memcmp(res + 0x80, res + 0x100, 0x200) ||
+             (regs.eip != (unsigned long)&instr[4]) )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
+    printf("%-40s", "Testing fxrstor -4(%ecx)...");
+    if ( stack_exec && cpu_has_fxsr )
+    {
+        const uint16_t eleven = 11;
+
+        memset(res + 0x80, 0xcc, 0x400);
+        asm volatile ( "fxsave %0" : "=m" (res[0x80]) :: "memory" );
+        zap_xfpsel(&res[0x80]);
+        if ( cpu_has_sse2 )
+            asm volatile ( "pxor %xmm7, %xmm6\n\t"
+                           "pxor %xmm7, %xmm3\n\t"
+                           "pxor %xmm7, %xmm0\n\t"
+                           "pxor %xmm7, %xmm7" );
+        asm volatile ( "fninit\n\t"
+                       "fld1\n\t"
+                       "fidivs %0\n\t"
+                       :: "m" (eleven) );
+        instr[0] = 0x0f; instr[1] = 0xae; instr[2] = 0x49; instr[3] = 0xfc;
+        regs.eip = (unsigned long)&instr[0];
+        regs.ecx = (unsigned long)(res + 0x81);
+        rc = x86_emulate(&ctxt, &emulops);
+        asm volatile ( "fxsave %0" : "=m" (res[0x100]) :: "memory" );
+        if ( (rc != X86EMUL_OKAY) ||
+             memcmp(res + 0x100, res + 0x80, 0x200) ||
+             (regs.eip != (unsigned long)&instr[4]) )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
+#ifdef __x86_64__
+    printf("%-40s", "Testing fxsaveq 8(%edx)...");
+    if ( stack_exec && cpu_has_fxsr )
+    {
+        memset(res + 0x80, 0xcc, 0x400);
+        asm volatile ( "fxsaveq %0" : "=m" (res[0x100]) :: "memory" );
+        instr[0] = 0x48; instr[1] = 0x0f; instr[2] = 0xae; instr[3] = 0x42; instr[4] = 0x08;
+        regs.eip = (unsigned long)&instr[0];
+        regs.edx = (unsigned long)(res + 0x7e);
+        memset(res + 0x100 + 0x74, 0x33, 0x30);
+        memset(res + 0x80 + 0x74, 0x33, 0x30);
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( (rc != X86EMUL_OKAY) ||
+             memcmp(res + 0x80, res + 0x100, 0x200) ||
+             (regs.eip != (unsigned long)&instr[5]) )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+#endif
+
     printf("%-40s", "Testing movq %mm3,(%ecx)...");
     if ( stack_exec && cpu_has_mmx )
     {
--- a/tools/tests/x86_emulator/x86-emulate.c
+++ b/tools/tests/x86_emulator/x86-emulate.c
@@ -36,6 +36,13 @@ struct cpuid_policy cp;
 static char fpu_save_area[4096] __attribute__((__aligned__((64))));
 static bool use_xsave;
 
+/*
+ * Re-use the area above also as scratch space for the emulator itself.
+ * (When debugging the emulator, care needs to be taken when inserting
+ * printf() or similar function calls into regions using this.)
+ */
+#define FXSAVE_AREA ((struct x86_fxsr *)fpu_save_area)
+
 void emul_save_fpu_state(void)
 {
     if ( use_xsave )
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -860,6 +860,11 @@ struct x86_emulate_state {
         blk_fld, /* FLDENV, FRSTOR */
         blk_fst, /* FNSTENV, FNSAVE */
 #endif
+#if !defined(X86EMUL_NO_FPU) || !defined(X86EMUL_NO_MMX) || \
+    !defined(X86EMUL_NO_SIMD)
+        blk_fxrstor,
+        blk_fxsave,
+#endif
         blk_movdir,
     } blk;
     uint8_t modrm, modrm_mod, modrm_reg, modrm_rm;
@@ -953,6 +958,29 @@ typedef union {
     uint32_t data32[16];
 } mmval_t;
 
+struct x86_fxsr {
+    uint16_t fcw;
+    uint16_t fsw;
+    uint8_t ftw, :8;
+    uint16_t fop;
+    union {
+        struct {
+            uint32_t offs;
+            uint16_t sel, :16;
+        };
+        uint64_t addr;
+    } fip, fdp;
+    uint32_t mxcsr;
+    uint32_t mxcsr_mask;
+    struct {
+        uint8_t data[10];
+        uint16_t :16, :16, :16;
+    } fpreg[8];
+    uint64_t __attribute__ ((aligned(16))) xmm[16][2];
+    uint64_t rsvd[6];
+    uint64_t avl[6];
+};
+
 /*
  * While proper alignment gets specified above, this doesn't get honored by
  * the compiler for automatic variables. Use this helper to instantiate a
@@ -1910,6 +1938,7 @@ amd_like(const struct x86_emulate_ctxt *
 #define vcpu_has_cmov()        (ctxt->cpuid->basic.cmov)
 #define vcpu_has_clflush()     (ctxt->cpuid->basic.clflush)
 #define vcpu_has_mmx()         (ctxt->cpuid->basic.mmx)
+#define vcpu_has_fxsr()        (ctxt->cpuid->basic.fxsr)
 #define vcpu_has_sse()         (ctxt->cpuid->basic.sse)
 #define vcpu_has_sse2()        (ctxt->cpuid->basic.sse2)
 #define vcpu_has_sse3()        (ctxt->cpuid->basic.sse3)
@@ -8148,6 +8177,49 @@ x86_emulate(
     case X86EMUL_OPC(0x0f, 0xae): case X86EMUL_OPC_66(0x0f, 0xae): /* Grp15 */
         switch ( modrm_reg & 7 )
         {
+#if !defined(X86EMUL_NO_FPU) || !defined(X86EMUL_NO_MMX) || \
+    !defined(X86EMUL_NO_SIMD)
+        case 0: /* fxsave */
+        case 1: /* fxrstor */
+            generate_exception_if(vex.pfx, EXC_UD);
+            vcpu_must_have(fxsr);
+            generate_exception_if(ea.type != OP_MEM, EXC_UD);
+            generate_exception_if(!is_aligned(ea.mem.seg, ea.mem.off, 16,
+                                              ctxt, ops),
+                                  EXC_GP, 0);
+            fail_if(!ops->blk);
+            op_bytes =
+#ifdef __x86_64__
+                !mode_64bit() ? offsetof(struct x86_fxsr, xmm[8]) :
+#endif
+                sizeof(struct x86_fxsr);
+            if ( amd_like(ctxt) )
+            {
+                /* Assume "normal" operation in case of missing hooks. */
+                if ( !ops->read_cr ||
+                     ops->read_cr(4, &cr4, ctxt) != X86EMUL_OKAY )
+                    cr4 = X86_CR4_OSFXSR;
+                if ( !ops->read_msr ||
+                     ops->read_msr(MSR_EFER, &msr_val, ctxt) != X86EMUL_OKAY )
+                    msr_val = 0;
+                if ( !(cr4 & X86_CR4_OSFXSR) ||
+                     (mode_64bit() && mode_ring0() && (msr_val & EFER_FFXSE)) )
+                    op_bytes = offsetof(struct x86_fxsr, xmm[0]);
+            }
+            /*
+             * This could also be X86EMUL_FPU_mmx, but it shouldn't be
+             * X86EMUL_FPU_xmm, as we don't want CR4.OSFXSR checked.
+             */
+            get_fpu(X86EMUL_FPU_fpu);
+            state->fpu_ctrl = true;
+            state->blk = modrm_reg & 1 ? blk_fxrstor : blk_fxsave;
+            if ( (rc = ops->blk(ea.mem.seg, ea.mem.off, NULL,
+                                sizeof(struct x86_fxsr), &_regs.eflags,
+                                state, ctxt)) != X86EMUL_OKAY )
+                goto done;
+            break;
+#endif /* X86EMUL_NO_{FPU,MMX,SIMD} */
+
 #ifndef X86EMUL_NO_SIMD
         case 2: /* ldmxcsr */
             generate_exception_if(vex.pfx, EXC_UD);
@@ -11634,6 +11706,8 @@ int x86_emul_blk(
     struct x86_emulate_state *state,
     struct x86_emulate_ctxt *ctxt)
 {
+    int rc = X86EMUL_OKAY;
+
     switch ( state->blk )
     {
         bool zf;
@@ -11852,6 +11926,86 @@ int x86_emul_blk(
 
 #endif /* X86EMUL_NO_FPU */
 
+#if !defined(X86EMUL_NO_FPU) || !defined(X86EMUL_NO_MMX) || \
+    !defined(X86EMUL_NO_SIMD)
+
+    case blk_fxrstor:
+    {
+        struct x86_fxsr *fxsr = FXSAVE_AREA;
+
+        ASSERT(!data);
+        ASSERT(bytes == sizeof(*fxsr));
+        ASSERT(state->op_bytes <= bytes);
+
+        if ( state->op_bytes < sizeof(*fxsr) )
+        {
+            if ( state->rex_prefix & REX_W )
+            {
+                /*
+                 * The only way to force fxsaveq on a wide range of gas
+                 * versions. On older versions the rex64 prefix works only if
+                 * we force an addressing mode that doesn't require extended
+                 * registers.
+                 */
+                asm volatile ( ".byte 0x48; fxsave (%1)"
+                               : "=m" (*fxsr) : "R" (fxsr) );
+            }
+            else
+                asm volatile ( "fxsave %0" : "=m" (*fxsr) );
+        }
+
+        /*
+         * Don't chance the reserved or available ranges to contain any
+         * data FXRSTOR may actually consume in some way: Copy only the
+         * defined portion, and zero the rest.
+         */
+        memcpy(fxsr, ptr, min(state->op_bytes,
+                              (unsigned int)offsetof(struct x86_fxsr, rsvd)));
+        memset(fxsr->rsvd, 0, sizeof(*fxsr) - offsetof(struct x86_fxsr, rsvd));
+
+        generate_exception_if(fxsr->mxcsr & ~mxcsr_mask, EXC_GP, 0);
+
+        if ( state->rex_prefix & REX_W )
+        {
+            /* See above for why operand/constraints are this way. */
+            asm volatile ( ".byte 0x48; fxrstor (%1)"
+                           :: "m" (*fxsr), "R" (fxsr) );
+        }
+        else
+            asm volatile ( "fxrstor %0" :: "m" (*fxsr) );
+        break;
+    }
+
+    case blk_fxsave:
+    {
+        struct x86_fxsr *fxsr = FXSAVE_AREA;
+
+        ASSERT(!data);
+        ASSERT(bytes == sizeof(*fxsr));
+        ASSERT(state->op_bytes <= bytes);
+
+        if ( state->op_bytes < sizeof(*fxsr) )
+            /* Don't chance consuming uninitialized data. */
+            memset(fxsr, 0, state->op_bytes);
+        else
+            fxsr = ptr;
+
+        if ( state->rex_prefix & REX_W )
+        {
+            /* See above for why operand/constraints are this way. */
+            asm volatile ( ".byte 0x48; fxsave (%1)"
+                           : "=m" (*fxsr) : "R" (fxsr) );
+        }
+        else
+            asm volatile ( "fxsave %0" : "=m" (*fxsr) );
+
+        if ( fxsr != ptr ) /* i.e. state->op_bytes < sizeof(*fxsr) */
+            memcpy(ptr, fxsr, state->op_bytes);
+        break;
+    }
+
+#endif /* X86EMUL_NO_{FPU,MMX,SIMD} */
+
     case blk_movdir:
         switch ( bytes )
         {
@@ -11905,7 +12059,8 @@ int x86_emul_blk(
         return X86EMUL_UNHANDLEABLE;
     }
 
-    return X86EMUL_OKAY;
+ done:
+    return rc;
 }
 
 static void __init __maybe_unused build_assertions(void)
--- a/xen/arch/x86/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate.c
@@ -43,6 +43,8 @@
     }                                                      \
 })
 
+#define FXSAVE_AREA current->arch.fpu_ctxt
+
 #ifndef CONFIG_HVM
 # define X86EMUL_NO_FPU
 # define X86EMUL_NO_MMX


Re: [PATCH v10 9/9] x86emul: support FXSAVE/FXRSTOR
Posted by Andrew Cooper 3 years, 11 months ago
On 25/05/2020 15:30, Jan Beulich wrote:
> Note that FPU selector handling as well as MXCSR mask saving for now
> does not honor differences between host and guest visible featuresets.
>
> While for Intel the operation of these insns with CR4.OSFXSR=0 is
> implementation dependent, use the easiest solution there: Simply don't
> look at the bit in the first place. For AMD and the like the behavior
> is well defined, so it gets handled together with FFXSR.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>