[Xen-devel] [PATCH 0/7] x86emul: (mainly) vendor specific behavior adjustments

Jan Beulich posted 7 patches 2 weeks ago
Only 0 patches received!

[Xen-devel] [PATCH 0/7] x86emul: (mainly) vendor specific behavior adjustments

Posted by Jan Beulich 2 weeks ago
There are quite a few more vendor differences than we currently support,
in particular in 64-bit mode. Now that I've made some progress on the
binutils side I felt more confident in making the changes here as well.

1: add wrappers to check for AMD-like behavior
2: vendor specific near RET behavior in 64-bit mode
3: vendor specific direct branch behavior in 64-bit mode
4: vendor specific near indirect branch behavior in 64-bit mode
5: vendor specific SYSENTER/SYSEXIT behavior in long mode
6: vendor specific SYSCALL behavior
7: support SYSRET

Jan

[Xen-devel] [PATCH 1/7] x86emul: add wrappers to check for AMD-like behavior

Posted by Jan Beulich 2 weeks ago
These are to aid readability at their use sites, in particular because
we're going to gain more of them.

Suggested-by: Andrew Cooper <andrew.cooper3@citrix.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -1836,6 +1836,18 @@ in_protmode(
     return !(in_realmode(ctxt, ops) || (ctxt->regs->eflags & X86_EFLAGS_VM));
 }
 
+static bool
+_amd_like(const struct cpuid_policy *cp)
+{
+    return cp->x86_vendor & (X86_VENDOR_AMD | X86_VENDOR_HYGON);
+}
+
+static bool
+amd_like(const struct x86_emulate_ctxt *ctxt)
+{
+    return _amd_like(ctxt->cpuid);
+}
+
 #define vcpu_has_fpu()         (ctxt->cpuid->basic.fpu)
 #define vcpu_has_sep()         (ctxt->cpuid->basic.sep)
 #define vcpu_has_cx8()         (ctxt->cpuid->basic.cx8)
@@ -1995,7 +2007,7 @@ protmode_load_seg(
         case x86_seg_tr:
             goto raise_exn;
         }
-        if ( !(cp->x86_vendor & (X86_VENDOR_AMD | X86_VENDOR_HYGON)) ||
+        if ( !_amd_like(cp) ||
              !ops->read_segment ||
              ops->read_segment(seg, sreg, ctxt) != X86EMUL_OKAY )
             memset(sreg, 0, sizeof(*sreg));
@@ -2122,9 +2134,7 @@ protmode_load_seg(
          *   - all 16 bytes read with the high 8 bytes ignored on AMD.
          */
         bool wide = desc.b & 0x1000
-                    ? false : (desc.b & 0xf00) != 0xc00 &&
-                               !(cp->x86_vendor &
-                                 (X86_VENDOR_AMD | X86_VENDOR_HYGON))
+                    ? false : (desc.b & 0xf00) != 0xc00 && !_amd_like(cp)
                                ? mode_64bit() : ctxt->lma;
 
         if ( wide )
@@ -2142,9 +2152,7 @@ protmode_load_seg(
             default:
                 return rc;
             }
-            if ( !mode_64bit() &&
-                 (cp->x86_vendor & (X86_VENDOR_AMD | X86_VENDOR_HYGON)) &&
-                 (desc.b & 0xf00) != 0xc00 )
+            if ( !mode_64bit() && _amd_like(cp) && (desc.b & 0xf00) != 0xc00 )
                 desc_hi.b = desc_hi.a = 0;
             if ( (desc_hi.b & 0x00001f00) ||
                  (seg != x86_seg_none &&
@@ -2525,9 +2533,7 @@ x86_decode_onebyte(
         case 3: /* call (far, absolute indirect) */
         case 5: /* jmp (far, absolute indirect) */
             /* REX.W ignored on a vendor-dependent basis. */
-            if ( op_bytes == 8 &&
-                 (ctxt->cpuid->x86_vendor &
-                  (X86_VENDOR_AMD | X86_VENDOR_HYGON)) )
+            if ( op_bytes == 8 && amd_like(ctxt) )
                 op_bytes = 4;
             state->desc = DstNone | SrcMem | Mov;
             break;
@@ -2651,8 +2657,7 @@ x86_decode_twobyte(
     case 0xb4: /* lfs */
     case 0xb5: /* lgs */
         /* REX.W ignored on a vendor-dependent basis. */
-        if ( op_bytes == 8 &&
-             (ctxt->cpuid->x86_vendor & (X86_VENDOR_AMD | X86_VENDOR_HYGON)) )
+        if ( op_bytes == 8 && amd_like(ctxt) )
             op_bytes = 4;
         break;
 
@@ -4068,9 +4073,7 @@ x86_emulate(
             if ( ea.type == OP_REG )
                 src.val = *ea.reg;
             else if ( (rc = read_ulong(ea.mem.seg, ea.mem.off, &src.val,
-                                       (op_bytes == 2 &&
-                                        !(ctxt->cpuid->x86_vendor &
-                                          (X86_VENDOR_AMD | X86_VENDOR_HYGON))
+                                       (op_bytes == 2 && !amd_like(ctxt)
                                         ? 2 : 4),
                                        ctxt, ops)) )
                 goto done;


Re: [Xen-devel] [PATCH 1/7] x86emul: add wrappers to check for AMD-like behavior

Posted by Andrew Cooper 2 weeks ago
On 24/03/2020 16:26, Jan Beulich wrote:
> @@ -1995,7 +2007,7 @@ protmode_load_seg(
>          case x86_seg_tr:
>              goto raise_exn;
>          }
> -        if ( !(cp->x86_vendor & (X86_VENDOR_AMD | X86_VENDOR_HYGON)) ||
> +        if ( !_amd_like(cp) ||
>               !ops->read_segment ||

Fold these two lines?

Either way, but preferably with, Reviewed-by: Andrew Cooper
<andrew.cooper3@citrix.com>

[Xen-devel] [PATCH 2/7] x86emul: vendor specific near RET behavior in 64-bit mode

Posted by Jan Beulich 2 weeks ago
Intel CPUs ignore operand size overrides here, while AMD ones don't.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -733,6 +733,34 @@ static struct x86_emulate_ops emulops =
 #define EFLAGS_ALWAYS_SET (X86_EFLAGS_IF | X86_EFLAGS_MBS)
 #define EFLAGS_MASK (X86_EFLAGS_ARITH_MASK | EFLAGS_ALWAYS_SET)
 
+#define MMAP_ADDR 0x100000
+
+#ifdef __x86_64__
+# define STKVAL_DISP 64
+static const struct {
+    const char *descr;
+    uint8_t opcode[8];
+    /* Index 0: AMD, index 1: Intel. */
+    uint8_t opc_len[2];
+    int8_t stkoff[2];
+    int32_t disp[2];
+} vendor_tests[] = {
+    {
+        .descr = "retw",
+        .opcode = { 0x66, 0xc3 },
+        .opc_len = { 2, 2 },
+        .stkoff = { 2, 8 },
+        .disp = { STKVAL_DISP - MMAP_ADDR, STKVAL_DISP },
+    }, {
+        .descr = "retw $16",
+        .opcode = { 0x66, 0xc2, 0x10, 0x00 },
+        .opc_len = { 4, 4 },
+        .stkoff = { 2 + 16, 8 + 16 },
+        .disp = { STKVAL_DISP - MMAP_ADDR, STKVAL_DISP },
+    },
+};
+#endif
+
 int main(int argc, char **argv)
 {
     struct x86_emulate_ctxt ctxt;
@@ -741,7 +769,9 @@ int main(int argc, char **argv)
     unsigned int *res, i, j;
     bool stack_exec;
     int rc;
-#ifndef __x86_64__
+#ifdef __x86_64__
+    unsigned int vendor_native;
+#else
     unsigned int bcdres_native, bcdres_emul;
 #endif
 
@@ -755,7 +785,7 @@ int main(int argc, char **argv)
     ctxt.addr_size = 8 * sizeof(void *);
     ctxt.sp_size   = 8 * sizeof(void *);
 
-    res = mmap((void *)0x100000, MMAP_SZ, PROT_READ|PROT_WRITE|PROT_EXEC,
+    res = mmap((void *)MMAP_ADDR, MMAP_SZ, PROT_READ|PROT_WRITE|PROT_EXEC,
                MAP_FIXED|MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
     if ( res == MAP_FAILED )
     {
@@ -1323,7 +1353,41 @@ int main(int argc, char **argv)
          (regs.eip != (unsigned long)&instr[3]) )
         goto fail;
     printf("okay\n");
-#endif
+
+    vendor_native = cp.x86_vendor;
+    for ( cp.x86_vendor = X86_VENDOR_AMD; ; )
+    {
+        unsigned int v = cp.x86_vendor == X86_VENDOR_INTEL;
+        const char *vendor = cp.x86_vendor == X86_VENDOR_INTEL ? "Intel" : "AMD";
+        uint64_t *stk = (void *)res + MMAP_SZ - 16;
+
+        for ( i = 0; i < ARRAY_SIZE(vendor_tests); ++i )
+        {
+            printf("%-*s",
+                   40 - printf("Testing %s [%s]", vendor_tests[i].descr, vendor),
+                   "...");
+            memcpy(instr, vendor_tests[i].opcode, vendor_tests[i].opc_len[v]);
+            regs.eflags = EFLAGS_ALWAYS_SET;
+            regs.rip    = (unsigned long)instr;
+            regs.rsp    = (unsigned long)stk;
+            stk[0]      = regs.rip + STKVAL_DISP;
+            rc = x86_emulate(&ctxt, &emulops);
+            if ( (rc != X86EMUL_OKAY) ||
+                 (regs.eflags != EFLAGS_ALWAYS_SET) ||
+                 (regs.rip != (unsigned long)instr +
+                              (vendor_tests[i].disp[v]
+                               ?: vendor_tests[i].opc_len[v])) ||
+                 (regs.rsp != (unsigned long)stk + vendor_tests[i].stkoff[v]) )
+                goto fail;
+            printf("okay\n");
+        }
+
+        if ( cp.x86_vendor == X86_VENDOR_INTEL )
+            break;
+        cp.x86_vendor = X86_VENDOR_INTEL;
+    }
+    cp.x86_vendor = vendor_native;
+#endif /* x86-64 */
 
     printf("%-40s", "Testing shld $1,%ecx,(%edx)...");
     res[0]      = 0x12345678;
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -4611,7 +4611,8 @@ x86_emulate(
 
     case 0xc2: /* ret imm16 (near) */
     case 0xc3: /* ret (near) */
-        op_bytes = ((op_bytes == 4) && mode_64bit()) ? 8 : op_bytes;
+        op_bytes = (op_bytes == 4 || !amd_like(ctxt)) && mode_64bit()
+                   ? 8 : op_bytes;
         if ( (rc = read_ulong(x86_seg_ss, sp_post_inc(op_bytes + src.val),
                               &dst.val, op_bytes, ctxt, ops)) != 0 ||
              (rc = ops->insn_fetch(x86_seg_cs, dst.val, NULL, 0, ctxt)) )


Re: [Xen-devel] [PATCH 2/7] x86emul: vendor specific near RET behavior in 64-bit mode

Posted by Andrew Cooper 2 weeks ago
On 24/03/2020 16:26, Jan Beulich wrote:
> Intel CPUs ignore operand size overrides here, while AMD ones don't.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>

[Xen-devel] [PATCH 3/7] x86emul: vendor specific direct branch behavior in 64-bit mode

Posted by Jan Beulich 2 weeks ago
Intel CPUs ignore operand size overrides here, while AMD ones don't.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -757,6 +757,62 @@ static const struct {
         .opc_len = { 4, 4 },
         .stkoff = { 2 + 16, 8 + 16 },
         .disp = { STKVAL_DISP - MMAP_ADDR, STKVAL_DISP },
+    }, {
+        .descr = "jmpw .+16",
+        .opcode = { 0x66, 0xeb, 0x10 },
+        .opc_len = { 3, 3 },
+        .disp = { 3 + 16 - MMAP_ADDR, 3 + 16 },
+    }, {
+        .descr = "jmpw .+128",
+        .opcode = { 0x66, 0xe9, 0x80, 0x00, 0x00, 0x00 },
+        .opc_len = { 4, 6 },
+        .disp = { 4 + 128 - MMAP_ADDR, 6 + 128 },
+    }, {
+        .descr = "callw .+16",
+        .opcode = { 0x66, 0xe8, 0x10, 0x00, 0x00, 0x00 },
+        .opc_len = { 4, 6 },
+        .stkoff = { -2, -8 },
+        .disp = { 4 + 16 - MMAP_ADDR, 6 + 16 },
+    }, {
+        .descr = "jzw .+16",
+        .opcode = { 0x66, 0x74, 0x10 },
+        .opc_len = { 3, 3 },
+        .disp = { 3, 3 },
+    }, {
+        .descr = "jzw .+128",
+        .opcode = { 0x66, 0x0f, 0x84, 0x80, 0x00, 0x00, 0x00 },
+        .opc_len = { 5, 7 },
+        .disp = { 5, 7 },
+    }, {
+        .descr = "jnzw .+16",
+        .opcode = { 0x66, 0x75, 0x10 },
+        .opc_len = { 3, 3 },
+        .disp = { 3 + 16 - MMAP_ADDR, 3 + 16 },
+    }, {
+        .descr = "jnzw .+128",
+        .opcode = { 0x66, 0x0f, 0x85, 0x80, 0x00, 0x00, 0x00 },
+        .opc_len = { 5, 7 },
+        .disp = { 5 + 128 - MMAP_ADDR, 7 + 128 },
+    }, {
+        .descr = "loopqw .+16 (RCX>1)",
+        .opcode = { 0x66, 0xe0, 0x10 },
+        .opc_len = { 3, 3 },
+        .disp = { 3 + 16 - MMAP_ADDR, 3 + 16 },
+    }, {
+        .descr = "looplw .+16 (ECX=1)",
+        .opcode = { 0x66, 0x67, 0xe0, 0x10 },
+        .opc_len = { 4, 4 },
+        .disp = { 4, 4 },
+    }, {
+        .descr = "jrcxzw .+16 (RCX>0)",
+        .opcode = { 0x66, 0xe3, 0x10 },
+        .opc_len = { 3, 3 },
+        .disp = { 3, 3 },
+    }, {
+        .descr = "jecxzw .+16 (ECX=0)",
+        .opcode = { 0x66, 0x67, 0xe3, 0x10 },
+        .opc_len = { 4, 4 },
+        .disp = { 4 + 16 - MMAP_ADDR, 4 + 16 },
     },
 };
 #endif
@@ -1361,6 +1417,7 @@ int main(int argc, char **argv)
         const char *vendor = cp.x86_vendor == X86_VENDOR_INTEL ? "Intel" : "AMD";
         uint64_t *stk = (void *)res + MMAP_SZ - 16;
 
+        regs.rcx = 2;
         for ( i = 0; i < ARRAY_SIZE(vendor_tests); ++i )
         {
             printf("%-*s",
@@ -1370,6 +1427,7 @@ int main(int argc, char **argv)
             regs.eflags = EFLAGS_ALWAYS_SET;
             regs.rip    = (unsigned long)instr;
             regs.rsp    = (unsigned long)stk;
+            regs.rcx   |= 0x8765432100000000UL;
             stk[0]      = regs.rip + STKVAL_DISP;
             rc = x86_emulate(&ctxt, &emulops);
             if ( (rc != X86EMUL_OKAY) ||
@@ -1379,6 +1437,16 @@ int main(int argc, char **argv)
                                ?: vendor_tests[i].opc_len[v])) ||
                  (regs.rsp != (unsigned long)stk + vendor_tests[i].stkoff[v]) )
                 goto fail;
+            /* For now only call insns push something onto the stack. */
+            if ( regs.rsp < (unsigned long)stk )
+            {
+                unsigned long opc_end = (unsigned long)instr +
+                                        vendor_tests[i].opc_len[v];
+
+                if ( memcmp(&opc_end, (void *)regs.rsp,
+                            min((unsigned long)stk - regs.rsp, 8UL)) )
+                    goto fail;
+            }
             printf("okay\n");
         }
 
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -1273,7 +1273,7 @@ do {
 #define jmp_rel(rel)                                                    \
 do {                                                                    \
     unsigned long ip = _regs.r(ip) + (int)(rel);                        \
-    if ( op_bytes == 2 )                                                \
+    if ( op_bytes == 2 && (amd_like(ctxt) || !mode_64bit()) )           \
         ip = (uint16_t)ip;                                              \
     else if ( !mode_64bit() )                                           \
         ip = (uint32_t)ip;                                              \
@@ -3392,7 +3392,13 @@ x86_decode(
 
     case SrcImm:
         if ( !(d & ByteOp) )
+        {
+            if ( mode_64bit() && !amd_like(ctxt) &&
+                 ((ext == ext_none && (b | 1) == 0xe9) /* call / jmp */ ||
+                  (ext == ext_0f && (b | 0xf) == 0x8f) /* jcc */ ) )
+                op_bytes = 4;
             bytes = op_bytes != 8 ? op_bytes : 4;
+        }
         else
         {
     case SrcImmByte:

Re: [Xen-devel] [PATCH 3/7] x86emul: vendor specific direct branch behavior in 64-bit mode

Posted by Andrew Cooper 2 weeks ago
On 24/03/2020 16:27, Jan Beulich wrote:
> Intel CPUs ignore operand size overrides here, while AMD ones don't.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

[Xen-devel] [PATCH 4/7] x86emul: vendor specific near indirect branch behavior in 64-bit mode

Posted by Jan Beulich 2 weeks ago
Intel CPUs ignore operand size overrides here, while AMD ones don't.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -813,6 +813,17 @@ static const struct {
         .opcode = { 0x66, 0x67, 0xe3, 0x10 },
         .opc_len = { 4, 4 },
         .disp = { 4 + 16 - MMAP_ADDR, 4 + 16 },
+    }, {
+        .descr = "jmpw *(%rsp)",
+        .opcode = { 0x66, 0xff, 0x24, 0x24 },
+        .opc_len = { 4, 4 },
+        .disp = { STKVAL_DISP - MMAP_ADDR, STKVAL_DISP },
+    }, {
+        .descr = "callw *(%rsp)",
+        .opcode = { 0x66, 0xff, 0x14, 0x24 },
+        .opc_len = { 4, 4 },
+        .stkoff = { -2, -8 },
+        .disp = { STKVAL_DISP - MMAP_ADDR, STKVAL_DISP },
     },
 };
 #endif
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -2524,8 +2524,7 @@ x86_decode_onebyte(
         {
         case 2: /* call (near) */
         case 4: /* jmp (near) */
-        case 6: /* push */
-            if ( mode_64bit() && op_bytes == 4 )
+            if ( mode_64bit() && (op_bytes == 4 || !amd_like(ctxt)) )
                 op_bytes = 8;
             state->desc = DstNone | SrcMem | Mov;
             break;
@@ -2537,6 +2536,12 @@ x86_decode_onebyte(
                 op_bytes = 4;
             state->desc = DstNone | SrcMem | Mov;
             break;
+
+        case 6: /* push */
+            if ( mode_64bit() && op_bytes == 4 )
+                op_bytes = 8;
+            state->desc = DstNone | SrcMem | Mov;
+            break;
         }
         break;
     }


Re: [Xen-devel] [PATCH 4/7] x86emul: vendor specific near indirect branch behavior in 64-bit mode

Posted by Andrew Cooper 2 weeks ago
On 24/03/2020 16:27, Jan Beulich wrote:
> Intel CPUs ignore operand size overrides here, while AMD ones don't.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

[Xen-devel] [PATCH 5/7] x86emul: vendor specific SYSENTER/SYSEXIT behavior in long mode

Posted by Jan Beulich 2 weeks ago
Intel CPUs permit both insns there while AMD ones don't.

While at it also
- drop the ring 0 check from SYSENTER handling - neither Intel's nor
  AMD's insn pages have any indication of #GP(0) getting raised when
  executed from ring 0, and trying it out in practice also confirms
  the check shouldn't be there,
- move SYSENTER segment register writing until after the (in principle
  able to fail) MSR reads.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -6588,7 +6588,7 @@ x86_emulate(
 
     case X86EMUL_OPC(0x0f, 0x34): /* sysenter */
         vcpu_must_have(sep);
-        generate_exception_if(mode_ring0(), EXC_GP, 0);
+        generate_exception_if(amd_like(ctxt) && ctxt->lma, EXC_UD);
         generate_exception_if(!in_protmode(ctxt, ops), EXC_GP, 0);
 
         fail_if(ops->read_msr == NULL);
@@ -6611,11 +6611,6 @@ x86_emulate(
         sreg.limit = ~0u;  /* 4GB limit */
         sreg.attr = 0xc93; /* G+DB+P+S+Data */
 
-        fail_if(ops->write_segment == NULL);
-        if ( (rc = ops->write_segment(x86_seg_cs, &cs, ctxt)) != 0 ||
-             (rc = ops->write_segment(x86_seg_ss, &sreg, ctxt)) != 0 )
-            goto done;
-
         if ( (rc = ops->read_msr(MSR_IA32_SYSENTER_EIP,
                                  &msr_val, ctxt)) != X86EMUL_OKAY )
             goto done;
@@ -6626,11 +6621,19 @@ x86_emulate(
             goto done;
         _regs.r(sp) = ctxt->lma ? msr_val : (uint32_t)msr_val;
 
+        fail_if(!ops->write_segment);
+        if ( (rc = ops->write_segment(x86_seg_cs, &cs,
+                                      ctxt)) != X86EMUL_OKAY ||
+             (rc = ops->write_segment(x86_seg_ss, &sreg,
+                                      ctxt)) != X86EMUL_OKAY )
+            goto done;
+
         singlestep = _regs.eflags & X86_EFLAGS_TF;
         break;
 
     case X86EMUL_OPC(0x0f, 0x35): /* sysexit */
         vcpu_must_have(sep);
+        generate_exception_if(amd_like(ctxt) && ctxt->lma, EXC_UD);
         generate_exception_if(!mode_ring0(), EXC_GP, 0);
         generate_exception_if(!in_protmode(ctxt, ops), EXC_GP, 0);
 

Re: [Xen-devel] [PATCH 5/7] x86emul: vendor specific SYSENTER/SYSEXIT behavior in long mode

Posted by Andrew Cooper 2 weeks ago
On 24/03/2020 16:28, Jan Beulich wrote:
> Intel CPUs permit both insns there while AMD ones don't.
>
> While at it also
> - drop the ring 0 check from SYSENTER handling - neither Intel's nor
>   AMD's insn pages have any indication of #GP(0) getting raised when
>   executed from ring 0, and trying it out in practice also confirms
>   the check shouldn't be there,
> - move SYSENTER segment register writing until after the (in principle
>   able to fail) MSR reads.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

[Xen-devel] [PATCH 6/7] x86emul: vendor specific SYSCALL behavior

Posted by Jan Beulich 2 weeks ago
AMD CPUs permit the insn everywhere (even outside of protected mode),
while Intel ones restrict it to 64-bit mode. While at it also add the
so far missing CPUID bit check.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -1870,6 +1870,7 @@ amd_like(const struct x86_emulate_ctxt *
 #define vcpu_has_f16c()        (ctxt->cpuid->basic.f16c)
 #define vcpu_has_rdrand()      (ctxt->cpuid->basic.rdrand)
 
+#define vcpu_has_syscall()     (ctxt->cpuid->extd.syscall)
 #define vcpu_has_mmxext()      (ctxt->cpuid->extd.mmxext || vcpu_has_sse())
 #define vcpu_has_3dnow_ext()   (ctxt->cpuid->extd._3dnowext)
 #define vcpu_has_3dnow()       (ctxt->cpuid->extd._3dnow)
@@ -5897,13 +5898,13 @@ x86_emulate(
         break;
 
     case X86EMUL_OPC(0x0f, 0x05): /* syscall */
-        generate_exception_if(!in_protmode(ctxt, ops), EXC_UD);
-
+        vcpu_must_have(syscall);
         /* Inject #UD if syscall/sysret are disabled. */
         fail_if(ops->read_msr == NULL);
         if ( (rc = ops->read_msr(MSR_EFER, &msr_val, ctxt)) != X86EMUL_OKAY )
             goto done;
         generate_exception_if((msr_val & EFER_SCE) == 0, EXC_UD);
+        generate_exception_if(!amd_like(ctxt) && !mode_64bit(), EXC_UD);
 
         if ( (rc = ops->read_msr(MSR_STAR, &msr_val, ctxt)) != X86EMUL_OKAY )
             goto done;


Re: [Xen-devel] [PATCH 6/7] x86emul: vendor specific SYSCALL behavior

Posted by Andrew Cooper 2 weeks ago
On 24/03/2020 16:28, Jan Beulich wrote:
> AMD CPUs permit the insn everywhere (even outside of protected mode),
> while Intel ones restrict it to 64-bit mode. While at it also add the
> so far missing CPUID bit check.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>
> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
> @@ -1870,6 +1870,7 @@ amd_like(const struct x86_emulate_ctxt *
>  #define vcpu_has_f16c()        (ctxt->cpuid->basic.f16c)
>  #define vcpu_has_rdrand()      (ctxt->cpuid->basic.rdrand)
>  
> +#define vcpu_has_syscall()     (ctxt->cpuid->extd.syscall)
>  #define vcpu_has_mmxext()      (ctxt->cpuid->extd.mmxext || vcpu_has_sse())
>  #define vcpu_has_3dnow_ext()   (ctxt->cpuid->extd._3dnowext)
>  #define vcpu_has_3dnow()       (ctxt->cpuid->extd._3dnow)
> @@ -5897,13 +5898,13 @@ x86_emulate(
>          break;
>  
>      case X86EMUL_OPC(0x0f, 0x05): /* syscall */
> -        generate_exception_if(!in_protmode(ctxt, ops), EXC_UD);
> -
> +        vcpu_must_have(syscall);
>          /* Inject #UD if syscall/sysret are disabled. */
>          fail_if(ops->read_msr == NULL);
>          if ( (rc = ops->read_msr(MSR_EFER, &msr_val, ctxt)) != X86EMUL_OKAY )
>              goto done;
>          generate_exception_if((msr_val & EFER_SCE) == 0, EXC_UD);

The CPUID check isn't actually missing, but it is fairly well hidden
here in the validity check to enable EFER.SCE in the first place.

In my (still incomplete and unposted) XSA-204 followup, I just commented
the fact here rather than introducing vcpu_must_have().

~Andrew

> +        generate_exception_if(!amd_like(ctxt) && !mode_64bit(), EXC_UD);
>  
>          if ( (rc = ops->read_msr(MSR_STAR, &msr_val, ctxt)) != X86EMUL_OKAY )
>              goto done;
>


[Xen-devel] [PATCH 7/7] x86emul: support SYSRET

Posted by Jan Beulich 2 weeks ago
This is to augment SYSCALL, which has been supported for quite some
time.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -5975,6 +5975,60 @@ x86_emulate(
             goto done;
         break;
 
+    case X86EMUL_OPC(0x0f, 0x07): /* sysret */
+        vcpu_must_have(syscall);
+        /* Inject #UD if syscall/sysret are disabled. */
+        fail_if(!ops->read_msr);
+        if ( (rc = ops->read_msr(MSR_EFER, &msr_val, ctxt)) != X86EMUL_OKAY )
+            goto done;
+        generate_exception_if((msr_val & EFER_SCE) == 0, EXC_UD);
+        generate_exception_if(!amd_like(ctxt) && !mode_64bit(), EXC_UD);
+        generate_exception_if(!mode_ring0(), EXC_GP, 0);
+        generate_exception_if(!in_protmode(ctxt, ops), EXC_GP, 0);
+
+        if ( (rc = ops->read_msr(MSR_STAR, &msr_val, ctxt)) != X86EMUL_OKAY )
+            goto done;
+
+        sreg.sel = ((msr_val >> 48) + 8) | 3; /* SELECTOR_RPL_MASK */
+        cs.sel = op_bytes == 8 ? sreg.sel + 8 : sreg.sel - 8;
+
+        cs.base = sreg.base = 0; /* flat segment */
+        cs.limit = sreg.limit = ~0u; /* 4GB limit */
+        cs.attr = 0xcfb; /* G+DB+P+DPL3+S+Code */
+        sreg.attr = 0xcf3; /* G+DB+P+DPL3+S+Data */
+
+#ifdef __x86_64__
+        if ( mode_64bit() )
+        {
+            if ( op_bytes == 8 )
+            {
+                cs.attr = 0xafb; /* L+DB+P+DPL3+S+Code */
+                generate_exception_if(!is_canonical_address(_regs.rcx) &&
+                                      !amd_like(ctxt), EXC_GP, 0);
+                _regs.rip = _regs.rcx;
+            }
+            else
+                _regs.rip = _regs.ecx;
+
+            _regs.eflags = _regs.r11 & ~(X86_EFLAGS_RF | X86_EFLAGS_VM);
+        }
+        else
+#endif
+        {
+            _regs.r(ip) = _regs.ecx;
+            _regs.eflags |= X86_EFLAGS_IF;
+        }
+
+        fail_if(!ops->write_segment);
+        if ( (rc = ops->write_segment(x86_seg_cs, &cs, ctxt)) != X86EMUL_OKAY ||
+             (!amd_like(ctxt) &&
+              (rc = ops->write_segment(x86_seg_ss, &sreg,
+                                       ctxt)) != X86EMUL_OKAY) )
+            goto done;
+
+        singlestep = _regs.eflags & X86_EFLAGS_TF;
+        break;
+
     case X86EMUL_OPC(0x0f, 0x08): /* invd */
     case X86EMUL_OPC(0x0f, 0x09): /* wbinvd / wbnoinvd */
         generate_exception_if(!mode_ring0(), EXC_GP, 0);


Re: [Xen-devel] [PATCH 7/7] x86emul: support SYSRET

Posted by Andrew Cooper 2 weeks ago
On 24/03/2020 16:29, Jan Beulich wrote:
> This is to augment SYSCALL, which has been supported for quite some
> time.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

I've compared this to the in-progress version I have in my XSA-204
follow-on series.  I'm afraid the behaviour has far more vendor specific
quirks than this.

>
> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
> @@ -5975,6 +5975,60 @@ x86_emulate(
>              goto done;
>          break;
>  
> +    case X86EMUL_OPC(0x0f, 0x07): /* sysret */
> +        vcpu_must_have(syscall);
> +        /* Inject #UD if syscall/sysret are disabled. */
> +        fail_if(!ops->read_msr);
> +        if ( (rc = ops->read_msr(MSR_EFER, &msr_val, ctxt)) != X86EMUL_OKAY )
> +            goto done;
> +        generate_exception_if((msr_val & EFER_SCE) == 0, EXC_UD);

(as with the SYSCALL side), no need for the vcpu_must_have(syscall) as
well as this check.

> +        generate_exception_if(!amd_like(ctxt) && !mode_64bit(), EXC_UD);
> +        generate_exception_if(!mode_ring0(), EXC_GP, 0);
> +        generate_exception_if(!in_protmode(ctxt, ops), EXC_GP, 0);
> +

The Intel SYSRET vulnerability checks regs->rcx for canonicity here, and
raises #GP here.

I see you've got it below, but this is where the Intel pseudocode puts
it, before MSR_STAR gets read, and logically it should be grouped with
the other exceptions.

> +        if ( (rc = ops->read_msr(MSR_STAR, &msr_val, ctxt)) != X86EMUL_OKAY )
> +            goto done;
> +        sreg.sel = ((msr_val >> 48) + 8) | 3; /* SELECTOR_RPL_MASK */

This would be the logical behaviour...

AMD CPUs |3 into %cs.sel, but don't make an equivalent adjustment for
%ss.sel, and simply take MSR_STAR.SYSRET_CS + 8.

If you aren't careful with MSR_STAR, SYSRET will return to userspace
with mismatching RPL/DPL and userspace can really find itself with an
%ss with an RPL of 0.  (Of course, when you take an interrupt and
attempt to IRET back to this context, things fall apart).

I discovered this entirely by accident in XTF, but it is confirmed by
careful reading of the AMD SYSRET pseudocode.

> +        cs.sel = op_bytes == 8 ? sreg.sel + 8 : sreg.sel - 8;
> +
> +        cs.base = sreg.base = 0; /* flat segment */
> +        cs.limit = sreg.limit = ~0u; /* 4GB limit */
> +        cs.attr = 0xcfb; /* G+DB+P+DPL3+S+Code */
> +        sreg.attr = 0xcf3; /* G+DB+P+DPL3+S+Data */

Again, that would be the logical behaviour...

AMD CPUs don't update anything but %ss.sel, and even comment the fact
in pseudocode now.

This was discovered by Andy Luto, where he found that taking an
interrupt (unconditionally sets %ss to NUL), and opportunistic sysret
back to 32bit userspace lets userspace see a sane %ss value, but with
the attrs still empty, and the stack unusable.

> +
> +#ifdef __x86_64__
> +        if ( mode_64bit() )
> +        {
> +            if ( op_bytes == 8 )
> +            {
> +                cs.attr = 0xafb; /* L+DB+P+DPL3+S+Code */
> +                generate_exception_if(!is_canonical_address(_regs.rcx) &&
> +                                      !amd_like(ctxt), EXC_GP, 0);

Wherever this ends up living, I think it needs calling out with a
comment /* CVE-xxx, Intel privilege escalation hole */, as it is a very
subtle piece of vendor specific behaviour.

Do we have a Centaur/other CPU to try with?  I'd err on the side of
going with == Intel rather than !AMD to avoid introducing known
vulnerabilities into models which stand half a chance of not being affected.

> +                _regs.rip = _regs.rcx;
> +            }
> +            else
> +                _regs.rip = _regs.ecx;
> +
> +            _regs.eflags = _regs.r11 & ~(X86_EFLAGS_RF | X86_EFLAGS_VM);
> +        }
> +        else
> +#endif
> +        {
> +            _regs.r(ip) = _regs.ecx;
> +            _regs.eflags |= X86_EFLAGS_IF;
> +        }
> +
> +        fail_if(!ops->write_segment);
> +        if ( (rc = ops->write_segment(x86_seg_cs, &cs, ctxt)) != X86EMUL_OKAY ||
> +             (!amd_like(ctxt) &&
> +              (rc = ops->write_segment(x86_seg_ss, &sreg,
> +                                       ctxt)) != X86EMUL_OKAY) )

Oh - here is the AMD behaviour with %ss, but it's not quite correct.

AFAICT, the correct behaviour is to read the old %ss on AMD-like, set
flat attributes on Intel, and write back normally, because %ss.sel does
get updated.

~Andrew

> +            goto done;
> +
> +        singlestep = _regs.eflags & X86_EFLAGS_TF;
> +        break;
> +
>      case X86EMUL_OPC(0x0f, 0x08): /* invd */
>      case X86EMUL_OPC(0x0f, 0x09): /* wbinvd / wbnoinvd */
>          generate_exception_if(!mode_ring0(), EXC_GP, 0);
>


Re: [Xen-devel] [PATCH 7/7] x86emul: support SYSRET

Posted by Jan Beulich 2 weeks ago
On 25.03.2020 11:00, Andrew Cooper wrote:
> On 24/03/2020 16:29, Jan Beulich wrote:
>> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
>> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
>> @@ -5975,6 +5975,60 @@ x86_emulate(
>>              goto done;
>>          break;
>>  
>> +    case X86EMUL_OPC(0x0f, 0x07): /* sysret */
>> +        vcpu_must_have(syscall);
>> +        /* Inject #UD if syscall/sysret are disabled. */
>> +        fail_if(!ops->read_msr);
>> +        if ( (rc = ops->read_msr(MSR_EFER, &msr_val, ctxt)) != X86EMUL_OKAY )
>> +            goto done;
>> +        generate_exception_if((msr_val & EFER_SCE) == 0, EXC_UD);
> 
> (as with the SYSCALL side), no need for the vcpu_must_have(syscall) as
> well as this check.

Upon re-reading I'm now confused - are you suggesting to also drop
the EFER.SCE check? That's not what you said in reply to 6/7. If
so, what's your thinking behind saying so? If I'm to guess, this
may go along the lines of you suggesting to drop the explicit CPUID
checks from SYSENTER/SYSEXIT as well, but I'm not seeing there
either why you would think this way (albeit there it's also a
little vague what exact changes you're thinking of at the MSR
handling side).

Jan

Re: [Xen-devel] [PATCH 7/7] x86emul: support SYSRET

Posted by Andrew Cooper 2 weeks ago
On 25/03/2020 11:55, Jan Beulich wrote:
> On 25.03.2020 11:00, Andrew Cooper wrote:
>> On 24/03/2020 16:29, Jan Beulich wrote:
>>> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
>>> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
>>> @@ -5975,6 +5975,60 @@ x86_emulate(
>>>              goto done;
>>>          break;
>>>  
>>> +    case X86EMUL_OPC(0x0f, 0x07): /* sysret */
>>> +        vcpu_must_have(syscall);
>>> +        /* Inject #UD if syscall/sysret are disabled. */
>>> +        fail_if(!ops->read_msr);
>>> +        if ( (rc = ops->read_msr(MSR_EFER, &msr_val, ctxt)) != X86EMUL_OKAY )
>>> +            goto done;
>>> +        generate_exception_if((msr_val & EFER_SCE) == 0, EXC_UD);
>> (as with the SYSCALL side), no need for the vcpu_must_have(syscall) as
>> well as this check.
> Upon re-reading I'm now confused - are you suggesting to also drop
> the EFER.SCE check?

No.  The SCE check is critical and needs to remain.

The exact delta I had put together was:

diff --git a/xen/arch/x86/x86_emulate/x86_emulate.c
b/xen/arch/x86/x86_emulate/x86_emulate.c
index c730511ebe..57ce7e00be 100644
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -5883,9 +5883,11 @@ x86_emulate(
 
 #ifdef __XEN__
     case X86EMUL_OPC(0x0f, 0x05): /* syscall */
-        generate_exception_if(!in_protmode(ctxt, ops), EXC_UD);
+        if ( !in_protmode(ctxt, ops) ||
+             ((ctxt->cpuid->x86_vendor & X86_VENDOR_INTEL) &&
!mode_64bit()) )
+            generate_exception(EXC_UD);
 
-        /* Inject #UD if syscall/sysret are disabled. */
+        /* Inject #UD if SCE is disabled.  Subsumes the SYSCALL CPUID
check. */
         fail_if(ops->read_msr == NULL);
         if ( (rc = ops->read_msr(MSR_EFER, &msr_val, ctxt)) !=
X86EMUL_OKAY )
             goto done;


(Looking at the commit date, Mon Dec 19 13:32:11 2016 is quite a long
time ago...)

~Andrew


Re: [Xen-devel] [PATCH 7/7] x86emul: support SYSRET

Posted by Jan Beulich 2 weeks ago
On 25.03.2020 11:00, Andrew Cooper wrote:
> On 24/03/2020 16:29, Jan Beulich wrote:
>> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
>> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
>> @@ -5975,6 +5975,60 @@ x86_emulate(
>>              goto done;
>>          break;
>>  
>> +    case X86EMUL_OPC(0x0f, 0x07): /* sysret */
>> +        vcpu_must_have(syscall);
>> +        /* Inject #UD if syscall/sysret are disabled. */
>> +        fail_if(!ops->read_msr);
>> +        if ( (rc = ops->read_msr(MSR_EFER, &msr_val, ctxt)) != X86EMUL_OKAY )
>> +            goto done;
>> +        generate_exception_if((msr_val & EFER_SCE) == 0, EXC_UD);
> 
> (as with the SYSCALL side), no need for the vcpu_must_have(syscall) as
> well as this check.

Hmm, yes, we do so elsewhere too, so I'll adjust this there and here.

>> +        generate_exception_if(!amd_like(ctxt) && !mode_64bit(), EXC_UD);
>> +        generate_exception_if(!mode_ring0(), EXC_GP, 0);
>> +        generate_exception_if(!in_protmode(ctxt, ops), EXC_GP, 0);
>> +
> 
> The Intel SYSRET vulnerability checks regs->rcx for canonicity here, and
> raises #GP here.
> 
> I see you've got it below, but this is where the Intel pseudocode puts
> it, before MSR_STAR gets read, and logically it should be grouped with
> the other exceptions.

I had it here first, then moved it down to avoid yet another mode_64bit()
instance. I didn't see why the ordering would matter for the overall
result, on the basis that the STAR read ought not to fail under normal
circumstances. I'll move it back where it was since you ask for it.

>> +        if ( (rc = ops->read_msr(MSR_STAR, &msr_val, ctxt)) != X86EMUL_OKAY )
>> +            goto done;
>> +        sreg.sel = ((msr_val >> 48) + 8) | 3; /* SELECTOR_RPL_MASK */
> 
> This would be the logical behaviour...
> 
> AMD CPUs |3 into %cs.sel, but don't make an equivalent adjustment for
> %ss.sel, and simply take MSR_STAR.SYSRET_CS + 8.
> 
> If you aren't careful with MSR_STAR, SYSRET will return to userspace
> with mismatching RPL/DPL and userspace can really find itself with an
> %ss with an RPL of 0.  (Of course, when you take an interrupt and
> attempt to IRET back to this context, things fall apart).
> 
> I discovered this entirely by accident in XTF, but it is confirmed by
> careful reading of the AMD SYSRET pseudocode.

I did notice this in their pseudocode, but it looked too wrong to
be true. Will change.

>> +        cs.sel = op_bytes == 8 ? sreg.sel + 8 : sreg.sel - 8;
>> +
>> +        cs.base = sreg.base = 0; /* flat segment */
>> +        cs.limit = sreg.limit = ~0u; /* 4GB limit */
>> +        cs.attr = 0xcfb; /* G+DB+P+DPL3+S+Code */
>> +        sreg.attr = 0xcf3; /* G+DB+P+DPL3+S+Data */
> 
> Again, that would be the logical behaviour...
> 
> AMD CPUs don't update anything but %ss.sel, and even comment the fact
> in pseudocode now.
> 
> This was discovered by Andy Lutomirski, where he found that taking an
> interrupt (unconditionally sets %ss to NUL), and opportunistic sysret
> back to 32bit userspace lets userspace see a sane %ss value, but with
> the attrs still empty, and the stack unusable.
> 
>> +
>> +#ifdef __x86_64__
>> +        if ( mode_64bit() )
>> +        {
>> +            if ( op_bytes == 8 )
>> +            {
>> +                cs.attr = 0xafb; /* L+DB+P+DPL3+S+Code */
>> +                generate_exception_if(!is_canonical_address(_regs.rcx) &&
>> +                                      !amd_like(ctxt), EXC_GP, 0);
> 
> Wherever this ends up living, I think it needs calling out with a
> comment /* CVE-xxx, Intel privilege escalation hole */, as it is a very
> subtle piece of vendor specific behaviour.
> 
> Do we have a Centaur/other CPU to try with?  I'd err on the side of
> going with == Intel rather than !AMD to avoid introducing known
> vulnerabilities into models which stand half a chance of not being affected.

I'd rather not - this exception behavior is spelled out by the
SDM, and hence imo pretty likely to be followed by clones.
While I do have a VIA box somewhere, it's not stable enough to
run for more than a couple of minutes.

>> +                _regs.rip = _regs.rcx;
>> +            }
>> +            else
>> +                _regs.rip = _regs.ecx;
>> +
>> +            _regs.eflags = _regs.r11 & ~(X86_EFLAGS_RF | X86_EFLAGS_VM);
>> +        }
>> +        else
>> +#endif
>> +        {
>> +            _regs.r(ip) = _regs.ecx;
>> +            _regs.eflags |= X86_EFLAGS_IF;
>> +        }
>> +
>> +        fail_if(!ops->write_segment);
>> +        if ( (rc = ops->write_segment(x86_seg_cs, &cs, ctxt)) != X86EMUL_OKAY ||
>> +             (!amd_like(ctxt) &&
>> +              (rc = ops->write_segment(x86_seg_ss, &sreg,
>> +                                       ctxt)) != X86EMUL_OKAY) )
> 
> Oh - here is the AMD behaviour with %ss, but it's not quite correct.
> 
> AFAICT, the correct behaviour is to read the old %ss on AMD-like, set
> flat attributes on Intel, and write back normally, because %ss.sel does
> get updated.

Oh, of course - I meant to, got distracted, and then forgot. Will fix.

Jan

Re: [Xen-devel] [PATCH 7/7] x86emul: support SYSRET

Posted by Andrew Cooper 2 weeks ago
On 25/03/2020 10:19, Jan Beulich wrote:
> On 25.03.2020 11:00, Andrew Cooper wrote:
>> On 24/03/2020 16:29, Jan Beulich wrote:
>>> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
>>> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
>>> @@ -5975,6 +5975,60 @@ x86_emulate(
>>>              goto done;
>>>          break;
>>>  
>>> +    case X86EMUL_OPC(0x0f, 0x07): /* sysret */
>>> +        vcpu_must_have(syscall);
>>> +        /* Inject #UD if syscall/sysret are disabled. */
>>> +        fail_if(!ops->read_msr);
>>> +        if ( (rc = ops->read_msr(MSR_EFER, &msr_val, ctxt)) != X86EMUL_OKAY )
>>> +            goto done;
>>> +        generate_exception_if((msr_val & EFER_SCE) == 0, EXC_UD);
>> (as with the SYSCALL side), no need for the vcpu_must_have(syscall) as
>> well as this check.
> Hmm, yes, we do so elsewhere too, so I'll adjust this there and here.

In theory, the SEP checks for SYSENTER/SYSEXIT could be similarly
dropped, once the MSR logic is updated to perform proper availability
checks.

>>> +        if ( (rc = ops->read_msr(MSR_STAR, &msr_val, ctxt)) != X86EMUL_OKAY )
>>> +            goto done;
>>> +        sreg.sel = ((msr_val >> 48) + 8) | 3; /* SELECTOR_RPL_MASK */
>> This would be the logical behaviour...
>>
>> AMD CPUs |3 into %cs.sel, but don't make an equivalent adjustment for
>> %ss.sel, and simply take MSR_STAR.SYSRET_CS + 8.
>>
>> If you aren't careful with MSR_STAR, SYSRET will return to userspace
>> with mismatching RPL/DPL and userspace can really find itself with an
>> %ss with an RPL of 0.  (Of course, when you take an interrupt and
>> attempt to IRET back to this context, things fall apart).
>>
>> I discovered this entirely by accident in XTF, but it is confirmed by
>> careful reading of the AMD SYSRET pseudocode.
> I did notice this in their pseudocode, but it looked too wrong to
> be true. Will change.

The main reason why my XSA-204 follow-on series is still pending is because I
never got around to completing an XTF test for all of these corner cases.

I'm happy to drop my series to Xen in light of this series of yours, but
I'd still like to complete the XTF side of things at some point.

>>> +
>>> +#ifdef __x86_64__
>>> +        if ( mode_64bit() )
>>> +        {
>>> +            if ( op_bytes == 8 )
>>> +            {
>>> +                cs.attr = 0xafb; /* L+DB+P+DPL3+S+Code */
>>> +                generate_exception_if(!is_canonical_address(_regs.rcx) &&
>>> +                                      !amd_like(ctxt), EXC_GP, 0);
>> Wherever this ends up living, I think it needs calling out with a
>> comment /* CVE-xxx, Intel privilege escalation hole */, as it is a very
>> subtle piece of vendor specific behaviour.
>>
>> Do we have a Centaur/other CPU to try with?  I'd err on the side of
>> going with == Intel rather than !AMD to avoid introducing known
>> vulnerabilities into models which stand half a chance of not being affected.
> I'd rather not - this exception behavior is spelled out by the
> SDM, and hence imo pretty likely to be followed by clones.

In pseudocode which certainly used to state somewhere "for reference
only, and not to be taken as a precise specification of behaviour".
(And yes - that statement was still at the beginning of Vol2 when Intel
also claimed that "SYSRET was working according to the spec" in the
embargo period of XSA-7, because I called them out on it).

And anyway - it is a part of the AMD64 spec, not the Intel32 spec.  A
3rd party implementing it for 64bit support is more likely to go with
AMD's writings of how it behaves.

> While I do have a VIA box somewhere, it's not stable enough to
> run for more than a couple of minutes.

Fundamentally, it boils down to this.

Intel behaviour leaves a privilege escalation vulnerability available to
userspace.

Assuming AMD behaviour for unknown parts is the safer course of action,
because we don't need to issue an XSA/CVE to fix the emulator when it
turns out that we're wrong.

~Andrew