1: x86emul: support LKGS
2: x86emul: support CMPccXADD
3: x86emul+VMX: support {RD,WR}MSRLIST
4: x86: introduce x86_seg_sys
5: x86emul: support USER_MSR instructions
6: x86/cpu-policy: re-arrange no-VMX logic
7: VMX: support USER_MSR

Jan
Provide support for this insn, which is a prereq to FRED. CPUID-wise introduce both its and FRED's bit at this occasion, thus allowing to also express the dependency right away. While adding a testcase, also add a SWAPGS one. In order to not affect the behavior of pre-existing tests, install write_{segment,msr} hooks only transiently. Signed-off-by: Jan Beulich <jbeulich@suse.com> --- Instead of ->read_segment() we could of course also use ->read_msr() to fetch the original GS base. I don't think I can see a clear advantage of either approach; the way it's done it matches how we handle SWAPGS. For PV save_segments() would need adjustment, but the insn being restricted to ring 0 means PV guests can't use it anyway (unless we wanted to emulate it as another privileged insn). --- v5: Re-base. v3: Add dependency on LM. Re-base. v2: Use X86_EXC_*. Add comments. --- a/tools/tests/x86_emulator/predicates.c +++ b/tools/tests/x86_emulator/predicates.c @@ -XXX,XX +XXX,XX @@ static const struct { { { 0x00, 0x18 }, { 2, 2 }, T, R }, /* ltr */ { { 0x00, 0x20 }, { 2, 2 }, T, R }, /* verr */ { { 0x00, 0x28 }, { 2, 2 }, T, R }, /* verw */ + { { 0x00, 0x30 }, { 0, 2 }, T, R, pfx_f2 }, /* lkgs */ { { 0x01, 0x00 }, { 2, 2 }, F, W }, /* sgdt */ { { 0x01, 0x08 }, { 2, 2 }, F, W }, /* sidt */ { { 0x01, 0x10 }, { 2, 2 }, F, R }, /* lgdt */ --- a/tools/tests/x86_emulator/test_x86_emulator.c +++ b/tools/tests/x86_emulator/test_x86_emulator.c @@ -XXX,XX +XXX,XX @@ static int blk( return x86_emul_blk((void *)offset, p_data, bytes, eflags, state, ctxt); } +#ifdef __x86_64__ +static unsigned long gs_base, gs_base_shadow; +#endif + static int read_segment( enum x86_segment seg, struct segment_register *reg, @@ -XXX,XX +XXX,XX @@ static int read_segment( return X86EMUL_UNHANDLEABLE; memset(reg, 0, sizeof(*reg)); reg->p = 1; + +#ifdef __x86_64__ + if ( seg == x86_seg_gs ) + reg->base = gs_base; +#endif + + return X86EMUL_OKAY; +} + +#ifdef __x86_64__ +static int write_segment( + enum x86_segment seg, + const struct segment_register *reg, + struct x86_emulate_ctxt *ctxt) +{ + if ( !is_x86_user_segment(seg) ) + return X86EMUL_UNHANDLEABLE; + + if ( seg == x86_seg_gs ) + gs_base = reg->base; + return X86EMUL_OKAY; } +#endif static int read_msr( unsigned int reg, @@ -XXX,XX +XXX,XX @@ static int read_msr( *val = ctxt->addr_size > 32 ? 
0x500 /* LME|LMA */ : 0; return X86EMUL_OKAY; +#ifdef __x86_64__ + case 0xc0000101: /* GS_BASE */ + if ( ctxt->addr_size < 64 ) + break; + *val = gs_base; + return X86EMUL_OKAY; + + case 0xc0000102: /* SHADOW_GS_BASE */ + if ( ctxt->addr_size < 64 ) + break; + *val = gs_base_shadow; + return X86EMUL_OKAY; +#endif + case 0xc0000103: /* TSC_AUX */ #define TSC_AUX_VALUE 0xCACACACA *val = TSC_AUX_VALUE; @@ -XXX,XX +XXX,XX @@ static int read_msr( return X86EMUL_UNHANDLEABLE; } +#ifdef __x86_64__ +static int write_msr( + unsigned int reg, + uint64_t val, + struct x86_emulate_ctxt *ctxt) +{ + switch ( reg ) + { + case 0xc0000101: /* GS_BASE */ + if ( ctxt->addr_size < 64 || !is_canonical_address(val) ) + break; + gs_base = val; + return X86EMUL_OKAY; + + case 0xc0000102: /* SHADOW_GS_BASE */ + if ( ctxt->addr_size < 64 || !is_canonical_address(val) ) + break; + gs_base_shadow = val; + return X86EMUL_OKAY; + } + + return X86EMUL_UNHANDLEABLE; +} +#endif + #define INVPCID_ADDR 0x12345678 #define INVPCID_PCID 0x123 @@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv) printf("%u bytes read - ", bytes_read); goto fail; } + printf("okay\n"); + + emulops.write_segment = write_segment; + emulops.write_msr = write_msr; + + printf("%-40s", "Testing swapgs..."); + instr[0] = 0x0f; instr[1] = 0x01; instr[2] = 0xf8; + regs.eip = (unsigned long)&instr[0]; + gs_base = 0xffffeeeecccc8888UL; + gs_base_shadow = 0x0000111122224444UL; + rc = x86_emulate(&ctxt, &emulops); + if ( (rc != X86EMUL_OKAY) || + (regs.eip != (unsigned long)&instr[3]) || + (gs_base != 0x0000111122224444UL) || + (gs_base_shadow != 0xffffeeeecccc8888UL) ) + goto fail; + printf("okay\n"); + + printf("%-40s", "Testing lkgs 2(%rdx)..."); + instr[0] = 0xf2; instr[1] = 0x0f; instr[2] = 0x00; instr[3] = 0x72; instr[4] = 0x02; + regs.eip = (unsigned long)&instr[0]; + regs.edx = (unsigned long)res; + res[0] = 0x00004444; + res[1] = 0x8888cccc; + i = cp.extd.nscb; cp.extd.nscb = true; /* for AMD */ + rc = x86_emulate(&ctxt, &emulops); + if ( (rc != X86EMUL_OKAY) || + (regs.eip != (unsigned long)&instr[5]) || + (gs_base != 0x0000111122224444UL) || + gs_base_shadow ) + goto fail; + + cp.extd.nscb = i; + emulops.write_segment = NULL; + emulops.write_msr = NULL; #endif printf("okay\n"); --- a/tools/tests/x86_emulator/x86-emulate.c +++ b/tools/tests/x86_emulator/x86-emulate.c @@ -XXX,XX +XXX,XX @@ bool emul_test_init(void) cp.feat.invpcid = true; cp.feat.adx = true; cp.feat.rdpid = true; + cp.feat.lkgs = true; cp.feat.wrmsrns = true; cp.extd.clzero = true; --- a/xen/arch/x86/x86_emulate/decode.c +++ b/xen/arch/x86/x86_emulate/decode.c @@ -XXX,XX +XXX,XX @@ decode_twobyte(struct x86_emulate_state case 0: s->desc |= DstMem | SrcImplicit | Mov; break; + case 6: + if ( !(s->modrm_reg & 1) && mode_64bit() ) + { case 2: case 4: - s->desc |= SrcMem16; + s->desc |= SrcMem16; + } break; } break; --- a/xen/arch/x86/x86_emulate/private.h +++ b/xen/arch/x86/x86_emulate/private.h @@ -XXX,XX +XXX,XX @@ amd_like(const struct x86_emulate_ctxt * #define vcpu_has_sm4() (ctxt->cpuid->feat.sm4) #define vcpu_has_avx_vnni() (ctxt->cpuid->feat.avx_vnni) #define vcpu_has_avx512_bf16() (ctxt->cpuid->feat.avx512_bf16) +#define vcpu_has_lkgs() (ctxt->cpuid->feat.lkgs) #define vcpu_has_wrmsrns() (ctxt->cpuid->feat.wrmsrns) #define vcpu_has_avx_ifma() (ctxt->cpuid->feat.avx_ifma) #define vcpu_has_avx_vnni_int8() (ctxt->cpuid->feat.avx_vnni_int8) --- a/xen/arch/x86/x86_emulate/x86_emulate.c +++ b/xen/arch/x86/x86_emulate/x86_emulate.c @@ -XXX,XX +XXX,XX @@ x86_emulate( break; } break; - 
default: - generate_exception_if(true, X86_EXC_UD); + case 6: /* lkgs */ + generate_exception_if((modrm_reg & 1) || vex.pfx != vex_f2, + X86_EXC_UD); + generate_exception_if(!mode_64bit() || !mode_ring0(), X86_EXC_UD); + vcpu_must_have(lkgs); + fail_if(!ops->read_segment || !ops->read_msr || + !ops->write_segment || !ops->write_msr); + if ( (rc = ops->read_msr(MSR_SHADOW_GS_BASE, &msr_val, + ctxt)) != X86EMUL_OKAY || + (rc = ops->read_segment(x86_seg_gs, &sreg, + ctxt)) != X86EMUL_OKAY ) + goto done; + dst.orig_val = sreg.base; /* Preserve full GS Base. */ + if ( (rc = protmode_load_seg(x86_seg_gs, src.val, false, &sreg, + ctxt, ops)) != X86EMUL_OKAY || + /* Write (32-bit) base into SHADOW_GS. */ + (rc = ops->write_msr(MSR_SHADOW_GS_BASE, sreg.base, + ctxt)) != X86EMUL_OKAY ) + goto done; + sreg.base = dst.orig_val; /* Reinstate full GS Base. */ + if ( (rc = ops->write_segment(x86_seg_gs, &sreg, + ctxt)) != X86EMUL_OKAY ) + { + /* Best effort unwind (i.e. no real error checking). */ + if ( ops->write_msr(MSR_SHADOW_GS_BASE, msr_val, + ctxt) == X86EMUL_EXCEPTION ) + x86_emul_reset_event(ctxt); + goto done; + } break; } break; --- a/xen/include/public/arch-x86/cpufeatureset.h +++ b/xen/include/public/arch-x86/cpufeatureset.h @@ -XXX,XX +XXX,XX @@ XEN_CPUFEATURE(AVX512_BF16, 10*32+ 5) / XEN_CPUFEATURE(FZRM, 10*32+10) /*A Fast Zero-length REP MOVSB */ XEN_CPUFEATURE(FSRS, 10*32+11) /*A Fast Short REP STOSB */ XEN_CPUFEATURE(FSRCS, 10*32+12) /*A Fast Short REP CMPSB/SCASB */ +XEN_CPUFEATURE(FRED, 10*32+17) /* Flexible Return and Event Delivery */ +XEN_CPUFEATURE(LKGS, 10*32+18) /*S Load Kernel GS Base */ XEN_CPUFEATURE(WRMSRNS, 10*32+19) /*S WRMSR Non-Serialising */ XEN_CPUFEATURE(AVX_IFMA, 10*32+23) /*A AVX-IFMA Instructions */ --- a/xen/tools/gen-cpuid.py +++ b/xen/tools/gen-cpuid.py @@ -XXX,XX +XXX,XX @@ def crunch_numbers(state): # superpages, PCID and PKU are only available in 4 level paging. # NO_LMSL indicates the absense of Long Mode Segment Limits, which # have been dropped in hardware. - LM: [CX16, PCID, LAHF_LM, PAGE1GB, PKU, NO_LMSL], + LM: [CX16, PCID, LAHF_LM, PAGE1GB, PKU, NO_LMSL, LKGS], # AMD K6-2+ and K6-III processors shipped with 3DNow+, beyond the # standard 3DNow in the earlier K6 processors. @@ -XXX,XX +XXX,XX @@ def crunch_numbers(state): # The behaviour described by RRSBA depend on eIBRS being active. EIBRS: [RRSBA], + + # FRED builds on the LKGS instruction. + LKGS: [FRED], } deep_features = tuple(sorted(deps.keys()))
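
[For clarity: architecturally LKGS behaves like a MOV into GS, except that the
newly loaded base is routed into MSR_SHADOW_GS_BASE (IA32_KERNEL_GS_BASE) while
the active GS base stays untouched. A condensed sketch of the flow the
x86_emulate.c hunk above implements - hook and helper names as in the emulator,
error unwinding omitted:

    static int lkgs(uint16_t sel, struct x86_emulate_ctxt *ctxt,
                    const struct x86_emulate_ops *ops)
    {
        struct segment_register sreg;
        uint64_t old_base;
        int rc;

        /* Remember the currently active (full 64-bit) GS base. */
        if ( (rc = ops->read_segment(x86_seg_gs, &sreg, ctxt)) != X86EMUL_OKAY )
            return rc;
        old_base = sreg.base;

        /* Load the new selector with the usual protected mode checks. */
        rc = protmode_load_seg(x86_seg_gs, sel, false, &sreg, ctxt, ops);
        if ( rc != X86EMUL_OKAY )
            return rc;

        /* The descriptor's (32-bit) base goes into the shadow MSR, ... */
        rc = ops->write_msr(MSR_SHADOW_GS_BASE, sreg.base, ctxt);
        if ( rc != X86EMUL_OKAY )
            return rc;

        /* ... while the visible GS base is left untouched. */
        sreg.base = old_base;
        return ops->write_segment(x86_seg_gs, &sreg, ctxt);
    }
]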
Unconditionally wire this through the ->rmw() hook. Since x86_emul_rmw() now wants to construct and invoke a stub, make stub_exn available to it via a new field in the emulator state structure. Signed-off-by: Jan Beulich <jbeulich@suse.com> --- v5: Re-base. v3: Add dependency on LM. Re-base. v2: Use X86_EXC_*. Move past introduction of stub_exn in struct x86_emulate_state. Keep feature at just "a" for now. --- SDE: -grr or -srf --- a/tools/tests/x86_emulator/predicates.c +++ b/tools/tests/x86_emulator/predicates.c @@ -XXX,XX +XXX,XX @@ static const struct vex { { { 0xdd }, 2, T, R, pfx_66, WIG, Ln }, /* vaesenclast */ { { 0xde }, 2, T, R, pfx_66, WIG, Ln }, /* vaesdec */ { { 0xdf }, 2, T, R, pfx_66, WIG, Ln }, /* vaesdeclast */ + { { 0xe0 }, 2, F, W, pfx_66, Wn, L0 }, /* cmpoxadd */ + { { 0xe1 }, 2, F, W, pfx_66, Wn, L0 }, /* cmpnoxadd */ + { { 0xe2 }, 2, F, W, pfx_66, Wn, L0 }, /* cmpbxadd */ + { { 0xe3 }, 2, F, W, pfx_66, Wn, L0 }, /* cmpnbxadd */ + { { 0xe4 }, 2, F, W, pfx_66, Wn, L0 }, /* cmpexadd */ + { { 0xe5 }, 2, F, W, pfx_66, Wn, L0 }, /* cmpnexadd */ + { { 0xe6 }, 2, F, W, pfx_66, Wn, L0 }, /* cmpbexadd */ + { { 0xe7 }, 2, F, W, pfx_66, Wn, L0 }, /* cmpaxadd */ + { { 0xe8 }, 2, F, W, pfx_66, Wn, L0 }, /* cmpsxadd */ + { { 0xe9 }, 2, F, W, pfx_66, Wn, L0 }, /* cmpnsxadd */ + { { 0xea }, 2, F, W, pfx_66, Wn, L0 }, /* cmppxadd */ + { { 0xeb }, 2, F, W, pfx_66, Wn, L0 }, /* cmpnpxadd */ + { { 0xec }, 2, F, W, pfx_66, Wn, L0 }, /* cmplxadd */ + { { 0xed }, 2, F, W, pfx_66, Wn, L0 }, /* cmpgexadd */ + { { 0xee }, 2, F, W, pfx_66, Wn, L0 }, /* cmplexadd */ + { { 0xef }, 2, F, W, pfx_66, Wn, L0 }, /* cmpgxadd */ { { 0xf2 }, 2, T, R, pfx_no, Wn, L0 }, /* andn */ { { 0xf3, 0x08 }, 2, T, R, pfx_no, Wn, L0 }, /* blsr */ { { 0xf3, 0x10 }, 2, T, R, pfx_no, Wn, L0 }, /* blsmsk */ --- a/tools/tests/x86_emulator/test_x86_emulator.c +++ b/tools/tests/x86_emulator/test_x86_emulator.c @@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv) } printf("okay\n"); + printf("%-40s", "Testing cmpbxadd %rbx,%r9,(%rdx)..."); + if ( stack_exec && cpu_has_cmpccxadd ) + { + instr[0] = 0xc4; instr[1] = 0x62; instr[2] = 0xe1; instr[3] = 0xe2; instr[4] = 0x0a; + regs.rip = (unsigned long)&instr[0]; + regs.eflags = EFLAGS_ALWAYS_SET; + res[0] = 0x11223344; + res[1] = 0x01020304; + regs.rdx = (unsigned long)res; + regs.r9 = 0x0001020300112233UL; + regs.rbx = 0x0101010101010101UL; + rc = x86_emulate(&ctxt, &emulops); + if ( (rc != X86EMUL_OKAY) || + (regs.eip != (unsigned long)&instr[5]) || + (regs.r9 != 0x0102030411223344UL) || + (regs.rbx != 0x0101010101010101UL) || + ((regs.eflags & EFLAGS_MASK) != + (X86_EFLAGS_PF | EFLAGS_ALWAYS_SET)) || + (res[0] != 0x11223344) || + (res[1] != 0x01020304) ) + goto fail; + + regs.rip = (unsigned long)&instr[0]; + regs.r9 <<= 8; + rc = x86_emulate(&ctxt, &emulops); + if ( (rc != X86EMUL_OKAY) || + (regs.eip != (unsigned long)&instr[5]) || + (regs.r9 != 0x0102030411223344UL) || + (regs.rbx != 0x0101010101010101UL) || + ((regs.eflags & EFLAGS_MASK) != + (X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_SF | + EFLAGS_ALWAYS_SET)) || + (res[0] != 0x12233445) || + (res[1] != 0x02030405) ) + goto fail; + printf("okay\n"); + + printf("%-40s", "Testing cmpsxadd %r9d,%ebx,4(%r10)..."); + instr[1] = 0xc2; instr[2] = 0x31; instr[3] = 0xe8; instr[4] = 0x5a; instr[5] = 0x04; + regs.rip = (unsigned long)&instr[0]; + res[2] = res[0] = ~0; + regs.r10 = (unsigned long)res; + rc = x86_emulate(&ctxt, &emulops); + if ( (rc != X86EMUL_OKAY) || + (regs.eip != (unsigned long)&instr[6]) || + (regs.r9 != 
0x0102030411223344UL) || + (regs.rbx != 0x02030405) || + ((regs.eflags & EFLAGS_MASK) != EFLAGS_ALWAYS_SET) || + (res[0] + 1) || + (res[1] != 0x02030405) || + (res[2] + 1) ) + goto fail; + + regs.rip = (unsigned long)&instr[0]; + regs.rbx <<= 8; + rc = x86_emulate(&ctxt, &emulops); + if ( (rc != X86EMUL_OKAY) || + (regs.eip != (unsigned long)&instr[6]) || + (regs.r9 != 0x0102030411223344UL) || + (regs.rbx != 0x02030405) || + ((regs.eflags & EFLAGS_MASK) != + (X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_SF | + EFLAGS_ALWAYS_SET)) || + (res[0] + 1) || + (res[1] != 0x13253749) || + (res[2] + 1) ) + goto fail; + printf("okay\n"); + } + else + printf("skipped\n"); + emulops.write_segment = write_segment; emulops.write_msr = write_msr; --- a/tools/tests/x86_emulator/x86-emulate.h +++ b/tools/tests/x86_emulator/x86-emulate.h @@ -XXX,XX +XXX,XX @@ void wrpkru(unsigned int val); #define cpu_has_sm4 (cp.feat.sm4 && xcr0_mask(6)) #define cpu_has_avx_vnni (cp.feat.avx_vnni && xcr0_mask(6)) #define cpu_has_avx512_bf16 (cp.feat.avx512_bf16 && xcr0_mask(0xe6)) +#define cpu_has_cmpccxadd cp.feat.cmpccxadd #define cpu_has_avx_ifma (cp.feat.avx_ifma && xcr0_mask(6)) #define cpu_has_avx_vnni_int8 (cp.feat.avx_vnni_int8 && xcr0_mask(6)) #define cpu_has_avx_ne_convert (cp.feat.avx_ne_convert && xcr0_mask(6)) --- a/xen/arch/x86/include/asm/cpufeature.h +++ b/xen/arch/x86/include/asm/cpufeature.h @@ -XXX,XX +XXX,XX @@ static inline bool boot_cpu_has(unsigned #define cpu_has_sm4 boot_cpu_has(X86_FEATURE_SM4) #define cpu_has_avx_vnni boot_cpu_has(X86_FEATURE_AVX_VNNI) #define cpu_has_avx512_bf16 boot_cpu_has(X86_FEATURE_AVX512_BF16) +#define cpu_has_cmpccxadd boot_cpu_has(X86_FEATURE_CMPCCXADD) #define cpu_has_avx_ifma boot_cpu_has(X86_FEATURE_AVX_IFMA) /* CPUID level 0x00000007:1.edx */ --- a/xen/arch/x86/x86_emulate/decode.c +++ b/xen/arch/x86/x86_emulate/decode.c @@ -XXX,XX +XXX,XX @@ static const struct ext0f38_table { [0xda] = { .simd_size = simd_other }, [0xdb] = { .simd_size = simd_packed_int, .two_op = 1 }, [0xdc ... 0xdf] = { .simd_size = simd_packed_int, .d8s = d8s_vl }, + [0xe0 ... 0xef] = { .to_mem = 1 }, [0xf0] = { .two_op = 1 }, [0xf1] = { .to_mem = 1, .two_op = 1 }, [0xf2 ... 0xf3] = {}, @@ -XXX,XX +XXX,XX @@ decode_0f38(struct x86_emulate_state *s, ctxt->opcode |= MASK_INSR(s->vex.pfx, X86EMUL_OPC_PFX_MASK); break; + case X86EMUL_OPC_VEX_66(0, 0xe0) ... + X86EMUL_OPC_VEX_66(0, 0xef): /* cmp<cc>xadd */ case X86EMUL_OPC_VEX(0, 0xf2): /* andn */ case X86EMUL_OPC_VEX(0, 0xf3): /* Grp 17 */ case X86EMUL_OPC_VEX(0, 0xf5): /* bzhi */ --- a/xen/arch/x86/x86_emulate/private.h +++ b/xen/arch/x86/x86_emulate/private.h @@ -XXX,XX +XXX,XX @@ struct x86_emulate_state { rmw_btc, rmw_btr, rmw_bts, + rmw_cmpccxadd, rmw_dec, rmw_inc, rmw_neg, @@ -XXX,XX +XXX,XX @@ amd_like(const struct x86_emulate_ctxt * #define vcpu_has_sm4() (ctxt->cpuid->feat.sm4) #define vcpu_has_avx_vnni() (ctxt->cpuid->feat.avx_vnni) #define vcpu_has_avx512_bf16() (ctxt->cpuid->feat.avx512_bf16) +#define vcpu_has_cmpccxadd() (ctxt->cpuid->feat.cmpccxadd) #define vcpu_has_lkgs() (ctxt->cpuid->feat.lkgs) #define vcpu_has_wrmsrns() (ctxt->cpuid->feat.wrmsrns) #define vcpu_has_avx_ifma() (ctxt->cpuid->feat.avx_ifma) --- a/xen/arch/x86/x86_emulate/x86_emulate.c +++ b/xen/arch/x86/x86_emulate/x86_emulate.c @@ -XXX,XX +XXX,XX @@ x86_emulate( #endif /* !X86EMUL_NO_SIMD */ + case X86EMUL_OPC_VEX_66(0x0f38, 0xe0) ... 
+ X86EMUL_OPC_VEX_66(0x0f38, 0xef): /* cmp<cc>xadd r,r,m */ + generate_exception_if(!mode_64bit() || dst.type != OP_MEM || vex.l, + X86_EXC_UD); + host_and_vcpu_must_have(cmpccxadd); + fail_if(!ops->rmw); + state->rmw = rmw_cmpccxadd; + break; + case X86EMUL_OPC(0x0f38, 0xf0): /* movbe m,r */ case X86EMUL_OPC(0x0f38, 0xf1): /* movbe r,m */ vcpu_must_have(movbe); @@ -XXX,XX +XXX,XX @@ x86_emulate( { ea.val = src.val; op_bytes = dst.bytes; + state->stub_exn = &stub_exn; rc = ops->rmw(dst.mem.seg, dst.mem.off, dst.bytes, &_regs.eflags, state, ctxt); +#ifdef __XEN__ + if ( rc == X86EMUL_stub_failure ) + goto emulation_stub_failure; +#endif if ( rc != X86EMUL_OKAY ) goto done; /* Some operations require a register to be written. */ switch ( state->rmw ) { + case rmw_cmpccxadd: case rmw_xchg: case rmw_xadd: switch ( dst.bytes ) @@ -XXX,XX +XXX,XX @@ int x86_emul_rmw( uint32_t *eflags, struct x86_emulate_state *s, struct x86_emulate_ctxt *ctxt) +#define stub_exn (*s->stub_exn) /* for invoke_stub() */ { unsigned long *dst = ptr; @@ -XXX,XX +XXX,XX @@ int x86_emul_rmw( #undef BINOP #undef SHIFT +#ifdef __x86_64__ + case rmw_cmpccxadd: + { + struct x86_emulate_stub stub = {}; + uint8_t *buf = get_stub(stub); + typeof(s->vex) *pvex = container_of(buf + 1, typeof(s->vex), + raw[0]); + unsigned long dummy; + + buf[0] = 0xc4; + *pvex = s->vex; + pvex->b = 1; + pvex->r = 1; + pvex->reg = 0xf; /* rAX */ + buf[3] = ctxt->opcode; + buf[4] = 0x11; /* reg=rDX r/m=(%RCX) */ + buf[5] = 0xc3; + + *eflags &= ~EFLAGS_MASK; + invoke_stub("", + _POST_EFLAGS("[eflags]", "[mask]", "[tmp]"), + "+m" (*dst), "+d" (s->ea.val), + [tmp] "=&r" (dummy), [eflags] "+g" (*eflags) + : "a" (*decode_vex_gpr(s->vex.reg, ctxt->regs, ctxt)), + "c" (dst), [mask] "i" (EFLAGS_MASK)); + + put_stub(stub); + break; + } +#endif + case rmw_not: switch ( s->op_bytes ) { @@ -XXX,XX +XXX,XX @@ int x86_emul_rmw( #undef JCXZ return X86EMUL_OKAY; + +#if defined(__XEN__) && defined(__x86_64__) + emulation_stub_failure: + return X86EMUL_stub_failure; +#endif } +#undef stub_exn static void __init __maybe_unused build_assertions(void) { --- a/xen/include/public/arch-x86/cpufeatureset.h +++ b/xen/include/public/arch-x86/cpufeatureset.h @@ -XXX,XX +XXX,XX @@ XEN_CPUFEATURE(SM3, 10*32+ 1) / XEN_CPUFEATURE(SM4, 10*32+ 2) /*A SM4 Instructions */ XEN_CPUFEATURE(AVX_VNNI, 10*32+ 4) /*A AVX-VNNI Instructions */ XEN_CPUFEATURE(AVX512_BF16, 10*32+ 5) /*A AVX512 BFloat16 Instructions */ +XEN_CPUFEATURE(CMPCCXADD, 10*32+ 7) /*a CMPccXADD Instructions */ XEN_CPUFEATURE(FZRM, 10*32+10) /*A Fast Zero-length REP MOVSB */ XEN_CPUFEATURE(FSRS, 10*32+11) /*A Fast Short REP STOSB */ XEN_CPUFEATURE(FSRCS, 10*32+12) /*A Fast Short REP CMPSB/SCASB */ --- a/xen/tools/gen-cpuid.py +++ b/xen/tools/gen-cpuid.py @@ -XXX,XX +XXX,XX @@ def crunch_numbers(state): # superpages, PCID and PKU are only available in 4 level paging. # NO_LMSL indicates the absense of Long Mode Segment Limits, which # have been dropped in hardware. - LM: [CX16, PCID, LAHF_LM, PAGE1GB, PKU, NO_LMSL, LKGS], + LM: [CX16, PCID, LAHF_LM, PAGE1GB, PKU, NO_LMSL, LKGS, CMPCCXADD], # AMD K6-2+ and K6-III processors shipped with 3DNow+, beyond the # standard 3DNow in the earlier K6 processors.
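
[The stub built in x86_emul_rmw() executes the real insn, so the emulator never
open-codes the semantics. For reference, what CMP<cc>XADD does as a single
atomic read-modify-write is roughly the following sketch; flags_of_sub() and
cc_holds() are hypothetical helpers, not emulator API:

    uint64_t cmpccxadd(uint64_t *mem, uint64_t *reg /* ModRM.reg */,
                       uint64_t addend /* VEX.vvvv */, unsigned int cc,
                       uint32_t *eflags)
    {
        uint64_t tmp = *mem;                  /* locked load */

        *eflags = flags_of_sub(tmp, *reg);    /* as if CMP tmp, reg */
        if ( cc_holds(cc, *eflags) )
            *mem = tmp + addend;              /* conditionally accumulate */
        *reg = tmp;                           /* original value always returned */
        return tmp;
    }

This matches the tests above: %r9 always receives the old memory contents, and
%rbx is only added in once the "below" condition holds.]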
These are "compound" instructions to issue a series of RDMSR / WRMSR respectively. In the emulator we can therefore implement them by using the existing msr_{read,write}() hooks. The memory accesses utilize that the HVM ->read() / ->write() hooks are already linear-address (x86_seg_none) aware (by way of hvmemul_virtual_to_linear() handling this case). Preemption is being checked for in WRMSRLIST handling only, as only MSR writes are expected to possibly take long. Signed-off-by: Jan Beulich <jbeulich@suse.com> --- RFC: In vmx_vmexit_handler() handling is forwarded to the emulator blindly. Alternatively we could consult the exit qualification and process just a single MSR at a time (without involving the emulator), exiting back to the guest after every iteration. (I don't think a mix of both models makes a lot of sense.) The precise behavior of MSR_BARRIER is still not spelled out in ISE 050, so the (minimal) implementation continues to be a guess for now. Wouldn't calculate_hvm_max_policy() for MPX better behave the same way as done here, at least from an abstract perspective (assuming that AMD won't add such functionality now that Intel have deprecated it)? --- v5: Add missing vmx_init_vmcs_config() and construct_vmcs() adjustments. Avoid unnecessary uses of r(). Re-base. v3: Add dependency on LM. Limit exposure to HVM. Utilize new info from ISE 050. Re-base. v2: Use X86_EXC_*. Add preemption checking to WRMSRLIST handling. Remove the feature from "max" when the VMX counterpart isn't available. --- a/tools/tests/x86_emulator/predicates.c +++ b/tools/tests/x86_emulator/predicates.c @@ -XXX,XX +XXX,XX @@ static const struct { { { 0x01, 0xc4 }, { 2, 2 }, F, N }, /* vmxoff */ { { 0x01, 0xc5 }, { 2, 2 }, F, N }, /* pconfig */ { { 0x01, 0xc6 }, { 2, 2 }, F, N }, /* wrmsrns */ + { { 0x01, 0xc6 }, { 0, 2 }, F, W, pfx_f2 }, /* rdmsrlist */ + { { 0x01, 0xc6 }, { 0, 2 }, F, R, pfx_f3 }, /* wrmsrlist */ { { 0x01, 0xc8 }, { 2, 2 }, F, N }, /* monitor */ { { 0x01, 0xc9 }, { 2, 2 }, F, N }, /* mwait */ { { 0x01, 0xca }, { 2, 2 }, F, N }, /* clac */ --- a/tools/tests/x86_emulator/test_x86_emulator.c +++ b/tools/tests/x86_emulator/test_x86_emulator.c @@ -XXX,XX +XXX,XX @@ static int read( default: if ( !is_x86_user_segment(seg) ) return X86EMUL_UNHANDLEABLE; + case x86_seg_none: bytes_read += bytes; break; } @@ -XXX,XX +XXX,XX @@ static int write( if ( verbose ) printf("** %s(%u, %p,, %u,)\n", __func__, seg, (void *)offset, bytes); - if ( !is_x86_user_segment(seg) ) + if ( !is_x86_user_segment(seg) && seg != x86_seg_none ) return X86EMUL_UNHANDLEABLE; memcpy((void *)offset, p_data, bytes); return X86EMUL_OKAY; @@ -XXX,XX +XXX,XX @@ static int read_msr( { switch ( reg ) { + case 0x0000002f: /* BARRIER */ + *val = 0; + return X86EMUL_OKAY; + case 0xc0000080: /* EFER */ *val = ctxt->addr_size > 32 ? 
0x500 /* LME|LMA */ : 0; return X86EMUL_OKAY; @@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv) (gs_base != 0x0000111122224444UL) || gs_base_shadow ) goto fail; + printf("okay\n"); cp.extd.nscb = i; emulops.write_segment = NULL; + + printf("%-40s", "Testing rdmsrlist..."); + instr[0] = 0xf2; instr[1] = 0x0f; instr[2] = 0x01; instr[3] = 0xc6; + regs.rip = (unsigned long)&instr[0]; + regs.rsi = (unsigned long)(res + 0x80); + regs.rdi = (unsigned long)(res + 0x80 + 0x40 * 2); + regs.rcx = 0x0002000100008000UL; + gs_base_shadow = 0x0000222244446666UL; + memset(res + 0x80, ~0, 0x40 * 8 * 2); + res[0x80 + 0x0f * 2] = 0xc0000101; /* GS_BASE */ + res[0x80 + 0x0f * 2 + 1] = 0; + res[0x80 + 0x20 * 2] = 0xc0000102; /* SHADOW_GS_BASE */ + res[0x80 + 0x20 * 2 + 1] = 0; + res[0x80 + 0x31 * 2] = 0x2f; /* BARRIER */ + res[0x80 + 0x31 * 2 + 1] = 0; + rc = x86_emulate(&ctxt, &emulops); + if ( (rc != X86EMUL_OKAY) || + (regs.rip != (unsigned long)&instr[4]) || + regs.rcx || + (res[0x80 + (0x40 + 0x0f) * 2] != (unsigned int)gs_base) || + (res[0x80 + (0x40 + 0x0f) * 2 + 1] != (gs_base >> (8 * sizeof(int)))) || + (res[0x80 + (0x40 + 0x20) * 2] != (unsigned int)gs_base_shadow) || + (res[0x80 + (0x40 + 0x20) * 2 + 1] != (gs_base_shadow >> (8 * sizeof(int)))) || + res[0x80 + (0x40 + 0x31) * 2] || res[0x80 + (0x40 + 0x31) * 2 + 1] ) + goto fail; + printf("okay\n"); + + printf("%-40s", "Testing wrmsrlist..."); + instr[0] = 0xf3; instr[1] = 0x0f; instr[2] = 0x01; instr[3] = 0xc6; + regs.eip = (unsigned long)&instr[0]; + regs.rsi -= 0x11 * 8; + regs.rdi -= 0x11 * 8; + regs.rcx = 0x0002000100000000UL; + res[0x80 + 0x0f * 2] = 0xc0000102; /* SHADOW_GS_BASE */ + res[0x80 + 0x20 * 2] = 0xc0000101; /* GS_BASE */ + rc = x86_emulate(&ctxt, &emulops); + if ( (rc != X86EMUL_OKAY) || + (regs.rip != (unsigned long)&instr[4]) || + regs.rcx || + (gs_base != 0x0000222244446666UL) || + (gs_base_shadow != 0x0000111122224444UL) ) + goto fail; + emulops.write_msr = NULL; #endif printf("okay\n"); --- a/tools/tests/x86_emulator/x86-emulate.c +++ b/tools/tests/x86_emulator/x86-emulate.c @@ -XXX,XX +XXX,XX @@ bool emul_test_init(void) cp.feat.rdpid = true; cp.feat.lkgs = true; cp.feat.wrmsrns = true; + cp.feat.msrlist = true; cp.extd.clzero = true; if ( cpu_has_xsave ) --- a/xen/arch/x86/cpu-policy.c +++ b/xen/arch/x86/cpu-policy.c @@ -XXX,XX +XXX,XX @@ static void __init calculate_hvm_max_pol __clear_bit(X86_FEATURE_XSAVES, fs); } + if ( !cpu_has_vmx_msrlist ) + __clear_bit(X86_FEATURE_MSRLIST, fs); + /* * Xen doesn't use PKS, so the guest support for it has opted to not use * the VMCS load/save controls for efficiency reasons. 
This depends on --- a/xen/arch/x86/hvm/vmx/vmcs.c +++ b/xen/arch/x86/hvm/vmx/vmcs.c @@ -XXX,XX +XXX,XX @@ static int vmx_init_vmcs_config(bool bsp if ( _vmx_cpu_based_exec_control & CPU_BASED_ACTIVATE_TERTIARY_CONTROLS ) { - uint64_t opt = TERTIARY_EXEC_VIRT_SPEC_CTRL; + uint64_t opt = TERTIARY_EXEC_ENABLE_MSRLIST | + TERTIARY_EXEC_VIRT_SPEC_CTRL; _vmx_tertiary_exec_control = adjust_vmx_controls2( "Tertiary Exec Control", 0, opt, @@ -XXX,XX +XXX,XX @@ static int construct_vmcs(struct vcpu *v v->arch.hvm.vmx.exec_control |= CPU_BASED_RDTSC_EXITING; v->arch.hvm.vmx.secondary_exec_control = vmx_secondary_exec_control; - v->arch.hvm.vmx.tertiary_exec_control = vmx_tertiary_exec_control; + v->arch.hvm.vmx.tertiary_exec_control = vmx_tertiary_exec_control & + ~TERTIARY_EXEC_ENABLE_MSRLIST; /* * Disable features which we don't want active by default: --- a/xen/arch/x86/hvm/vmx/vmx.c +++ b/xen/arch/x86/hvm/vmx/vmx.c @@ -XXX,XX +XXX,XX @@ static void cf_check vmx_cpuid_policy_ch else vmx_set_msr_intercept(v, MSR_PKRS, VMX_MSR_RW); + if ( cp->feat.msrlist ) + { + vmx_clear_msr_intercept(v, MSR_BARRIER, VMX_MSR_RW); + v->arch.hvm.vmx.tertiary_exec_control |= TERTIARY_EXEC_ENABLE_MSRLIST; + vmx_update_tertiary_exec_control(v); + } + else if ( v->arch.hvm.vmx.tertiary_exec_control & + TERTIARY_EXEC_ENABLE_MSRLIST ) + { + vmx_set_msr_intercept(v, MSR_BARRIER, VMX_MSR_RW); + v->arch.hvm.vmx.tertiary_exec_control &= ~TERTIARY_EXEC_ENABLE_MSRLIST; + vmx_update_tertiary_exec_control(v); + } + out: vmx_vmcs_exit(v); @@ -XXX,XX +XXX,XX @@ gp_fault: return X86EMUL_EXCEPTION; } +static bool cf_check is_msrlist( + const struct x86_emulate_state *state, const struct x86_emulate_ctxt *ctxt) +{ + + if ( ctxt->opcode == X86EMUL_OPC(0x0f, 0x01) ) + { + unsigned int rm, reg; + int mode = x86_insn_modrm(state, &rm, ®); + + /* This also includes WRMSRNS; should be okay. 
*/ + return mode == 3 && rm == 6 && !reg; + } + + return false; +} + static void vmx_do_extint(struct cpu_user_regs *regs) { unsigned long vector; @@ -XXX,XX +XXX,XX @@ void asmlinkage vmx_vmexit_handler(struc } break; + case EXIT_REASON_RDMSRLIST: + case EXIT_REASON_WRMSRLIST: + if ( vmx_guest_x86_mode(v) != 8 || !currd->arch.cpuid->feat.msrlist ) + { + ASSERT_UNREACHABLE(); + hvm_inject_hw_exception(X86_EXC_UD, X86_EVENT_NO_EC); + } + else if ( !hvm_emulate_one_insn(is_msrlist, "MSR list") ) + hvm_inject_hw_exception(X86_EXC_GP, 0); + break; + case EXIT_REASON_VMXOFF: case EXIT_REASON_VMXON: case EXIT_REASON_VMCLEAR: --- a/xen/arch/x86/include/asm/hvm/vmx/vmcs.h +++ b/xen/arch/x86/include/asm/hvm/vmx/vmcs.h @@ -XXX,XX +XXX,XX @@ extern u32 vmx_secondary_exec_control; #define TERTIARY_EXEC_EPT_PAGING_WRITE BIT(2, UL) #define TERTIARY_EXEC_GUEST_PAGING_VERIFY BIT(3, UL) #define TERTIARY_EXEC_IPI_VIRT BIT(4, UL) +#define TERTIARY_EXEC_ENABLE_MSRLIST BIT(6, UL) #define TERTIARY_EXEC_VIRT_SPEC_CTRL BIT(7, UL) extern uint64_t vmx_tertiary_exec_control; @@ -XXX,XX +XXX,XX @@ extern u64 vmx_ept_vpid_cap; #define cpu_has_vmx_notify_vm_exiting \ (IS_ENABLED(CONFIG_INTEL_VMX) && \ vmx_secondary_exec_control & SECONDARY_EXEC_NOTIFY_VM_EXITING) +#define cpu_has_vmx_msrlist \ + (IS_ENABLED(CONFIG_INTEL_VMX) && \ + (vmx_tertiary_exec_control & TERTIARY_EXEC_ENABLE_MSRLIST)) #define VMCS_RID_TYPE_MASK 0x80000000U --- a/xen/arch/x86/include/asm/hvm/vmx/vmx.h +++ b/xen/arch/x86/include/asm/hvm/vmx/vmx.h @@ -XXX,XX +XXX,XX @@ static inline void pi_clear_sn(struct pi #define EXIT_REASON_XRSTORS 64 #define EXIT_REASON_BUS_LOCK 74 #define EXIT_REASON_NOTIFY 75 +#define EXIT_REASON_RDMSRLIST 78 +#define EXIT_REASON_WRMSRLIST 79 /* Remember to also update VMX_PERF_EXIT_REASON_SIZE! 
*/ /* --- a/xen/arch/x86/include/asm/msr-index.h +++ b/xen/arch/x86/include/asm/msr-index.h @@ -XXX,XX +XXX,XX @@ #define APIC_BASE_ENABLE (_AC(1, ULL) << 11) #define APIC_BASE_ADDR_MASK _AC(0x000ffffffffff000, ULL) +#define MSR_BARRIER 0x0000002f + #define MSR_TEST_CTRL 0x00000033 #define TEST_CTRL_SPLITLOCK_DETECT (_AC(1, ULL) << 29) #define TEST_CTRL_SPLITLOCK_DISABLE (_AC(1, ULL) << 31) --- a/xen/arch/x86/include/asm/perfc_defn.h +++ b/xen/arch/x86/include/asm/perfc_defn.h @@ -XXX,XX +XXX,XX @@ PERFCOUNTER_ARRAY(exceptions, #ifdef CONFIG_HVM -#define VMX_PERF_EXIT_REASON_SIZE 76 +#define VMX_PERF_EXIT_REASON_SIZE 80 #define VMEXIT_NPF_PERFC 143 #define SVM_PERF_EXIT_REASON_SIZE (VMEXIT_NPF_PERFC + 1) PERFCOUNTER_ARRAY(vmexits, "vmexits", --- a/xen/arch/x86/msr.c +++ b/xen/arch/x86/msr.c @@ -XXX,XX +XXX,XX @@ int guest_rdmsr(struct vcpu *v, uint32_t case MSR_AMD_PPIN: goto gp_fault; + case MSR_BARRIER: + if ( !cp->feat.msrlist ) + goto gp_fault; + *val = 0; + break; + case MSR_IA32_FEATURE_CONTROL: /* * Architecturally, availability of this MSR is enumerated by the @@ -XXX,XX +XXX,XX @@ int guest_wrmsr(struct vcpu *v, uint32_t uint64_t rsvd; /* Read-only */ + case MSR_BARRIER: case MSR_IA32_PLATFORM_ID: case MSR_CORE_CAPABILITIES: case MSR_INTEL_CORE_THREAD_COUNT: --- a/xen/arch/x86/x86_emulate/0f01.c +++ b/xen/arch/x86/x86_emulate/0f01.c @@ -XXX,XX +XXX,XX @@ #include "private.h" #ifdef __XEN__ +#include <xen/event.h> #include <asm/prot-key.h> #endif @@ -XXX,XX +XXX,XX @@ int x86emul_0f01(struct x86_emulate_stat switch ( s->modrm ) { unsigned long base, limit, cr0, cr0w, cr4; + unsigned int n; struct segment_register sreg; uint64_t msr_val; @@ -XXX,XX +XXX,XX @@ int x86emul_0f01(struct x86_emulate_stat ((uint64_t)regs->r(dx) << 32) | regs->eax, ctxt); goto done; + + case vex_f3: /* wrmsrlist */ + vcpu_must_have(msrlist); + generate_exception_if(!mode_64bit(), X86_EXC_UD); + generate_exception_if(!mode_ring0() || (regs->esi & 7) || + (regs->edi & 7), + X86_EXC_GP, 0); + fail_if(!ops->write_msr); + while ( regs->r(cx) ) + { + n = __builtin_ffsl(regs->r(cx)) - 1; + if ( (rc = ops->read(x86_seg_none, regs->r(si) + n * 8, + &msr_val, 8, ctxt)) != X86EMUL_OKAY ) + break; + generate_exception_if(msr_val != (uint32_t)msr_val, + X86_EXC_GP, 0); + base = msr_val; + if ( (rc = ops->read(x86_seg_none, regs->r(di) + n * 8, + &msr_val, 8, ctxt)) != X86EMUL_OKAY || + (rc = ops->write_msr(base, msr_val, ctxt)) != X86EMUL_OKAY ) + break; + regs->r(cx) &= ~(1UL << n); + +#ifdef __XEN__ + if ( regs->r(cx) && local_events_need_delivery() ) + { + rc = X86EMUL_RETRY; + break; + } +#endif + } + goto done; + + case vex_f2: /* rdmsrlist */ + vcpu_must_have(msrlist); + generate_exception_if(!mode_64bit(), X86_EXC_UD); + generate_exception_if(!mode_ring0() || (regs->esi & 7) || + (regs->edi & 7), + X86_EXC_GP, 0); + fail_if(!ops->read_msr || !ops->write); + while ( regs->r(cx) ) + { + n = __builtin_ffsl(regs->r(cx)) - 1; + if ( (rc = ops->read(x86_seg_none, regs->r(si) + n * 8, + &msr_val, 8, ctxt)) != X86EMUL_OKAY ) + break; + generate_exception_if(msr_val != (uint32_t)msr_val, + X86_EXC_GP, 0); + if ( (rc = ops->read_msr(msr_val, &msr_val, + ctxt)) != X86EMUL_OKAY || + (rc = ops->write(x86_seg_none, regs->r(di) + n * 8, + &msr_val, 8, ctxt)) != X86EMUL_OKAY ) + break; + regs->r(cx) &= ~(1UL << n); + } + if ( rc != X86EMUL_OKAY ) + ctxt->regs->r(cx) = regs->r(cx); + goto done; } generate_exception(X86_EXC_UD); --- a/xen/arch/x86/x86_emulate/private.h +++ b/xen/arch/x86/x86_emulate/private.h @@ -XXX,XX +XXX,XX 
@@ amd_like(const struct x86_emulate_ctxt * #define vcpu_has_lkgs() (ctxt->cpuid->feat.lkgs) #define vcpu_has_wrmsrns() (ctxt->cpuid->feat.wrmsrns) #define vcpu_has_avx_ifma() (ctxt->cpuid->feat.avx_ifma) +#define vcpu_has_msrlist() (ctxt->cpuid->feat.msrlist) #define vcpu_has_avx_vnni_int8() (ctxt->cpuid->feat.avx_vnni_int8) #define vcpu_has_avx_ne_convert() (ctxt->cpuid->feat.avx_ne_convert) #define vcpu_has_avx_vnni_int16() (ctxt->cpuid->feat.avx_vnni_int16) --- a/xen/arch/x86/x86_emulate/util.c +++ b/xen/arch/x86/x86_emulate/util.c @@ -XXX,XX +XXX,XX @@ bool cf_check x86_insn_is_mem_access(con break; case X86EMUL_OPC(0x0f, 0x01): + /* {RD,WR}MSRLIST */ + if ( mode_64bit() && s->modrm == 0xc6 ) + return s->vex.pfx >= vex_f3; /* Cover CLZERO. */ return (s->modrm_rm & 7) == 4 && (s->modrm_reg & 7) == 7; } @@ -XXX,XX +XXX,XX @@ bool cf_check x86_insn_is_mem_write(cons case 0xff: /* Grp5 */ break; - case X86EMUL_OPC(0x0f, 0x01): /* CLZERO is the odd one. */ + case X86EMUL_OPC(0x0f, 0x01): + /* RDMSRLIST */ + if ( mode_64bit() && s->modrm == 0xc6 ) + return s->vex.pfx == vex_f2; + /* CLZERO is another odd one. */ return (s->modrm_rm & 7) == 4 && (s->modrm_reg & 7) == 7; default: --- a/xen/include/public/arch-x86/cpufeatureset.h +++ b/xen/include/public/arch-x86/cpufeatureset.h @@ -XXX,XX +XXX,XX @@ XEN_CPUFEATURE(FRED, 10*32+17) / XEN_CPUFEATURE(LKGS, 10*32+18) /*S Load Kernel GS Base */ XEN_CPUFEATURE(WRMSRNS, 10*32+19) /*S WRMSR Non-Serialising */ XEN_CPUFEATURE(AVX_IFMA, 10*32+23) /*A AVX-IFMA Instructions */ +XEN_CPUFEATURE(MSRLIST, 10*32+27) /*s MSR list instructions */ /* AMD-defined CPU features, CPUID level 0x80000021.eax, word 11 */ XEN_CPUFEATURE(NO_NEST_BP, 11*32+ 0) /*A No Nested Data Breakpoints */ --- a/xen/tools/gen-cpuid.py +++ b/xen/tools/gen-cpuid.py @@ -XXX,XX +XXX,XX @@ def crunch_numbers(state): # superpages, PCID and PKU are only available in 4 level paging. # NO_LMSL indicates the absense of Long Mode Segment Limits, which # have been dropped in hardware. - LM: [CX16, PCID, LAHF_LM, PAGE1GB, PKU, NO_LMSL, LKGS, CMPCCXADD], + LM: [CX16, PCID, LAHF_LM, PAGE1GB, PKU, NO_LMSL, LKGS, CMPCCXADD, + MSRLIST], # AMD K6-2+ and K6-III processors shipped with 3DNow+, beyond the # standard 3DNow in the earlier K6 processors.
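
[For orientation: RCX is a bitmap selecting which of 64 table slots to process,
RSI points at the (8-byte per entry) MSR index table and RDI at the value
table; each handled bit is cleared from RCX, which is what makes the insns
restartable and permits the preemption check above. The WRMSRLIST core from
the 0f01.c hunk, condensed - load64() and wrmsr_checked() are illustrative
stand-ins for the ops-> hooks:

    while ( rcx )
    {
        unsigned int n = __builtin_ffsl(rcx) - 1;
        uint64_t idx = load64(rsi + n * 8);      /* MSR index table slot n */

        if ( idx != (uint32_t)idx )
            raise_gp(0);                         /* indices are 32 bits */
        wrmsr_checked(idx, load64(rdi + n * 8)); /* value table slot n */
        rcx &= ~(1UL << n);                      /* visible progress */
    }
]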
To represent the USER-MSR bitmap access, a new segment type needs
introducing, behaving like x86_seg_none in terms of address treatment,
but behaving like a system segment for page walk purposes (implicit
supervisor-mode access).

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
This feels a little fragile: Of course I did look through uses of the
enumerators, and I didn't find further places which would need
adjustment, but I'm not really sure I didn't miss any place.
---
v3: New.

--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -XXX,XX +XXX,XX @@ static int read(
     default:
         if ( !is_x86_user_segment(seg) )
             return X86EMUL_UNHANDLEABLE;
+    case x86_seg_sys:
     case x86_seg_none:
         bytes_read += bytes;
         break;
--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -XXX,XX +XXX,XX @@ static int hvmemul_virtual_to_linear(
     int okay;
     unsigned long reps = 1;
 
-    if ( seg == x86_seg_none )
+    if ( seg == x86_seg_none || seg == x86_seg_sys )
     {
         *linear = offset;
         return X86EMUL_OKAY;
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -XXX,XX +XXX,XX @@ bool hvm_vcpu_virtual_to_linear(
      * It is expected that the access rights of reg are suitable for seg (and
      * that this is enforced at the point that seg is loaded).
      */
-    ASSERT(seg < x86_seg_none);
+    ASSERT(seg < x86_seg_sys);
 
     /* However, check that insn fetches only ever specify CS. */
     ASSERT(access_type != hvm_access_insn_fetch || seg == x86_seg_cs);
--- a/xen/arch/x86/hvm/svm/svm.c
+++ b/xen/arch/x86/hvm/svm/svm.c
@@ -XXX,XX +XXX,XX @@ static void cf_check svm_set_segment_reg
         vmcb->ldtr = *reg;
         break;
 
+    case x86_seg_sys:
     case x86_seg_none:
         ASSERT_UNREACHABLE();
         break;
--- a/xen/arch/x86/x86_emulate/x86_emulate.h
+++ b/xen/arch/x86/x86_emulate/x86_emulate.h
@@ -XXX,XX +XXX,XX @@ enum x86_segment {
     x86_seg_ldtr,
     x86_seg_gdtr,
     x86_seg_idtr,
-    /* No Segment: For accesses which are already linear. */
+    /* No Segment: For (system/normal) accesses which are already linear. */
+    x86_seg_sys,
     x86_seg_none
 };
While UWRMSR probably isn't of much use as long as we don't support UINTR, URDMSR may well be useful to guests even without that (depending on what OSes are willing to permit access to). Since the two VEX encodings introduce a lonely opcode point in map 7, for now don't bother introducing a full 256-entry table. Signed-off-by: Jan Beulich <jbeulich@suse.com> --- The retaining of (possible) #PF from the bitmap access is "speculative" (the spec doesn't mention #PF as a possible exception; conceivably this might also need converting to #GP). I'm a little wary of the "MSRs Writeable by UWRMSR" table that the spec has, and that our code thus also enforces: As new MSRs are added to that table, we'll need piecemeal updates to that switch() statement. The choice of using APERF in the test harness is connected to the also pending RDPRU patch, where the register needs handling anyway. --- v5: Correct ModR/M.reg check for VEX-encoded forms. Cosmetic test harness adjustment. Re-base. v4: MSR index input regs are 64-bit (albeit only the APX spec has it this way for now). v3: New. --- a/tools/tests/x86_emulator/predicates.c +++ b/tools/tests/x86_emulator/predicates.c @@ -XXX,XX +XXX,XX @@ static const struct { { { 0xf6 }, { 2, 2 }, T, R, pfx_66 }, /* adcx */ { { 0xf6 }, { 2, 2 }, T, R, pfx_f3 }, /* adox */ { { 0xf8 }, { 2, 2 }, F, W, pfx_66 }, /* movdir64b */ + { { 0xf8, 0xc0 }, { 0, 2 }, F, N, pfx_f3 }, /* uwrmsr */ { { 0xf8 }, { 2, 2 }, F, W, pfx_f3 }, /* enqcmds */ + { { 0xf8, 0xc0 }, { 0, 2 }, F, N, pfx_f2 }, /* urdmsr */ { { 0xf8 }, { 2, 2 }, F, W, pfx_f2 }, /* enqcmd */ { { 0xf9 }, { 2, 2 }, F, W }, /* movdiri */ }; @@ -XXX,XX +XXX,XX @@ static const struct vex { { { 0xde }, 3, T, R, pfx_66, W0, L0 }, /* vsm3rnds2 */ { { 0xdf }, 3, T, R, pfx_66, WIG, Ln }, /* vaeskeygenassist */ { { 0xf0 }, 3, T, R, pfx_f2, Wn, L0 }, /* rorx */ +}, vex_map7[] = { + { { 0xf8, 0xc0 }, 6, F, N, pfx_f3, W0, L0 }, /* uwrmsr */ + { { 0xf8, 0xc0 }, 6, F, N, pfx_f2, W0, L0 }, /* urdmsr */ }; static const struct { @@ -XXX,XX +XXX,XX @@ static const struct { { vex_0f, ARRAY_SIZE(vex_0f) }, { vex_0f38, ARRAY_SIZE(vex_0f38) }, { vex_0f3a, ARRAY_SIZE(vex_0f3a) }, + { NULL, 0 }, /* map 4 */ + { NULL, 0 }, /* map 5 */ + { NULL, 0 }, /* map 6 */ + { vex_map7, ARRAY_SIZE(vex_map7) }, }; static const struct xop { @@ -XXX,XX +XXX,XX @@ void predicates_test(void *instr, struct if ( vex[x].tbl[t].w == WIG || (vex[x].tbl[t].w & W0) ) { - memcpy(ptr, vex[x].tbl[t].opc, vex[x].tbl[t].len); + memcpy(ptr, vex[x].tbl[t].opc, + MIN(vex[x].tbl[t].len, ARRAY_SIZE(vex->tbl->opc))); if ( vex[x].tbl[t].l == LIG || (vex[x].tbl[t].l & L0) ) do_test(instr, vex[x].tbl[t].len + ((void *)ptr - instr), @@ -XXX,XX +XXX,XX @@ void predicates_test(void *instr, struct if ( vex[x].tbl[t].l == LIG || (vex[x].tbl[t].l & L1) ) { ptr[-1] |= 4; - memcpy(ptr, vex[x].tbl[t].opc, vex[x].tbl[t].len); + memcpy(ptr, vex[x].tbl[t].opc, + MIN(vex[x].tbl[t].len, ARRAY_SIZE(vex->tbl->opc))); do_test(instr, vex[x].tbl[t].len + ((void *)ptr - instr), vex[x].tbl[t].modrm ? 
(void *)ptr - instr + 1 : 0, @@ -XXX,XX +XXX,XX @@ void predicates_test(void *instr, struct if ( vex[x].tbl[t].w == WIG || (vex[x].tbl[t].w & W1) ) { ptr[-1] = 0xf8 | vex[x].tbl[t].pfx; - memcpy(ptr, vex[x].tbl[t].opc, vex[x].tbl[t].len); + memcpy(ptr, vex[x].tbl[t].opc, + MIN(vex[x].tbl[t].len, ARRAY_SIZE(vex->tbl->opc))); if ( vex[x].tbl[t].l == LIG || (vex[x].tbl[t].l & L0) ) do_test(instr, vex[x].tbl[t].len + ((void *)ptr - instr), @@ -XXX,XX +XXX,XX @@ void predicates_test(void *instr, struct if ( vex[x].tbl[t].l == LIG || (vex[x].tbl[t].l & L1) ) { ptr[-1] |= 4; - memcpy(ptr, vex[x].tbl[t].opc, vex[x].tbl[t].len); + memcpy(ptr, vex[x].tbl[t].opc, + MIN(vex[x].tbl[t].len, ARRAY_SIZE(vex->tbl->opc))); do_test(instr, vex[x].tbl[t].len + ((void *)ptr - instr), vex[x].tbl[t].modrm ? (void *)ptr - instr + 1 : 0, --- a/tools/tests/x86_emulator/test_x86_emulator.c +++ b/tools/tests/x86_emulator/test_x86_emulator.c @@ -XXX,XX +XXX,XX @@ static int blk( #ifdef __x86_64__ static unsigned long gs_base, gs_base_shadow; +static unsigned long uintr_timer; #endif static int read_segment( @@ -XXX,XX +XXX,XX @@ static int write_segment( return X86EMUL_OKAY; } + +static const uint8_t __attribute__((aligned(0x1000))) umsr_bitmap[0x1000] = { +#define RD(msr) [(msr) >> 3] = 1 << ((msr) & 7) +#define WR(msr) [0x800 + ((msr) >> 3)] = 1 << ((msr) & 7) + RD(0x000000e8), /* APERF */ + WR(0x00001b00), /* UINTR_TIMER */ +#undef WR +#undef RD +}; #endif static int read_msr( @@ -XXX,XX +XXX,XX @@ static int read_msr( { switch ( reg ) { +#ifdef __x86_64__ + case 0x0000001c: /* USER_MSR_CTL */ + *val = (unsigned long)umsr_bitmap | 1; + return X86EMUL_OKAY; +#endif + case 0x0000002f: /* BARRIER */ *val = 0; return X86EMUL_OKAY; + case 0x000000e8: /* APERF */ +#define APERF_LO_VALUE 0xAEAEAEAE +#define APERF_HI_VALUE 0xEAEAEAEA + *val = ((uint64_t)APERF_HI_VALUE << 32) | APERF_LO_VALUE; + return X86EMUL_OKAY; + case 0xc0000080: /* EFER */ *val = ctxt->addr_size > 32 ? 
0x500 /* LME|LMA */ : 0; return X86EMUL_OKAY; @@ -XXX,XX +XXX,XX @@ static int write_msr( { switch ( reg ) { + case 0x00001b00: /* UINTR_TIMER */ + if ( ctxt->addr_size < 64 ) + break; + uintr_timer = val; + return X86EMUL_OKAY; + case 0xc0000101: /* GS_BASE */ if ( ctxt->addr_size < 64 || !is_canonical_address(val) ) break; @@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv) (gs_base != 0x0000222244446666UL) || (gs_base_shadow != 0x0000111122224444UL) ) goto fail; + printf("okay\n"); + + printf("%-40s", "Testing urdmsr %rdx,%rcx..."); + instr[0] = 0xf2; instr[1] = 0x0f; instr[2] = 0x38; instr[3] = 0xf8; instr[4] = 0xd1; + regs.rip = (unsigned long)&instr[0]; + regs.rdx = 0x000000e8UL; /* APERF */ + rc = x86_emulate(&ctxt, &emulops); + if ( (rc != X86EMUL_OKAY) || + (regs.rip != (unsigned long)&instr[5]) || + (regs.rcx != (((uint64_t)APERF_HI_VALUE << 32) | APERF_LO_VALUE)) ) + goto fail; + printf("okay\n"); + + printf("%-40s", "Testing urdmsr $0xe8,%rdx..."); + instr[0] = 0xc4; instr[1] = 0xe7; instr[2] = 0x7b; instr[3] = 0xf8; instr[4] = 0xc2; + instr[5] = 0xe8; instr[6] = 0; instr[7] = 0; instr[8] = 0; + regs.rip = (unsigned long)&instr[0]; + rc = x86_emulate(&ctxt, &emulops); + if ( (rc != X86EMUL_OKAY) || + (regs.rip != (unsigned long)&instr[9]) || + (regs.rdx != (((uint64_t)APERF_HI_VALUE << 32) | APERF_LO_VALUE)) ) + goto fail; + printf("okay\n"); + + printf("%-40s", "Testing uwrmsr %rdi,%rsi..."); + instr[0] = 0xf3; instr[1] = 0x0f; instr[2] = 0x38; instr[3] = 0xf8; instr[4] = 0xf7; + regs.rip = (unsigned long)&instr[0]; + regs.rsi = 0x00001b00UL; /* UINTR_TIMER */ + regs.rdi = 0x0011223344556677UL; + rc = x86_emulate(&ctxt, &emulops); + if ( (rc != X86EMUL_OKAY) || + (regs.rip != (unsigned long)&instr[5]) || + (uintr_timer != 0x0011223344556677UL) ) + goto fail; + printf("okay\n"); + + printf("%-40s", "Testing uwrmsr %rsi,$0x1b00..."); + instr[0] = 0xc4; instr[1] = 0xe7; instr[2] = 0x7a; instr[3] = 0xf8; instr[4] = 0xc6; + instr[5] = 0x00; instr[6] = 0x1b; instr[7] = 0; instr[8] = 0; + regs.rip = (unsigned long)&instr[0]; + regs.rsi = 0x8877665544332211UL; + rc = x86_emulate(&ctxt, &emulops); + if ( (rc != X86EMUL_OKAY) || + (regs.rip != (unsigned long)&instr[9]) || + (uintr_timer != 0x8877665544332211UL) ) + goto fail; + printf("okay\n"); + + printf("%-40s", "Testing uwrmsr %rsi,$0x1b01..."); + instr[5] = 0x01; /* UARCH_MISC_CTL (derived from UINTR_TIMER) */ + regs.rip = (unsigned long)&instr[0]; + regs.rsi = 0; + rc = x86_emulate(&ctxt, &emulops); + if ( (rc != X86EMUL_EXCEPTION) || + (regs.rip != (unsigned long)&instr[0]) || + (uintr_timer != 0x8877665544332211UL) ) + goto fail; emulops.write_msr = NULL; #endif --- a/xen/arch/x86/include/asm/msr-index.h +++ b/xen/arch/x86/include/asm/msr-index.h @@ -XXX,XX +XXX,XX @@ #define APIC_BASE_ENABLE (_AC(1, ULL) << 11) #define APIC_BASE_ADDR_MASK _AC(0x000ffffffffff000, ULL) +#define MSR_USER_MSR_CTL 0x0000001c +#define USER_MSR_ENABLE (_AC(1, ULL) << 0) +#define USER_MSR_ADDR_MASK 0xfffffffffffff000ULL + #define MSR_BARRIER 0x0000002f #define MSR_TEST_CTRL 0x00000033 --- a/xen/arch/x86/x86_emulate/decode.c +++ b/xen/arch/x86/x86_emulate/decode.c @@ -XXX,XX +XXX,XX @@ decode_0f38(struct x86_emulate_state *s, { case 0x00 ... 0xef: case 0xf2 ... 0xf5: - case 0xf7 ... 0xf8: + case 0xf7: case 0xfa ... 
0xff: s->op_bytes = 0; /* fall through */ @@ -XXX,XX +XXX,XX @@ decode_0f38(struct x86_emulate_state *s, case X86EMUL_OPC_VEX_F2(0, 0xf7): /* shrx */ break; + case 0xf8: + if ( s->modrm_mod == 3 ) /* u{rd,wr}msr */ + { + s->desc = DstMem | SrcReg | Mov; + s->op_bytes = 8; + s->simd_size = simd_none; + } + else /* movdir64b / enqcmd{,s} */ + s->op_bytes = 0; + ctxt->opcode |= MASK_INSR(s->vex.pfx, X86EMUL_OPC_PFX_MASK); + break; + default: s->op_bytes = 0; break; @@ -XXX,XX +XXX,XX @@ int x86emul_decode(struct x86_emulate_st */ d = twobyte_table[0x38].desc; break; + + case vex_map7: + opcode |= MASK_INSR(7, X86EMUL_OPC_EXT_MASK); + /* + * No table lookup here for now, as there's only a single + * opcode point (0xf8) populated in map 7. + */ + d = DstMem | SrcImm | ModRM | Mov; + s->op_bytes = 8; + break; } } else if ( s->ext < ext_8f08 + ARRAY_SIZE(xop_table) ) @@ -XXX,XX +XXX,XX @@ int x86emul_decode(struct x86_emulate_st s->simd_size = ext8f09_table[b].simd_size; break; + case ext_map7: case ext_8f08: case ext_8f0a: /* @@ -XXX,XX +XXX,XX @@ int x86emul_decode(struct x86_emulate_st case ext_map5: case ext_map6: + case ext_map7: case ext_8f09: case ext_8f0a: break; --- a/xen/arch/x86/x86_emulate/private.h +++ b/xen/arch/x86/x86_emulate/private.h @@ -XXX,XX +XXX,XX @@ enum vex_opcx { vex_0f3a, evex_map5 = 5, evex_map6, + vex_map7, }; enum vex_pfx { @@ -XXX,XX +XXX,XX @@ struct x86_emulate_state { ext_0f3a = vex_0f3a, ext_map5 = evex_map5, ext_map6 = evex_map6, + ext_map7 = vex_map7, /* * For XOP use values such that the respective instruction field * can be used without adjustment. --- a/xen/arch/x86/x86_emulate/x86_emulate.c +++ b/xen/arch/x86/x86_emulate/x86_emulate.c @@ -XXX,XX +XXX,XX @@ x86_emulate( state->simd_size = simd_none; break; - case X86EMUL_OPC_F2(0x0f38, 0xf8): /* enqcmd r,m512 */ - case X86EMUL_OPC_F3(0x0f38, 0xf8): /* enqcmds r,m512 */ + case X86EMUL_OPC_F2(0x0f38, 0xf8): /* enqcmd r,m512 / urdmsr r32,r64 */ + case X86EMUL_OPC_F3(0x0f38, 0xf8): /* enqcmds r,m512 / uwrmsr r64,r32 */ + if ( ea.type == OP_MEM ) + goto enqcmd; + imm1 = src.val; + /* fall through */ + case X86EMUL_OPC_VEX_F2(7, 0xf8): /* urdmsr imm32,r64 */ + case X86EMUL_OPC_VEX_F3(7, 0xf8): /* uwrmsr r64,imm32 */ + generate_exception_if(!mode_64bit() || ea.type != OP_REG, X86_EXC_UD); + generate_exception_if(vex.l || vex.w, X86_EXC_UD); + generate_exception_if(vex.opcx && ((modrm_reg & 7) || vex.reg != 0xf), + X86_EXC_UD); + fail_if(!ops->read_msr); + if ( ops->read_msr(MSR_USER_MSR_CTL, &msr_val, ctxt) != X86EMUL_OKAY ) + { + x86_emul_reset_event(ctxt); + msr_val = 0; + } + generate_exception_if(!(msr_val & USER_MSR_ENABLE), X86_EXC_UD); + generate_exception_if(imm1 & ~0x3fff, X86_EXC_GP, 0); + + /* Check the corresponding bitmap. */ + ea.mem.off = msr_val & ~0xfff; + if ( vex.pfx != vex_f2 ) + ea.mem.off += 0x800; + ea.mem.off += imm1 >> 3; + if ( (rc = ops->read(x86_seg_sys, ea.mem.off, &b, 1, + ctxt)) != X86EMUL_OKAY ) + goto done; + generate_exception_if(!(b & (1 << (imm1 & 7))), X86_EXC_GP, 0); + + /* Carry out the actual MSR access. 
*/ + if ( vex.pfx == vex_f2 ) + { + /* urdmsr */ + if ( (rc = ops->read_msr(imm1, &msr_val, ctxt)) != X86EMUL_OKAY ) + goto done; + dst.val = msr_val; + ASSERT(dst.type == OP_REG); + dst.bytes = 8; + } + else + { + /* uwrmsr */ + switch ( imm1 ) + { + case 0x1b00: /* UINTR_TIMER */ + case 0x1b01: /* UARCH_MISC_CTL */ + break; + default: + generate_exception(X86_EXC_GP, 0); + } + fail_if(!ops->write_msr); + if ( (rc = ops->write_msr(imm1, dst.val, ctxt)) != X86EMUL_OKAY ) + goto done; + dst.type = OP_NONE; + } + break; + + enqcmd: host_and_vcpu_must_have(enqcmd); - generate_exception_if(ea.type != OP_MEM, X86_EXC_UD); generate_exception_if(vex.pfx != vex_f2 && !mode_ring0(), X86_EXC_GP, 0); src.val = truncate_ea(*dst.reg); generate_exception_if(!is_aligned(x86_seg_es, src.val, 64, ctxt, ops), --- a/xen/include/public/arch-x86/cpufeatureset.h +++ b/xen/include/public/arch-x86/cpufeatureset.h @@ -XXX,XX +XXX,XX @@ XEN_CPUFEATURE(AVX_VNNI_INT8, 15*32 XEN_CPUFEATURE(AVX_NE_CONVERT, 15*32+ 5) /*A AVX-NE-CONVERT Instructions */ XEN_CPUFEATURE(AVX_VNNI_INT16, 15*32+10) /*A AVX-VNNI-INT16 Instructions */ XEN_CPUFEATURE(PREFETCHI, 15*32+14) /*A PREFETCHIT{0,1} Instructions */ +XEN_CPUFEATURE(USER_MSR, 15*32+15) /* U{RD,WR}MSR Instructions */ XEN_CPUFEATURE(CET_SSS, 15*32+18) /* CET Supervisor Shadow Stacks safe to use */ /* Intel-defined CPU features, MSR_ARCH_CAPS 0x10a.eax, word 16 */ --- a/xen/tools/gen-cpuid.py +++ b/xen/tools/gen-cpuid.py @@ -XXX,XX +XXX,XX @@ def crunch_numbers(state): # NO_LMSL indicates the absense of Long Mode Segment Limits, which # have been dropped in hardware. LM: [CX16, PCID, LAHF_LM, PAGE1GB, PKU, NO_LMSL, LKGS, CMPCCXADD, - MSRLIST], + MSRLIST, USER_MSR], # AMD K6-2+ and K6-III processors shipped with 3DNow+, beyond the # standard 3DNow in the earlier K6 processors.
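
[The permission scheme implemented above: MSR_USER_MSR_CTL holds an enable bit
plus the linear address of a 4k bitmap, with read permissions in the first
0x800 bytes and write permissions in the second half, one bit per MSR, and
only 14-bit MSR indices representable (the same layout umsr_bitmap[] in the
test harness encodes). The check in isolation, as a sketch - read_byte()
stands in for ops->read() from x86_seg_sys:

    bool user_msr_ok(uint64_t ctl, unsigned int msr, bool write)
    {
        unsigned long bitmap = ctl & USER_MSR_ADDR_MASK; /* 4k-aligned */

        if ( !(ctl & USER_MSR_ENABLE) )
            return false;                        /* insn raises #UD */
        if ( msr & ~0x3fff )
            return false;                        /* out of range: #GP(0) */
        return read_byte(bitmap + (write ? 0x800 : 0) + (msr >> 3)) &
               (1 << (msr & 7));
    }
]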
Move the PKS check into an "else" for the corresponding "if()", such that
further adjustments (like for USER_MSR) can easily be put there as well.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v5: Re-base.
v4: New.

--- a/xen/arch/x86/cpu-policy.c
+++ b/xen/arch/x86/cpu-policy.c
@@ -XXX,XX +XXX,XX @@ static void __init calculate_hvm_max_pol
         if ( !cpu_has_vmx_xsaves )
             __clear_bit(X86_FEATURE_XSAVES, fs);
     }
+    else
+    {
+        /*
+         * Xen doesn't use PKS, so the guest support for it has opted to not use
+         * the VMCS load/save controls for efficiency reasons.  This depends on
+         * the exact vmentry/exit behaviour, so don't expose PKS in other
+         * situations until someone has cross-checked the behaviour for safety.
+         */
+        __clear_bit(X86_FEATURE_PKS, fs);
+    }
 
     if ( !cpu_has_vmx_msrlist )
         __clear_bit(X86_FEATURE_MSRLIST, fs);
 
-    /*
-     * Xen doesn't use PKS, so the guest support for it has opted to not use
-     * the VMCS load/save controls for efficiency reasons.  This depends on
-     * the exact vmentry/exit behaviour, so don't expose PKS in other
-     * situations until someone has cross-checked the behaviour for safety.
-     */
-    if ( !cpu_has_vmx )
-        __clear_bit(X86_FEATURE_PKS, fs);
-
     /*
      * Make adjustments to possible (nested) virtualization features exposed
      * to the guest
Hook up the new VM exit codes and handle guest accesses, context switch, and save/restore. At least for now don't allow the guest direct access to the control MSR; this may need changing if guests were to frequently access it (e.g. on their own context switch path). While there also correct a one-off in union ldt_or_tr_instr_info's comment. Signed-off-by: Jan Beulich <jbeulich@suse.com> --- Needing to change two places in hvm.c continues to be unhelpful; I recall I already did forget to also adjust hvm_load_cpu_msrs() for XFD. Considering that MSRs typically arrive in the order the table has it, couldn't we incrementally look up the incoming MSR index there, falling back to a full lookup only when the incremental lookup failed (and thus not normally re-iterating through the initial part of the array)? Said comment in union ldt_or_tr_instr_info is further odd (same for union gdt_or_idt_instr_info's) in that Instruction Information is only a 32-bit field. Hence bits 32-63 aren't undefined, but simply don't exist. RFC: The wee attempt to "deal" with nested is likely wrong, but I'm afraid I simply don't know where such enforcement would be done properly. Returning an error there is also commented out, for domain_cpu_policy_changed() returning void without "x86/xstate: re-size save area when CPUID policy changes" in place. --- v5: Introduce user_msr_gpr(). v4: New. --- a/xen/arch/x86/cpu-policy.c +++ b/xen/arch/x86/cpu-policy.c @@ -XXX,XX +XXX,XX @@ static void __init calculate_hvm_max_pol * situations until someone has cross-checked the behaviour for safety. */ __clear_bit(X86_FEATURE_PKS, fs); + + /* + * Don't expose USER_MSR until it is known how (if at all) it is + * virtualized on SVM. + */ + __clear_bit(X86_FEATURE_USER_MSR, fs); } if ( !cpu_has_vmx_msrlist ) --- a/xen/arch/x86/domain.c +++ b/xen/arch/x86/domain.c @@ -XXX,XX +XXX,XX @@ void domain_cpu_policy_changed(struct do } } + /* Nested doesn't have the necessary processing, yet. */ + if ( nestedhvm_enabled(d) && p->feat.user_msr ) + return /* -EINVAL */; + for_each_vcpu ( d, v ) { cpu_policy_updated(v); --- a/xen/arch/x86/hvm/hvm.c +++ b/xen/arch/x86/hvm/hvm.c @@ -XXX,XX +XXX,XX @@ static int cf_check hvm_load_cpu_xsave_s #define HVM_CPU_MSR_SIZE(cnt) offsetof(struct hvm_msr, msr[cnt]) static const uint32_t msrs_to_send[] = { + MSR_USER_MSR_CTL, MSR_SPEC_CTRL, MSR_INTEL_MISC_FEATURES_ENABLES, MSR_PKRS, @@ -XXX,XX +XXX,XX @@ static int cf_check hvm_load_cpu_msrs(st { int rc; + case MSR_USER_MSR_CTL: case MSR_SPEC_CTRL: case MSR_INTEL_MISC_FEATURES_ENABLES: case MSR_PKRS: --- a/xen/arch/x86/hvm/vmx/vmx.c +++ b/xen/arch/x86/hvm/vmx/vmx.c @@ -XXX,XX +XXX,XX @@ static void cf_check vmx_vcpu_destroy(st } /* - * To avoid MSR save/restore at every VM exit/entry time, we restore - * the x86_64 specific MSRs at domain switch time. Since these MSRs - * are not modified once set for para domains, we don't save them, - * but simply reset them to values set in percpu_traps_init(). + * To avoid MSR save/restore at every VM exit/entry time, we restore the + * x86_64 specific MSRs at vcpu switch time. Since these MSRs are not + * modified once set for para domains, we don't save them, but simply clear + * them or reset them to values set in percpu_traps_init(). */ -static void vmx_restore_host_msrs(void) +static void vmx_restore_host_msrs(const struct vcpu *v) { + const struct vcpu_msrs *msrs = v->arch.msrs; + + if ( msrs->user_msr_ctl.enable ) + wrmsrl(MSR_USER_MSR_CTL, 0); + /* No PV guests? No need to restore host SYSCALL infrastructure. 
*/ if ( !IS_ENABLED(CONFIG_PV) ) return; @@ -XXX,XX +XXX,XX @@ static void vmx_restore_guest_msrs(struc if ( cp->feat.pks ) wrpkrs(msrs->pkrs); + + if ( msrs->user_msr_ctl.enable ) + wrmsrl(MSR_USER_MSR_CTL, msrs->user_msr_ctl.raw); } void vmx_update_cpu_exec_control(struct vcpu *v) @@ -XXX,XX +XXX,XX @@ static void cf_check vmx_ctxt_switch_fro if ( !v->arch.fully_eager_fpu ) vmx_fpu_leave(v); vmx_save_guest_msrs(v); - vmx_restore_host_msrs(); + vmx_restore_host_msrs(v); vmx_save_dr(v); if ( v->domain->arch.hvm.pi_ops.flags & PI_CSW_FROM ) @@ -XXX,XX +XXX,XX @@ static int vmx_handle_apic_write(void) return vlapic_apicv_write(current, exit_qualification & 0xfff); } +static unsigned int user_msr_gpr(void) +{ + user_msr_instr_info_t info; + + __vmread(VMX_INSTRUCTION_INFO, &info.raw); + return info.gpr; +} + static void undo_nmis_unblocked_by_iret(void) { unsigned long guest_info; @@ -XXX,XX +XXX,XX @@ void asmlinkage vmx_vmexit_handler(struc hvm_inject_hw_exception(X86_EXC_GP, 0); break; + case EXIT_REASON_URDMSR: + { + uint64_t msr_content = 0; + + __vmread(EXIT_QUALIFICATION, &exit_qualification); + switch ( hvm_msr_read_intercept(exit_qualification, &msr_content) ) + { + case X86EMUL_OKAY: + *decode_gpr(regs, user_msr_gpr()) = msr_content; + update_guest_eip(); /* Safe: URDMSR */ + break; + + case X86EMUL_EXCEPTION: + hvm_inject_hw_exception(X86_EXC_GP, 0); + break; + } + break; + } + + case EXIT_REASON_UWRMSR: + __vmread(EXIT_QUALIFICATION, &exit_qualification); + switch ( hvm_msr_write_intercept(exit_qualification, + *decode_gpr(regs, user_msr_gpr()), + true) ) + { + case X86EMUL_OKAY: + update_guest_eip(); /* Safe: UWRMSR */ + break; + + case X86EMUL_EXCEPTION: + hvm_inject_hw_exception(X86_EXC_GP, 0); + break; + } + break; + case EXIT_REASON_VMXOFF: case EXIT_REASON_VMXON: case EXIT_REASON_VMCLEAR: --- a/xen/arch/x86/include/asm/hvm/vmx/vmx.h +++ b/xen/arch/x86/include/asm/hvm/vmx/vmx.h @@ -XXX,XX +XXX,XX @@ static inline void pi_clear_sn(struct pi #define EXIT_REASON_NOTIFY 75 #define EXIT_REASON_RDMSRLIST 78 #define EXIT_REASON_WRMSRLIST 79 +#define EXIT_REASON_URDMSR 80 +#define EXIT_REASON_UWRMSR 81 /* Remember to also update VMX_PERF_EXIT_REASON_SIZE! */ /* @@ -XXX,XX +XXX,XX @@ typedef union ldt_or_tr_instr_info { base_reg_invalid :1, /* bit 27 - Base register invalid */ instr_identity :1, /* bit 28 - 0:LDT, 1:TR */ instr_write :1, /* bit 29 - 0:store, 1:load */ - :34; /* bits 31:63 - Undefined */ + :34; /* bits 30:63 - Undefined */ }; } ldt_or_tr_instr_info_t; +/* VM-Exit instruction info for URDMSR and UWRMSR */ +typedef union user_msr_instr_info { + unsigned long raw; + struct { + unsigned int :3, /* Bits 0:2 - Undefined */ + gpr :4, /* Bits 3:6 - Source/Destination register */ + :25; /* bits 7:31 - Undefined */ + }; +} user_msr_instr_info_t; + #endif /* __ASM_X86_HVM_VMX_VMX_H__ */ --- a/xen/arch/x86/include/asm/msr.h +++ b/xen/arch/x86/include/asm/msr.h @@ -XXX,XX +XXX,XX @@ uint64_t msr_spec_ctrl_valid_bits(const struct vcpu_msrs { /* + * 0x0000001c - MSR_USER_MSR_CTL + * + * Value is guest chosen, and always loaded in vcpu context. 
+ */ + union { + uint64_t raw; + struct { + bool enable:1; + unsigned int :11; + unsigned long bitmap:52; + }; + } user_msr_ctl; + + /* * 0x00000048 - MSR_SPEC_CTRL * 0xc001011f - MSR_VIRT_SPEC_CTRL (if X86_FEATURE_AMD_SSBD) * --- a/xen/arch/x86/include/asm/perfc_defn.h +++ b/xen/arch/x86/include/asm/perfc_defn.h @@ -XXX,XX +XXX,XX @@ PERFCOUNTER_ARRAY(exceptions, #ifdef CONFIG_HVM -#define VMX_PERF_EXIT_REASON_SIZE 80 +#define VMX_PERF_EXIT_REASON_SIZE 82 #define VMEXIT_NPF_PERFC 143 #define SVM_PERF_EXIT_REASON_SIZE (VMEXIT_NPF_PERFC + 1) PERFCOUNTER_ARRAY(vmexits, "vmexits", --- a/xen/arch/x86/msr.c +++ b/xen/arch/x86/msr.c @@ -XXX,XX +XXX,XX @@ int guest_rdmsr(struct vcpu *v, uint32_t *val = msrs->xss.raw; break; + case MSR_USER_MSR_CTL: + if ( !cp->feat.user_msr ) + goto gp_fault; + *val = msrs->user_msr_ctl.raw; + break; + case 0x40000000 ... 0x400001ff: if ( is_viridian_domain(d) ) { @@ -XXX,XX +XXX,XX @@ int guest_wrmsr(struct vcpu *v, uint32_t msrs->xss.raw = val; break; + case MSR_USER_MSR_CTL: + if ( !cp->feat.user_msr ) + goto gp_fault; + + if ( (val & ~(USER_MSR_ENABLE | USER_MSR_ADDR_MASK)) || + !is_canonical_address(val) ) + goto gp_fault; + + msrs->user_msr_ctl.raw = val; + if ( v == curr ) + wrmsrl(MSR_USER_MSR_CTL, val); + break; + case 0x40000000 ... 0x400001ff: if ( is_viridian_domain(d) ) { --- a/xen/include/public/arch-x86/cpufeatureset.h +++ b/xen/include/public/arch-x86/cpufeatureset.h @@ -XXX,XX +XXX,XX @@ XEN_CPUFEATURE(AVX_VNNI_INT8, 15*32 XEN_CPUFEATURE(AVX_NE_CONVERT, 15*32+ 5) /*A AVX-NE-CONVERT Instructions */ XEN_CPUFEATURE(AVX_VNNI_INT16, 15*32+10) /*A AVX-VNNI-INT16 Instructions */ XEN_CPUFEATURE(PREFETCHI, 15*32+14) /*A PREFETCHIT{0,1} Instructions */ -XEN_CPUFEATURE(USER_MSR, 15*32+15) /* U{RD,WR}MSR Instructions */ +XEN_CPUFEATURE(USER_MSR, 15*32+15) /*s U{RD,WR}MSR Instructions */ XEN_CPUFEATURE(CET_SSS, 15*32+18) /* CET Supervisor Shadow Stacks safe to use */ /* Intel-defined CPU features, MSR_ARCH_CAPS 0x10a.eax, word 16 */
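As an illustration of the guest_wrmsr() check added above, here is a minimal stand-alone sketch (not Xen code; canonical48() and user_msr_ctl_valid() are hypothetical names, and a 48-bit virtual address width is assumed) of the MSR_USER_MSR_CTL write validation: reserved bits 11:1 must be clear, and the bitmap address must be canonical.

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define USER_MSR_ENABLE    (1ULL << 0)
#define USER_MSR_ADDR_MASK 0xfffffffffffff000ULL

/* Model of a 48-bit canonical check: bits 63:47 sign-extend bit 47. */
static bool canonical48(uint64_t addr)
{
    return ((int64_t)(addr << 16) >> 16) == (int64_t)addr;
}

static bool user_msr_ctl_valid(uint64_t val)
{
    /* Bits 11:1 are reserved ... */
    if ( val & ~(USER_MSR_ENABLE | USER_MSR_ADDR_MASK) )
        return false;
    /* ... and the bitmap address must be canonical. */
    return canonical48(val);
}

int main(void)
{
    assert(user_msr_ctl_valid(0));
    assert(user_msr_ctl_valid(0x7ffffffff000ULL | USER_MSR_ENABLE));
    assert(!user_msr_ctl_valid(1ULL << 11));        /* reserved bit set */
    assert(!user_msr_ctl_valid(0x800000000000ULL)); /* non-canonical */
    return 0;
}

Either violation yields #GP in the patch; the model collapses both into a single boolean.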
1: x86emul: support LKGS 2: x86emul+VMX: support {RD,WR}MSRLIST 3: x86emul: support USER_MSR instructions 4: x86/cpu-policy: re-arrange no-VMX logic 5: VMX: support USER_MSR Jan
Provide support for this insn, which is a prereq to FRED. CPUID-wise introduce its, FRED's, and the NMI_SRC bits on this occasion, thus also allowing the dependencies to be expressed right away. While adding a testcase, also add a SWAPGS one. In order to not affect the behavior of pre-existing tests, install write_{segment,msr} hooks only transiently. Signed-off-by: Jan Beulich <jbeulich@suse.com> --- Instead of ->read_segment() we could of course also use ->read_msr() to fetch the original GS base. I don't think I can see a clear advantage of either approach; the way it's done, it matches how we handle SWAPGS. For PV, save_segments() would need adjustment, but the insn being restricted to ring 0 means PV guests can't use it anyway (unless we wanted to emulate it as another privileged insn). --- v6: Use MSR constants in test harness. S->s in cpufeatureset.h. Add NMI_SRC feature bits. Re-base. v5: Re-base. v3: Add dependency on LM. Re-base. v2: Use X86_EXC_*. Add comments. --- a/tools/tests/x86_emulator/predicates.c +++ b/tools/tests/x86_emulator/predicates.c @@ -XXX,XX +XXX,XX @@ static const struct { { { 0x00, 0x18 }, { 2, 2 }, T, R }, /* ltr */ { { 0x00, 0x20 }, { 2, 2 }, T, R }, /* verr */ { { 0x00, 0x28 }, { 2, 2 }, T, R }, /* verw */ + { { 0x00, 0x30 }, { 0, 2 }, T, R, pfx_f2 }, /* lkgs */ { { 0x01, 0x00 }, { 2, 2 }, F, W }, /* sgdt */ { { 0x01, 0x08 }, { 2, 2 }, F, W }, /* sidt */ { { 0x01, 0x10 }, { 2, 2 }, F, R }, /* lgdt */ --- a/tools/tests/x86_emulator/test_x86_emulator.c +++ b/tools/tests/x86_emulator/test_x86_emulator.c @@ -XXX,XX +XXX,XX @@ static int blk( return x86_emul_blk((void *)offset, p_data, bytes, eflags, state, ctxt); } +#ifdef __x86_64__ +static unsigned long gs_base, gs_base_shadow; +#endif + static int read_segment( enum x86_segment seg, struct segment_register *reg, @@ -XXX,XX +XXX,XX @@ static int read_segment( return X86EMUL_UNHANDLEABLE; memset(reg, 0, sizeof(*reg)); reg->p = 1; + +#ifdef __x86_64__ + if ( seg == x86_seg_gs ) + reg->base = gs_base; +#endif + + return X86EMUL_OKAY; +} + +#ifdef __x86_64__ +static int write_segment( + enum x86_segment seg, + const struct segment_register *reg, + struct x86_emulate_ctxt *ctxt) +{ + if ( !is_x86_user_segment(seg) ) + return X86EMUL_UNHANDLEABLE; + + if ( seg == x86_seg_gs ) + gs_base = reg->base; + return X86EMUL_OKAY; } +#endif static int read_msr( unsigned int reg, @@ -XXX,XX +XXX,XX @@ static int read_msr( *val = ctxt->addr_size > 32 ? 
EFER_LME | EFER_LMA : 0; return X86EMUL_OKAY; +#ifdef __x86_64__ + case MSR_GS_BASE: + if ( ctxt->addr_size < 64 ) + break; + *val = gs_base; + return X86EMUL_OKAY; + + case MSR_SHADOW_GS_BASE: + if ( ctxt->addr_size < 64 ) + break; + *val = gs_base_shadow; + return X86EMUL_OKAY; +#endif + case MSR_TSC_AUX: #define TSC_AUX_VALUE 0xCACACACA *val = TSC_AUX_VALUE; @@ -XXX,XX +XXX,XX @@ static int read_msr( return X86EMUL_UNHANDLEABLE; } +#ifdef __x86_64__ +static int write_msr( + unsigned int reg, + uint64_t val, + struct x86_emulate_ctxt *ctxt) +{ + switch ( reg ) + { + case MSR_GS_BASE: + if ( ctxt->addr_size < 64 || !is_canonical_address(val) ) + break; + gs_base = val; + return X86EMUL_OKAY; + + case MSR_SHADOW_GS_BASE: + if ( ctxt->addr_size < 64 || !is_canonical_address(val) ) + break; + gs_base_shadow = val; + return X86EMUL_OKAY; + } + + return X86EMUL_UNHANDLEABLE; +} +#endif + #define INVPCID_ADDR 0x12345678 #define INVPCID_PCID 0x123 @@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv) printf("%u bytes read - ", bytes_read); goto fail; } + printf("okay\n"); + + emulops.write_segment = write_segment; + emulops.write_msr = write_msr; + + printf("%-40s", "Testing swapgs..."); + instr[0] = 0x0f; instr[1] = 0x01; instr[2] = 0xf8; + regs.eip = (unsigned long)&instr[0]; + gs_base = 0xffffeeeecccc8888UL; + gs_base_shadow = 0x0000111122224444UL; + rc = x86_emulate(&ctxt, &emulops); + if ( (rc != X86EMUL_OKAY) || + (regs.eip != (unsigned long)&instr[3]) || + (gs_base != 0x0000111122224444UL) || + (gs_base_shadow != 0xffffeeeecccc8888UL) ) + goto fail; + printf("okay\n"); + + printf("%-40s", "Testing lkgs 2(%rdx)..."); + instr[0] = 0xf2; instr[1] = 0x0f; instr[2] = 0x00; instr[3] = 0x72; instr[4] = 0x02; + regs.eip = (unsigned long)&instr[0]; + regs.edx = (unsigned long)res; + res[0] = 0x00004444; + res[1] = 0x8888cccc; + i = cpu_policy.extd.nscb; cpu_policy.extd.nscb = true; /* for AMD */ + rc = x86_emulate(&ctxt, &emulops); + if ( (rc != X86EMUL_OKAY) || + (regs.eip != (unsigned long)&instr[5]) || + (gs_base != 0x0000111122224444UL) || + gs_base_shadow ) + goto fail; + + cpu_policy.extd.nscb = i; + emulops.write_segment = NULL; + emulops.write_msr = NULL; #endif printf("okay\n"); --- a/tools/tests/x86_emulator/x86-emulate.c +++ b/tools/tests/x86_emulator/x86-emulate.c @@ -XXX,XX +XXX,XX @@ bool emul_test_init(void) cpu_policy.feat.invpcid = true; cpu_policy.feat.adx = true; cpu_policy.feat.rdpid = true; + cpu_policy.feat.lkgs = true; cpu_policy.feat.wrmsrns = true; cpu_policy.extd.clzero = true; --- a/xen/arch/x86/x86_emulate/decode.c +++ b/xen/arch/x86/x86_emulate/decode.c @@ -XXX,XX +XXX,XX @@ decode_twobyte(struct x86_emulate_state case 0: s->desc |= DstMem | SrcImplicit | Mov; break; + case 6: + if ( !(s->modrm_reg & 1) && mode_64bit() ) + { case 2: case 4: - s->desc |= SrcMem16; + s->desc |= SrcMem16; + } break; } break; --- a/xen/arch/x86/x86_emulate/private.h +++ b/xen/arch/x86/x86_emulate/private.h @@ -XXX,XX +XXX,XX @@ amd_like(const struct x86_emulate_ctxt * #define vcpu_has_avx_vnni() (ctxt->cpuid->feat.avx_vnni) #define vcpu_has_avx512_bf16() (ctxt->cpuid->feat.avx512_bf16) #define vcpu_has_cmpccxadd() (ctxt->cpuid->feat.cmpccxadd) +#define vcpu_has_lkgs() (ctxt->cpuid->feat.lkgs) #define vcpu_has_wrmsrns() (ctxt->cpuid->feat.wrmsrns) #define vcpu_has_avx_ifma() (ctxt->cpuid->feat.avx_ifma) #define vcpu_has_avx_vnni_int8() (ctxt->cpuid->feat.avx_vnni_int8) --- a/xen/arch/x86/x86_emulate/x86_emulate.c +++ b/xen/arch/x86/x86_emulate/x86_emulate.c @@ -XXX,XX +XXX,XX @@ 
x86_emulate( break; } break; - default: - generate_exception_if(true, X86_EXC_UD); + case 6: /* lkgs */ + generate_exception_if((modrm_reg & 1) || vex.pfx != vex_f2, + X86_EXC_UD); + generate_exception_if(!mode_64bit() || !mode_ring0(), X86_EXC_UD); + vcpu_must_have(lkgs); + fail_if(!ops->read_segment || !ops->read_msr || + !ops->write_segment || !ops->write_msr); + if ( (rc = ops->read_msr(MSR_SHADOW_GS_BASE, &msr_val, + ctxt)) != X86EMUL_OKAY || + (rc = ops->read_segment(x86_seg_gs, &sreg, + ctxt)) != X86EMUL_OKAY ) + goto done; + dst.orig_val = sreg.base; /* Preserve full GS Base. */ + if ( (rc = protmode_load_seg(x86_seg_gs, src.val, false, &sreg, + ctxt, ops)) != X86EMUL_OKAY || + /* Write (32-bit) base into SHADOW_GS. */ + (rc = ops->write_msr(MSR_SHADOW_GS_BASE, sreg.base, + ctxt)) != X86EMUL_OKAY ) + goto done; + sreg.base = dst.orig_val; /* Reinstate full GS Base. */ + if ( (rc = ops->write_segment(x86_seg_gs, &sreg, + ctxt)) != X86EMUL_OKAY ) + { + /* Best effort unwind (i.e. no real error checking). */ + if ( ops->write_msr(MSR_SHADOW_GS_BASE, msr_val, + ctxt) == X86EMUL_EXCEPTION ) + x86_emul_reset_event(ctxt); + goto done; + } break; } break; --- a/xen/include/public/arch-x86/cpufeatureset.h +++ b/xen/include/public/arch-x86/cpufeatureset.h @@ -XXX,XX +XXX,XX @@ XEN_CPUFEATURE(CMPCCXADD, 10*32+ 7) / XEN_CPUFEATURE(FZRM, 10*32+10) /*A Fast Zero-length REP MOVSB */ XEN_CPUFEATURE(FSRS, 10*32+11) /*A Fast Short REP STOSB */ XEN_CPUFEATURE(FSRCS, 10*32+12) /*A Fast Short REP CMPSB/SCASB */ +XEN_CPUFEATURE(FRED, 10*32+17) /* Flexible Return and Event Delivery */ +XEN_CPUFEATURE(LKGS, 10*32+18) /*s Load Kernel GS Base */ XEN_CPUFEATURE(WRMSRNS, 10*32+19) /*S WRMSR Non-Serialising */ +XEN_CPUFEATURE(NMI_SRC, 10*32+20) /* NMI-source reporting */ XEN_CPUFEATURE(AMX_FP16, 10*32+21) /* AMX FP16 instruction */ XEN_CPUFEATURE(AVX_IFMA, 10*32+23) /*A AVX-IFMA Instructions */ --- a/xen/tools/gen-cpuid.py +++ b/xen/tools/gen-cpuid.py @@ -XXX,XX +XXX,XX @@ def crunch_numbers(state): # superpages, PCID and PKU are only available in 4 level paging. # NO_LMSL indicates the absense of Long Mode Segment Limits, which # have been dropped in hardware. - LM: [CX16, PCID, LAHF_LM, PAGE1GB, PKU, NO_LMSL, AMX_TILE, CMPCCXADD], + LM: [CX16, PCID, LAHF_LM, PAGE1GB, PKU, NO_LMSL, AMX_TILE, CMPCCXADD, + LKGS], # AMD K6-2+ and K6-III processors shipped with 3DNow+, beyond the # standard 3DNow in the earlier K6 processors. @@ -XXX,XX +XXX,XX @@ def crunch_numbers(state): # computational instructions. All further AMX features are built on top # of AMX-TILE. AMX_TILE: [AMX_BF16, AMX_INT8, AMX_FP16, AMX_COMPLEX], + + # FRED builds on the LKGS instruction. + LKGS: [FRED], } deep_features = tuple(sorted(deps.keys()))
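For readers less familiar with the FRED model, here is a minimal stand-alone sketch (not Xen code; swapgs_model() and lkgs_model() are hypothetical names) of the state transitions the emulation and the testcases above encode: SWAPGS exchanges the active GS base with MSR_SHADOW_GS_BASE, whereas LKGS loads only MSR_SHADOW_GS_BASE from the new descriptor, leaving the active GS base untouched.

#include <assert.h>
#include <stdint.h>

static uint64_t gs_base, gs_base_shadow;

/* SWAPGS: exchange the active GS base with MSR_SHADOW_GS_BASE. */
static void swapgs_model(void)
{
    uint64_t tmp = gs_base;

    gs_base = gs_base_shadow;
    gs_base_shadow = tmp;
}

/* LKGS: only the shadow (kernel) GS base is replaced. */
static void lkgs_model(uint32_t desc_base)
{
    gs_base_shadow = desc_base;
}

int main(void)
{
    gs_base = 0xffffeeeecccc8888ULL;
    gs_base_shadow = 0x0000111122224444ULL;

    swapgs_model();
    assert(gs_base == 0x0000111122224444ULL);
    assert(gs_base_shadow == 0xffffeeeecccc8888ULL);

    lkgs_model(0); /* descriptor with base 0, as in the testcase */
    assert(gs_base == 0x0000111122224444ULL);
    assert(!gs_base_shadow);

    return 0;
}

This also makes visible why the emulation above saves the full GS base in dst.orig_val around protmode_load_seg(): the descriptor load must not clobber the active (possibly 64-bit) base.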
These are "compound" instructions to issue a series of RDMSR / WRMSR respectively. In the emulator we can therefore implement them by using the existing msr_{read,write}() hooks. The memory accesses rely on the HVM ->read() / ->write() hooks already being linear-address (x86_seg_none) aware (by way of hvmemul_virtual_to_linear() handling this case). Preemption is checked for in WRMSRLIST handling only, as only MSR writes are expected to potentially take long. Signed-off-by: Jan Beulich <jbeulich@suse.com> --- RFC: In vmx_vmexit_handler(), handling is forwarded to the emulator blindly. Alternatively we could consult the exit qualification and process just a single MSR at a time (without involving the emulator), exiting back to the guest after every iteration. (I don't think a mix of both models makes a lot of sense.) The precise behavior of MSR_BARRIER is still not spelled out in ISE 050, so the (minimal) implementation continues to be a guess for now. Wouldn't it be better for calculate_hvm_max_policy() to treat MPX the same way as done here, at least from an abstract perspective (assuming that AMD won't add such functionality now that Intel have deprecated it)? --- v6: Use MSR constants in test harness. Re-base. v5: Add missing vmx_init_vmcs_config() and construct_vmcs() adjustments. Avoid unnecessary uses of r(). Re-base. v3: Add dependency on LM. Limit exposure to HVM. Utilize new info from ISE 050. Re-base. v2: Use X86_EXC_*. Add preemption checking to WRMSRLIST handling. Remove the feature from "max" when the VMX counterpart isn't available. --- a/tools/tests/x86_emulator/predicates.c +++ b/tools/tests/x86_emulator/predicates.c @@ -XXX,XX +XXX,XX @@ static const struct { { { 0x01, 0xc4 }, { 2, 2 }, F, N }, /* vmxoff */ { { 0x01, 0xc5 }, { 2, 2 }, F, N }, /* pconfig */ { { 0x01, 0xc6 }, { 2, 2 }, F, N }, /* wrmsrns */ + { { 0x01, 0xc6 }, { 0, 2 }, F, W, pfx_f2 }, /* rdmsrlist */ + { { 0x01, 0xc6 }, { 0, 2 }, F, R, pfx_f3 }, /* wrmsrlist */ { { 0x01, 0xc8 }, { 2, 2 }, F, N }, /* monitor */ { { 0x01, 0xc9 }, { 2, 2 }, F, N }, /* mwait */ { { 0x01, 0xca }, { 2, 2 }, F, N }, /* clac */ --- a/tools/tests/x86_emulator/test_x86_emulator.c +++ b/tools/tests/x86_emulator/test_x86_emulator.c @@ -XXX,XX +XXX,XX @@ static int write( if ( verbose ) printf("** %s(%u, %p,, %u,)\n", __func__, seg, (void *)offset, bytes); - if ( !is_x86_user_segment(seg) ) + if ( !is_x86_user_segment(seg) && seg != x86_seg_none ) return X86EMUL_UNHANDLEABLE; memcpy((void *)offset, p_data, bytes); return X86EMUL_OKAY; @@ -XXX,XX +XXX,XX @@ static int read_msr( { switch ( reg ) { + case MSR_BARRIER: + *val = 0; + return X86EMUL_OKAY; + case MSR_EFER: *val = ctxt->addr_size > 32 ? 
EFER_LME | EFER_LMA : 0; return X86EMUL_OKAY; @@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv) (gs_base != 0x0000111122224444UL) || gs_base_shadow ) goto fail; + printf("okay\n"); cpu_policy.extd.nscb = i; emulops.write_segment = NULL; + + printf("%-40s", "Testing rdmsrlist..."); + instr[0] = 0xf2; instr[1] = 0x0f; instr[2] = 0x01; instr[3] = 0xc6; + regs.rip = (unsigned long)&instr[0]; + regs.rsi = (unsigned long)(res + 0x80); + regs.rdi = (unsigned long)(res + 0x80 + 0x40 * 2); + regs.rcx = 0x0002000100008000UL; + gs_base_shadow = 0x0000222244446666UL; + memset(res + 0x80, ~0, 0x40 * 8 * 2); + res[0x80 + 0x0f * 2] = MSR_GS_BASE; + res[0x80 + 0x0f * 2 + 1] = 0; + res[0x80 + 0x20 * 2] = MSR_SHADOW_GS_BASE; + res[0x80 + 0x20 * 2 + 1] = 0; + res[0x80 + 0x31 * 2] = MSR_BARRIER; + res[0x80 + 0x31 * 2 + 1] = 0; + rc = x86_emulate(&ctxt, &emulops); + if ( (rc != X86EMUL_OKAY) || + (regs.rip != (unsigned long)&instr[4]) || + regs.rcx || + (res[0x80 + (0x40 + 0x0f) * 2] != (unsigned int)gs_base) || + (res[0x80 + (0x40 + 0x0f) * 2 + 1] != (gs_base >> (8 * sizeof(int)))) || + (res[0x80 + (0x40 + 0x20) * 2] != (unsigned int)gs_base_shadow) || + (res[0x80 + (0x40 + 0x20) * 2 + 1] != (gs_base_shadow >> (8 * sizeof(int)))) || + res[0x80 + (0x40 + 0x31) * 2] || res[0x80 + (0x40 + 0x31) * 2 + 1] ) + goto fail; + printf("okay\n"); + + printf("%-40s", "Testing wrmsrlist..."); + instr[0] = 0xf3; instr[1] = 0x0f; instr[2] = 0x01; instr[3] = 0xc6; + regs.eip = (unsigned long)&instr[0]; + regs.rsi -= 0x11 * 8; + regs.rdi -= 0x11 * 8; + regs.rcx = 0x0002000100000000UL; + res[0x80 + 0x0f * 2] = MSR_SHADOW_GS_BASE; + res[0x80 + 0x20 * 2] = MSR_GS_BASE; + rc = x86_emulate(&ctxt, &emulops); + if ( (rc != X86EMUL_OKAY) || + (regs.rip != (unsigned long)&instr[4]) || + regs.rcx || + (gs_base != 0x0000222244446666UL) || + (gs_base_shadow != 0x0000111122224444UL) ) + goto fail; + emulops.write_msr = NULL; #endif printf("okay\n"); --- a/tools/tests/x86_emulator/x86-emulate.c +++ b/tools/tests/x86_emulator/x86-emulate.c @@ -XXX,XX +XXX,XX @@ bool emul_test_init(void) cpu_policy.feat.rdpid = true; cpu_policy.feat.lkgs = true; cpu_policy.feat.wrmsrns = true; + cpu_policy.feat.msrlist = true; cpu_policy.extd.clzero = true; if ( cpu_has_xsave ) --- a/xen/arch/x86/cpu-policy.c +++ b/xen/arch/x86/cpu-policy.c @@ -XXX,XX +XXX,XX @@ static void __init calculate_hvm_max_pol __clear_bit(X86_FEATURE_XSAVES, fs); } + if ( !cpu_has_vmx_msrlist ) + __clear_bit(X86_FEATURE_MSRLIST, fs); + /* * Xen doesn't use PKS, so the guest support for it has opted to not use * the VMCS load/save controls for efficiency reasons. 
This depends on --- a/xen/arch/x86/hvm/vmx/vmcs.c +++ b/xen/arch/x86/hvm/vmx/vmcs.c @@ -XXX,XX +XXX,XX @@ static int vmx_init_vmcs_config(bool bsp if ( _vmx_cpu_based_exec_control & CPU_BASED_ACTIVATE_TERTIARY_CONTROLS ) { - uint64_t opt = TERTIARY_EXEC_VIRT_SPEC_CTRL; + uint64_t opt = TERTIARY_EXEC_ENABLE_MSRLIST | + TERTIARY_EXEC_VIRT_SPEC_CTRL; _vmx_tertiary_exec_control = adjust_vmx_controls2( "Tertiary Exec Control", 0, opt, @@ -XXX,XX +XXX,XX @@ static int construct_vmcs(struct vcpu *v v->arch.hvm.vmx.exec_control |= CPU_BASED_RDTSC_EXITING; v->arch.hvm.vmx.secondary_exec_control = vmx_secondary_exec_control; - v->arch.hvm.vmx.tertiary_exec_control = vmx_tertiary_exec_control; + v->arch.hvm.vmx.tertiary_exec_control = vmx_tertiary_exec_control & + ~TERTIARY_EXEC_ENABLE_MSRLIST; /* * Disable features which we don't want active by default: --- a/xen/arch/x86/hvm/vmx/vmx.c +++ b/xen/arch/x86/hvm/vmx/vmx.c @@ -XXX,XX +XXX,XX @@ static void cf_check vmx_cpuid_policy_ch else vmx_set_msr_intercept(v, MSR_PKRS, VMX_MSR_RW); + if ( cp->feat.msrlist ) + { + vmx_clear_msr_intercept(v, MSR_BARRIER, VMX_MSR_RW); + v->arch.hvm.vmx.tertiary_exec_control |= TERTIARY_EXEC_ENABLE_MSRLIST; + vmx_update_tertiary_exec_control(v); + } + else if ( v->arch.hvm.vmx.tertiary_exec_control & + TERTIARY_EXEC_ENABLE_MSRLIST ) + { + vmx_set_msr_intercept(v, MSR_BARRIER, VMX_MSR_RW); + v->arch.hvm.vmx.tertiary_exec_control &= ~TERTIARY_EXEC_ENABLE_MSRLIST; + vmx_update_tertiary_exec_control(v); + } + out: vmx_vmcs_exit(v); @@ -XXX,XX +XXX,XX @@ gp_fault: return X86EMUL_EXCEPTION; } +static bool cf_check is_msrlist( + const struct x86_emulate_state *state, const struct x86_emulate_ctxt *ctxt) +{ + + if ( ctxt->opcode == X86EMUL_OPC(0x0f, 0x01) ) + { + unsigned int rm, reg; + int mode = x86_insn_modrm(state, &rm, ®); + + /* This also includes WRMSRNS; should be okay. 
*/ + return mode == 3 && rm == 6 && !reg; + } + + return false; +} + static void vmx_do_extint(struct cpu_user_regs *regs) { unsigned long vector; @@ -XXX,XX +XXX,XX @@ void asmlinkage vmx_vmexit_handler(struc } break; + case EXIT_REASON_RDMSRLIST: + case EXIT_REASON_WRMSRLIST: + if ( vmx_guest_x86_mode(v) != 8 || !currd->arch.cpuid->feat.msrlist ) + { + ASSERT_UNREACHABLE(); + hvm_inject_hw_exception(X86_EXC_UD, X86_EVENT_NO_EC); + } + else if ( !hvm_emulate_one_insn(is_msrlist, "MSR list") ) + hvm_inject_hw_exception(X86_EXC_GP, 0); + break; + case EXIT_REASON_VMXOFF: case EXIT_REASON_VMXON: case EXIT_REASON_VMCLEAR: --- a/xen/arch/x86/include/asm/hvm/vmx/vmcs.h +++ b/xen/arch/x86/include/asm/hvm/vmx/vmcs.h @@ -XXX,XX +XXX,XX @@ extern u32 vmx_secondary_exec_control; #define TERTIARY_EXEC_EPT_PAGING_WRITE BIT(2, UL) #define TERTIARY_EXEC_GUEST_PAGING_VERIFY BIT(3, UL) #define TERTIARY_EXEC_IPI_VIRT BIT(4, UL) +#define TERTIARY_EXEC_ENABLE_MSRLIST BIT(6, UL) #define TERTIARY_EXEC_VIRT_SPEC_CTRL BIT(7, UL) extern uint64_t vmx_tertiary_exec_control; @@ -XXX,XX +XXX,XX @@ extern u64 vmx_ept_vpid_cap; #define cpu_has_vmx_notify_vm_exiting \ (IS_ENABLED(CONFIG_INTEL_VMX) && \ vmx_secondary_exec_control & SECONDARY_EXEC_NOTIFY_VM_EXITING) +#define cpu_has_vmx_msrlist \ + (IS_ENABLED(CONFIG_INTEL_VMX) && \ + (vmx_tertiary_exec_control & TERTIARY_EXEC_ENABLE_MSRLIST)) #define VMCS_RID_TYPE_MASK 0x80000000U --- a/xen/arch/x86/include/asm/hvm/vmx/vmx.h +++ b/xen/arch/x86/include/asm/hvm/vmx/vmx.h @@ -XXX,XX +XXX,XX @@ static inline void pi_clear_sn(struct pi #define EXIT_REASON_XRSTORS 64 #define EXIT_REASON_BUS_LOCK 74 #define EXIT_REASON_NOTIFY 75 +#define EXIT_REASON_RDMSRLIST 78 +#define EXIT_REASON_WRMSRLIST 79 /* Remember to also update VMX_PERF_EXIT_REASON_SIZE! 
*/ /* --- a/xen/arch/x86/include/asm/msr-index.h +++ b/xen/arch/x86/include/asm/msr-index.h @@ -XXX,XX +XXX,XX @@ #define APIC_BASE_ENABLE (_AC(1, ULL) << 11) #define APIC_BASE_ADDR_MASK _AC(0x000ffffffffff000, ULL) +#define MSR_BARRIER 0x0000002f + #define MSR_TEST_CTRL 0x00000033 #define TEST_CTRL_SPLITLOCK_DETECT (_AC(1, ULL) << 29) #define TEST_CTRL_SPLITLOCK_DISABLE (_AC(1, ULL) << 31) --- a/xen/arch/x86/include/asm/perfc_defn.h +++ b/xen/arch/x86/include/asm/perfc_defn.h @@ -XXX,XX +XXX,XX @@ PERFCOUNTER_ARRAY(exceptions, #ifdef CONFIG_HVM -#define VMX_PERF_EXIT_REASON_SIZE 76 +#define VMX_PERF_EXIT_REASON_SIZE 80 #define VMEXIT_NPF_PERFC 143 #define SVM_PERF_EXIT_REASON_SIZE (VMEXIT_NPF_PERFC + 1) PERFCOUNTER_ARRAY(vmexits, "vmexits", --- a/xen/arch/x86/msr.c +++ b/xen/arch/x86/msr.c @@ -XXX,XX +XXX,XX @@ int guest_rdmsr(struct vcpu *v, uint32_t case MSR_AMD_PPIN: goto gp_fault; + case MSR_BARRIER: + if ( !cp->feat.msrlist ) + goto gp_fault; + *val = 0; + break; + case MSR_IA32_FEATURE_CONTROL: /* * Architecturally, availability of this MSR is enumerated by the @@ -XXX,XX +XXX,XX @@ int guest_wrmsr(struct vcpu *v, uint32_t uint64_t rsvd; /* Read-only */ + case MSR_BARRIER: case MSR_IA32_PLATFORM_ID: case MSR_CORE_CAPABILITIES: case MSR_INTEL_CORE_THREAD_COUNT: --- a/xen/arch/x86/x86_emulate/0f01.c +++ b/xen/arch/x86/x86_emulate/0f01.c @@ -XXX,XX +XXX,XX @@ #include "private.h" #ifdef __XEN__ +#include <xen/event.h> #include <asm/prot-key.h> #endif @@ -XXX,XX +XXX,XX @@ int x86emul_0f01(struct x86_emulate_stat switch ( s->modrm ) { unsigned long base, limit, cr0, cr0w, cr4; + unsigned int n; struct segment_register sreg; uint64_t msr_val; @@ -XXX,XX +XXX,XX @@ int x86emul_0f01(struct x86_emulate_stat ((uint64_t)regs->r(dx) << 32) | regs->eax, ctxt); goto done; + + case vex_f3: /* wrmsrlist */ + vcpu_must_have(msrlist); + generate_exception_if(!mode_64bit(), X86_EXC_UD); + generate_exception_if(!mode_ring0() || (regs->esi & 7) || + (regs->edi & 7), + X86_EXC_GP, 0); + fail_if(!ops->write_msr); + while ( regs->r(cx) ) + { + n = __builtin_ffsl(regs->r(cx)) - 1; + if ( (rc = ops->read(x86_seg_none, regs->r(si) + n * 8, + &msr_val, 8, ctxt)) != X86EMUL_OKAY ) + break; + generate_exception_if(msr_val != (uint32_t)msr_val, + X86_EXC_GP, 0); + base = msr_val; + if ( (rc = ops->read(x86_seg_none, regs->r(di) + n * 8, + &msr_val, 8, ctxt)) != X86EMUL_OKAY || + (rc = ops->write_msr(base, msr_val, ctxt)) != X86EMUL_OKAY ) + break; + regs->r(cx) &= ~(1UL << n); + +#ifdef __XEN__ + if ( regs->r(cx) && local_events_need_delivery() ) + { + rc = X86EMUL_RETRY; + break; + } +#endif + } + goto done; + + case vex_f2: /* rdmsrlist */ + vcpu_must_have(msrlist); + generate_exception_if(!mode_64bit(), X86_EXC_UD); + generate_exception_if(!mode_ring0() || (regs->esi & 7) || + (regs->edi & 7), + X86_EXC_GP, 0); + fail_if(!ops->read_msr || !ops->write); + while ( regs->r(cx) ) + { + n = __builtin_ffsl(regs->r(cx)) - 1; + if ( (rc = ops->read(x86_seg_none, regs->r(si) + n * 8, + &msr_val, 8, ctxt)) != X86EMUL_OKAY ) + break; + generate_exception_if(msr_val != (uint32_t)msr_val, + X86_EXC_GP, 0); + if ( (rc = ops->read_msr(msr_val, &msr_val, + ctxt)) != X86EMUL_OKAY || + (rc = ops->write(x86_seg_none, regs->r(di) + n * 8, + &msr_val, 8, ctxt)) != X86EMUL_OKAY ) + break; + regs->r(cx) &= ~(1UL << n); + } + if ( rc != X86EMUL_OKAY ) + ctxt->regs->r(cx) = regs->r(cx); + goto done; } generate_exception(X86_EXC_UD); --- a/xen/arch/x86/x86_emulate/private.h +++ b/xen/arch/x86/x86_emulate/private.h @@ -XXX,XX +XXX,XX 
@@ amd_like(const struct x86_emulate_ctxt * #define vcpu_has_lkgs() (ctxt->cpuid->feat.lkgs) #define vcpu_has_wrmsrns() (ctxt->cpuid->feat.wrmsrns) #define vcpu_has_avx_ifma() (ctxt->cpuid->feat.avx_ifma) +#define vcpu_has_msrlist() (ctxt->cpuid->feat.msrlist) #define vcpu_has_avx_vnni_int8() (ctxt->cpuid->feat.avx_vnni_int8) #define vcpu_has_avx_ne_convert() (ctxt->cpuid->feat.avx_ne_convert) #define vcpu_has_avx_vnni_int16() (ctxt->cpuid->feat.avx_vnni_int16) --- a/xen/arch/x86/x86_emulate/util.c +++ b/xen/arch/x86/x86_emulate/util.c @@ -XXX,XX +XXX,XX @@ bool cf_check x86_insn_is_mem_access(con break; case X86EMUL_OPC(0x0f, 0x01): + /* {RD,WR}MSRLIST */ + if ( mode_64bit() && s->modrm == 0xc6 ) + return s->vex.pfx >= vex_f3; /* Cover CLZERO. */ return (s->modrm_rm & 7) == 4 && (s->modrm_reg & 7) == 7; } @@ -XXX,XX +XXX,XX @@ bool cf_check x86_insn_is_mem_write(cons case 0xff: /* Grp5 */ break; - case X86EMUL_OPC(0x0f, 0x01): /* CLZERO is the odd one. */ + case X86EMUL_OPC(0x0f, 0x01): + /* RDMSRLIST */ + if ( mode_64bit() && s->modrm == 0xc6 ) + return s->vex.pfx == vex_f2; + /* CLZERO is another odd one. */ return (s->modrm_rm & 7) == 4 && (s->modrm_reg & 7) == 7; default: --- a/xen/include/public/arch-x86/cpufeatureset.h +++ b/xen/include/public/arch-x86/cpufeatureset.h @@ -XXX,XX +XXX,XX @@ XEN_CPUFEATURE(WRMSRNS, 10*32+19) / XEN_CPUFEATURE(NMI_SRC, 10*32+20) /* NMI-source reporting */ XEN_CPUFEATURE(AMX_FP16, 10*32+21) /* AMX FP16 instruction */ XEN_CPUFEATURE(AVX_IFMA, 10*32+23) /*A AVX-IFMA Instructions */ +XEN_CPUFEATURE(MSRLIST, 10*32+27) /*s MSR list instructions */ /* AMD-defined CPU features, CPUID level 0x80000021.eax, word 11 */ XEN_CPUFEATURE(NO_NEST_BP, 11*32+ 0) /*A No Nested Data Breakpoints */ --- a/xen/tools/gen-cpuid.py +++ b/xen/tools/gen-cpuid.py @@ -XXX,XX +XXX,XX @@ def crunch_numbers(state): # NO_LMSL indicates the absense of Long Mode Segment Limits, which # have been dropped in hardware. LM: [CX16, PCID, LAHF_LM, PAGE1GB, PKU, NO_LMSL, AMX_TILE, CMPCCXADD, - LKGS], + LKGS, MSRLIST], # AMD K6-2+ and K6-III processors shipped with 3DNow+, beyond the # standard 3DNow in the earlier K6 processors.
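The loop structure in x86emul_0f01() above is perhaps easier to digest in isolation. Below is a stand-alone sketch (not Xen code; wrmsr_model() and wrmsrlist_model() are hypothetical stand-ins) of the WRMSRLIST iteration: RCX is a bitmap of pending entries, RSI points at a table of MSR indices and RDI at a table of values, entries are consumed lowest-bit-first, and each bit is cleared once its MSR has been written.

#include <stdint.h>
#include <stdio.h>

/* Stand-in for the real WRMSR; just logs the access. */
static void wrmsr_model(uint32_t idx, uint64_t val)
{
    printf("wrmsr(%#x) <- %#llx\n", idx, (unsigned long long)val);
}

static void wrmsrlist_model(uint64_t *rcx, const uint64_t *msr_tbl,
                            const uint64_t *val_tbl)
{
    while ( *rcx )
    {
        unsigned int n = __builtin_ffsll(*rcx) - 1;

        wrmsr_model(msr_tbl[n], val_tbl[n]);
        *rcx &= ~(1ULL << n);
        /* A real implementation may stop here (RETRY) on a pending event. */
    }
}

int main(void)
{
    uint64_t msrs[64] = { [0] = 0x2f /* BARRIER */,
                          [3] = 0xc0000101 /* GS_BASE */ };
    uint64_t vals[64] = { [3] = 0x1234 };
    uint64_t rcx = (1ULL << 0) | (1ULL << 3);

    wrmsrlist_model(&rcx, msrs, vals);
    return !!rcx;
}

Clearing the RCX bit after each successful write is what allows the emulator to bail with X86EMUL_RETRY when local_events_need_delivery() fires and later resume exactly where it left off.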
While UWRMSR probably isn't of much use as long as we don't support UINTR, URDMSR may well be useful to guests even without that (depending on what OSes are willing to permit access to). Since the two VEX encodings introduce a lonely opcode point in map 7, for now don't bother introducing a full 256-entry table. Signed-off-by: Jan Beulich <jbeulich@suse.com> --- The retaining of (possible) #PF from the bitmap access is "speculative" (the spec doesn't mention #PF as a possible exception; conceivably this might also need converting to #GP). I'm a little wary of the "MSRs Writeable by UWRMSR" table that the spec has, and that our code thus also enforces: As new MSRs are added to that table, we'll need piecemeal updates to that switch() statement. --- v6: Add MSR_UINTR_TIMER to header. Use MSR constants in test harness. Re-base. v5: Correct ModR/M.reg check for VEX-encoded forms. Cosmetic test harness adjustment. Re-base. v4: MSR index input regs are 64-bit (albeit only the APX spec has it this way for now). v3: New. --- a/tools/tests/x86_emulator/predicates.c +++ b/tools/tests/x86_emulator/predicates.c @@ -XXX,XX +XXX,XX @@ static const struct { { { 0xf6 }, { 2, 2 }, T, R, pfx_66 }, /* adcx */ { { 0xf6 }, { 2, 2 }, T, R, pfx_f3 }, /* adox */ { { 0xf8 }, { 2, 2 }, F, W, pfx_66 }, /* movdir64b */ + { { 0xf8, 0xc0 }, { 0, 2 }, F, N, pfx_f3 }, /* uwrmsr */ { { 0xf8 }, { 2, 2 }, F, W, pfx_f3 }, /* enqcmds */ + { { 0xf8, 0xc0 }, { 0, 2 }, F, N, pfx_f2 }, /* urdmsr */ { { 0xf8 }, { 2, 2 }, F, W, pfx_f2 }, /* enqcmd */ { { 0xf9 }, { 2, 2 }, F, W }, /* movdiri */ }; @@ -XXX,XX +XXX,XX @@ static const struct vex { { { 0xde }, 3, T, R, pfx_66, W0, L0 }, /* vsm3rnds2 */ { { 0xdf }, 3, T, R, pfx_66, WIG, Ln }, /* vaeskeygenassist */ { { 0xf0 }, 3, T, R, pfx_f2, Wn, L0 }, /* rorx */ +}, vex_map7[] = { + { { 0xf8, 0xc0 }, 6, F, N, pfx_f3, W0, L0 }, /* uwrmsr */ + { { 0xf8, 0xc0 }, 6, F, N, pfx_f2, W0, L0 }, /* urdmsr */ }; static const struct { @@ -XXX,XX +XXX,XX @@ static const struct { { vex_0f, ARRAY_SIZE(vex_0f) }, { vex_0f38, ARRAY_SIZE(vex_0f38) }, { vex_0f3a, ARRAY_SIZE(vex_0f3a) }, + { NULL, 0 }, /* map 4 */ + { NULL, 0 }, /* map 5 */ + { NULL, 0 }, /* map 6 */ + { vex_map7, ARRAY_SIZE(vex_map7) }, }; static const struct xop { @@ -XXX,XX +XXX,XX @@ void predicates_test(void *instr, struct if ( vex[x].tbl[t].w == WIG || (vex[x].tbl[t].w & W0) ) { - memcpy(ptr, vex[x].tbl[t].opc, vex[x].tbl[t].len); + memcpy(ptr, vex[x].tbl[t].opc, + MIN(vex[x].tbl[t].len, ARRAY_SIZE(vex->tbl->opc))); if ( vex[x].tbl[t].l == LIG || (vex[x].tbl[t].l & L0) ) do_test(instr, vex[x].tbl[t].len + ((void *)ptr - instr), @@ -XXX,XX +XXX,XX @@ void predicates_test(void *instr, struct if ( vex[x].tbl[t].l == LIG || (vex[x].tbl[t].l & L1) ) { ptr[-1] |= 4; - memcpy(ptr, vex[x].tbl[t].opc, vex[x].tbl[t].len); + memcpy(ptr, vex[x].tbl[t].opc, + MIN(vex[x].tbl[t].len, ARRAY_SIZE(vex->tbl->opc))); do_test(instr, vex[x].tbl[t].len + ((void *)ptr - instr), vex[x].tbl[t].modrm ? 
(void *)ptr - instr + 1 : 0, @@ -XXX,XX +XXX,XX @@ void predicates_test(void *instr, struct if ( vex[x].tbl[t].w == WIG || (vex[x].tbl[t].w & W1) ) { ptr[-1] = 0xf8 | vex[x].tbl[t].pfx; - memcpy(ptr, vex[x].tbl[t].opc, vex[x].tbl[t].len); + memcpy(ptr, vex[x].tbl[t].opc, + MIN(vex[x].tbl[t].len, ARRAY_SIZE(vex->tbl->opc))); if ( vex[x].tbl[t].l == LIG || (vex[x].tbl[t].l & L0) ) do_test(instr, vex[x].tbl[t].len + ((void *)ptr - instr), @@ -XXX,XX +XXX,XX @@ void predicates_test(void *instr, struct if ( vex[x].tbl[t].l == LIG || (vex[x].tbl[t].l & L1) ) { ptr[-1] |= 4; - memcpy(ptr, vex[x].tbl[t].opc, vex[x].tbl[t].len); + memcpy(ptr, vex[x].tbl[t].opc, + MIN(vex[x].tbl[t].len, ARRAY_SIZE(vex->tbl->opc))); do_test(instr, vex[x].tbl[t].len + ((void *)ptr - instr), vex[x].tbl[t].modrm ? (void *)ptr - instr + 1 : 0, --- a/tools/tests/x86_emulator/test_x86_emulator.c +++ b/tools/tests/x86_emulator/test_x86_emulator.c @@ -XXX,XX +XXX,XX @@ static int blk( #ifdef __x86_64__ static unsigned long gs_base, gs_base_shadow; +static unsigned long uintr_timer; #endif static int read_segment( @@ -XXX,XX +XXX,XX @@ static int write_segment( return X86EMUL_OKAY; } + +static const uint8_t __attribute__((aligned(0x1000))) umsr_bitmap[0x1000] = { +#define RD(msr) [(msr) >> 3] = 1 << ((msr) & 7) +#define WR(msr) [0x800 + ((msr) >> 3)] = 1 << ((msr) & 7) + RD(MSR_IA32_APERF), + WR(MSR_UINTR_TIMER), +#undef WR +#undef RD +}; #endif static int read_msr( @@ -XXX,XX +XXX,XX @@ static int read_msr( { switch ( reg ) { +#ifdef __x86_64__ + case MSR_USER_MSR_CTL: + *val = (unsigned long)umsr_bitmap | 1; + return X86EMUL_OKAY; +#endif + case MSR_BARRIER: *val = 0; return X86EMUL_OKAY; + case MSR_IA32_APERF: +#define APERF_LO_VALUE 0xAEAEAEAE +#define APERF_HI_VALUE 0xEAEAEAEA + *val = ((uint64_t)APERF_HI_VALUE << 32) | APERF_LO_VALUE; + return X86EMUL_OKAY; + case MSR_EFER: *val = ctxt->addr_size > 32 ? 
EFER_LME | EFER_LMA : 0; return X86EMUL_OKAY; @@ -XXX,XX +XXX,XX @@ static int write_msr( { switch ( reg ) { + case MSR_UINTR_TIMER: + if ( ctxt->addr_size < 64 ) + break; + uintr_timer = val; + return X86EMUL_OKAY; + case MSR_GS_BASE: if ( ctxt->addr_size < 64 || !is_canonical_address(val) ) break; @@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv) (gs_base != 0x0000222244446666UL) || (gs_base_shadow != 0x0000111122224444UL) ) goto fail; + printf("okay\n"); + + printf("%-40s", "Testing urdmsr %rdx,%rcx..."); + instr[0] = 0xf2; instr[1] = 0x0f; instr[2] = 0x38; instr[3] = 0xf8; instr[4] = 0xd1; + regs.rip = (unsigned long)&instr[0]; + regs.rdx = 0x000000e8UL; /* APERF */ + rc = x86_emulate(&ctxt, &emulops); + if ( (rc != X86EMUL_OKAY) || + (regs.rip != (unsigned long)&instr[5]) || + (regs.rcx != (((uint64_t)APERF_HI_VALUE << 32) | APERF_LO_VALUE)) ) + goto fail; + printf("okay\n"); + + printf("%-40s", "Testing urdmsr $0xe8,%rdx..."); + instr[0] = 0xc4; instr[1] = 0xe7; instr[2] = 0x7b; instr[3] = 0xf8; instr[4] = 0xc2; + instr[5] = 0xe8; instr[6] = 0; instr[7] = 0; instr[8] = 0; + regs.rip = (unsigned long)&instr[0]; + rc = x86_emulate(&ctxt, &emulops); + if ( (rc != X86EMUL_OKAY) || + (regs.rip != (unsigned long)&instr[9]) || + (regs.rdx != (((uint64_t)APERF_HI_VALUE << 32) | APERF_LO_VALUE)) ) + goto fail; + printf("okay\n"); + + printf("%-40s", "Testing uwrmsr %rdi,%rsi..."); + instr[0] = 0xf3; instr[1] = 0x0f; instr[2] = 0x38; instr[3] = 0xf8; instr[4] = 0xf7; + regs.rip = (unsigned long)&instr[0]; + regs.rsi = 0x00001b00UL; /* UINTR_TIMER */ + regs.rdi = 0x0011223344556677UL; + rc = x86_emulate(&ctxt, &emulops); + if ( (rc != X86EMUL_OKAY) || + (regs.rip != (unsigned long)&instr[5]) || + (uintr_timer != 0x0011223344556677UL) ) + goto fail; + printf("okay\n"); + + printf("%-40s", "Testing uwrmsr %rsi,$0x1b00..."); + instr[0] = 0xc4; instr[1] = 0xe7; instr[2] = 0x7a; instr[3] = 0xf8; instr[4] = 0xc6; + instr[5] = 0x00; instr[6] = 0x1b; instr[7] = 0; instr[8] = 0; + regs.rip = (unsigned long)&instr[0]; + regs.rsi = 0x8877665544332211UL; + rc = x86_emulate(&ctxt, &emulops); + if ( (rc != X86EMUL_OKAY) || + (regs.rip != (unsigned long)&instr[9]) || + (uintr_timer != 0x8877665544332211UL) ) + goto fail; + printf("okay\n"); + + printf("%-40s", "Testing uwrmsr %rsi,$0x1b01..."); + instr[5] = 0x01; /* UARCH_MISC_CTL (derived from UINTR_TIMER) */ + regs.rip = (unsigned long)&instr[0]; + regs.rsi = 0; + rc = x86_emulate(&ctxt, &emulops); + if ( (rc != X86EMUL_EXCEPTION) || + (regs.rip != (unsigned long)&instr[0]) || + (uintr_timer != 0x8877665544332211UL) ) + goto fail; emulops.write_msr = NULL; #endif --- a/xen/arch/x86/include/asm/msr-index.h +++ b/xen/arch/x86/include/asm/msr-index.h @@ -XXX,XX +XXX,XX @@ #define APIC_BASE_ENABLE (_AC(1, ULL) << 11) #define APIC_BASE_ADDR_MASK _AC(0x000ffffffffff000, ULL) +#define MSR_USER_MSR_CTL 0x0000001c +#define USER_MSR_ENABLE (_AC(1, ULL) << 0) +#define USER_MSR_ADDR_MASK 0xfffffffffffff000ULL + #define MSR_BARRIER 0x0000002f #define MSR_TEST_CTRL 0x00000033 @@ -XXX,XX +XXX,XX @@ #define MCU_CONTROL_DIS_MCU_LOAD (_AC(1, ULL) << 1) #define MCU_CONTROL_EN_SMM_BYPASS (_AC(1, ULL) << 2) +#define MSR_UINTR_TIMER 0x00001b00 + #define MSR_UARCH_MISC_CTRL 0x00001b01 #define UARCH_CTRL_DOITM (_AC(1, ULL) << 0) --- a/xen/arch/x86/x86_emulate/decode.c +++ b/xen/arch/x86/x86_emulate/decode.c @@ -XXX,XX +XXX,XX @@ decode_0f38(struct x86_emulate_state *s, { case 0x00 ... 0xef: case 0xf2 ... 0xf5: - case 0xf7 ... 0xf8: + case 0xf7: case 0xfa ... 
0xff: s->op_bytes = 0; /* fall through */ @@ -XXX,XX +XXX,XX @@ decode_0f38(struct x86_emulate_state *s, case X86EMUL_OPC_VEX_F2(0, 0xf7): /* shrx */ break; + case 0xf8: + if ( s->modrm_mod == 3 ) /* u{rd,wr}msr */ + { + s->desc = DstMem | SrcReg | Mov; + s->op_bytes = 8; + s->simd_size = simd_none; + } + else /* movdir64b / enqcmd{,s} */ + s->op_bytes = 0; + ctxt->opcode |= MASK_INSR(s->vex.pfx, X86EMUL_OPC_PFX_MASK); + break; + default: s->op_bytes = 0; break; @@ -XXX,XX +XXX,XX @@ int x86emul_decode(struct x86_emulate_st */ d = twobyte_table[0x38].desc; break; + + case vex_map7: + opcode |= MASK_INSR(7, X86EMUL_OPC_EXT_MASK); + /* + * No table lookup here for now, as there's only a single + * opcode point (0xf8) populated in map 7. + */ + d = DstMem | SrcImm | ModRM | Mov; + s->op_bytes = 8; + break; } } else if ( s->ext < ext_8f08 + ARRAY_SIZE(xop_table) ) @@ -XXX,XX +XXX,XX @@ int x86emul_decode(struct x86_emulate_st s->simd_size = ext8f09_table[b].simd_size; break; + case ext_map7: case ext_8f08: case ext_8f0a: /* @@ -XXX,XX +XXX,XX @@ int x86emul_decode(struct x86_emulate_st case ext_map5: case ext_map6: + case ext_map7: case ext_8f09: case ext_8f0a: break; --- a/xen/arch/x86/x86_emulate/private.h +++ b/xen/arch/x86/x86_emulate/private.h @@ -XXX,XX +XXX,XX @@ enum vex_opcx { vex_0f3a, evex_map5 = 5, evex_map6, + vex_map7, }; enum vex_pfx { @@ -XXX,XX +XXX,XX @@ struct x86_emulate_state { ext_0f3a = vex_0f3a, ext_map5 = evex_map5, ext_map6 = evex_map6, + ext_map7 = vex_map7, /* * For XOP use values such that the respective instruction field * can be used without adjustment. --- a/xen/arch/x86/x86_emulate/x86_emulate.c +++ b/xen/arch/x86/x86_emulate/x86_emulate.c @@ -XXX,XX +XXX,XX @@ x86_emulate( state->simd_size = simd_none; break; - case X86EMUL_OPC_F2(0x0f38, 0xf8): /* enqcmd r,m512 */ - case X86EMUL_OPC_F3(0x0f38, 0xf8): /* enqcmds r,m512 */ + case X86EMUL_OPC_F2(0x0f38, 0xf8): /* enqcmd r,m512 / urdmsr r32,r64 */ + case X86EMUL_OPC_F3(0x0f38, 0xf8): /* enqcmds r,m512 / uwrmsr r64,r32 */ + if ( ea.type == OP_MEM ) + goto enqcmd; + imm1 = src.val; + /* fall through */ + case X86EMUL_OPC_VEX_F2(7, 0xf8): /* urdmsr imm32,r64 */ + case X86EMUL_OPC_VEX_F3(7, 0xf8): /* uwrmsr r64,imm32 */ + generate_exception_if(!mode_64bit() || ea.type != OP_REG, X86_EXC_UD); + generate_exception_if(vex.l || vex.w, X86_EXC_UD); + generate_exception_if(vex.opcx && ((modrm_reg & 7) || vex.reg != 0xf), + X86_EXC_UD); + fail_if(!ops->read_msr); + if ( ops->read_msr(MSR_USER_MSR_CTL, &msr_val, ctxt) != X86EMUL_OKAY ) + { + x86_emul_reset_event(ctxt); + msr_val = 0; + } + generate_exception_if(!(msr_val & USER_MSR_ENABLE), X86_EXC_UD); + generate_exception_if(imm1 & ~0x3fff, X86_EXC_GP, 0); + + /* Check the corresponding bitmap. */ + ea.mem.off = msr_val & ~0xfff; + if ( vex.pfx != vex_f2 ) + ea.mem.off += 0x800; + ea.mem.off += imm1 >> 3; + if ( (rc = ops->read(x86_seg_sys, ea.mem.off, &b, 1, + ctxt)) != X86EMUL_OKAY ) + goto done; + generate_exception_if(!(b & (1 << (imm1 & 7))), X86_EXC_GP, 0); + + /* Carry out the actual MSR access. 
*/ + if ( vex.pfx == vex_f2 ) + { + /* urdmsr */ + if ( (rc = ops->read_msr(imm1, &msr_val, ctxt)) != X86EMUL_OKAY ) + goto done; + dst.val = msr_val; + ASSERT(dst.type == OP_REG); + dst.bytes = 8; + } + else + { + /* uwrmsr */ + switch ( imm1 ) + { + case 0x1b00: /* UINTR_TIMER */ + case 0x1b01: /* UARCH_MISC_CTL */ + break; + default: + generate_exception(X86_EXC_GP, 0); + } + fail_if(!ops->write_msr); + if ( (rc = ops->write_msr(imm1, dst.val, ctxt)) != X86EMUL_OKAY ) + goto done; + dst.type = OP_NONE; + } + break; + + enqcmd: host_and_vcpu_must_have(enqcmd); - generate_exception_if(ea.type != OP_MEM, X86_EXC_UD); generate_exception_if(vex.pfx != vex_f2 && !mode_ring0(), X86_EXC_GP, 0); src.val = truncate_ea(*dst.reg); generate_exception_if(!is_aligned(x86_seg_es, src.val, 64, ctxt, ops), --- a/xen/include/public/arch-x86/cpufeatureset.h +++ b/xen/include/public/arch-x86/cpufeatureset.h @@ -XXX,XX +XXX,XX @@ XEN_CPUFEATURE(AVX_NE_CONVERT, 15*32 XEN_CPUFEATURE(AMX_COMPLEX, 15*32+ 8) /* AMX Complex Instructions */ XEN_CPUFEATURE(AVX_VNNI_INT16, 15*32+10) /*A AVX-VNNI-INT16 Instructions */ XEN_CPUFEATURE(PREFETCHI, 15*32+14) /*A PREFETCHIT{0,1} Instructions */ +XEN_CPUFEATURE(USER_MSR, 15*32+15) /* U{RD,WR}MSR Instructions */ XEN_CPUFEATURE(CET_SSS, 15*32+18) /* CET Supervisor Shadow Stacks safe to use */ /* Intel-defined CPU features, MSR_ARCH_CAPS 0x10a.eax, word 16 */ --- a/xen/tools/gen-cpuid.py +++ b/xen/tools/gen-cpuid.py @@ -XXX,XX +XXX,XX @@ def crunch_numbers(state): # NO_LMSL indicates the absense of Long Mode Segment Limits, which # have been dropped in hardware. LM: [CX16, PCID, LAHF_LM, PAGE1GB, PKU, NO_LMSL, AMX_TILE, CMPCCXADD, - LKGS, MSRLIST], + LKGS, MSRLIST, USER_MSR], # AMD K6-2+ and K6-III processors shipped with 3DNow+, beyond the # standard 3DNow in the earlier K6 processors.
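To illustrate the bitmap lookup the new emulator code performs (and which umsr_bitmap[] in the test harness is laid out for), here is a stand-alone sketch; it is not Xen code, user_msr_allowed() is a hypothetical name, and the bitmap is reached through a virtual address only because this is a host-side model. Bytes 0x000-0x7ff of the 4KiB bitmap cover URDMSR, bytes 0x800-0xfff cover UWRMSR, and MSR indices are limited to 0x3fff.

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define USER_MSR_ENABLE (1ULL << 0)

static bool user_msr_allowed(uint64_t user_msr_ctl, uint32_t msr, bool write)
{
    const uint8_t *bitmap;

    if ( !(user_msr_ctl & USER_MSR_ENABLE) || msr > 0x3fff )
        return false;

    /* Bits 63:12 locate the 4KiB bitmap; the write half sits at +0x800. */
    bitmap = (const uint8_t *)(uintptr_t)(user_msr_ctl & ~0xfffULL);
    if ( write )
        bitmap += 0x800;

    return bitmap[msr >> 3] & (1 << (msr & 7));
}

int main(void)
{
    static const uint8_t __attribute__((aligned(0x1000))) bitmap[0x1000] = {
        [0xe8 >> 3] = 1 << (0xe8 & 7),               /* read: APERF */
        [0x800 + (0x1b00 >> 3)] = 1 << (0x1b00 & 7), /* write: UINTR_TIMER */
    };
    uint64_t ctl = (uintptr_t)bitmap | USER_MSR_ENABLE;

    assert(user_msr_allowed(ctl, 0xe8, false));
    assert(!user_msr_allowed(ctl, 0xe8, true));
    assert(user_msr_allowed(ctl, 0x1b00, true));
    return 0;
}

A clear bit yields #GP(0) in the emulation above, matching the generate_exception_if() on the fetched byte.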
Move the PKS check into an "else" for the corresponding "if()", such that further adjustments (like for USER_MSR) can easily be put there as well. Signed-off-by: Jan Beulich <jbeulich@suse.com> --- v5: Re-base. v4: New. --- a/xen/arch/x86/cpu-policy.c +++ b/xen/arch/x86/cpu-policy.c @@ -XXX,XX +XXX,XX @@ static void __init calculate_hvm_max_pol if ( !cpu_has_vmx_xsaves ) __clear_bit(X86_FEATURE_XSAVES, fs); } + else + { + /* + * Xen doesn't use PKS, so the guest support for it has opted to not use + * the VMCS load/save controls for efficiency reasons. This depends on + * the exact vmentry/exit behaviour, so don't expose PKS in other + * situations until someone has cross-checked the behaviour for safety. + */ + __clear_bit(X86_FEATURE_PKS, fs); + } if ( !cpu_has_vmx_msrlist ) __clear_bit(X86_FEATURE_MSRLIST, fs); - /* - * Xen doesn't use PKS, so the guest support for it has opted to not use - * the VMCS load/save controls for efficiency reasons. This depends on - * the exact vmentry/exit behaviour, so don't expose PKS in other - * situations until someone has cross-checked the behaviour for safety. - */ - if ( !cpu_has_vmx ) - __clear_bit(X86_FEATURE_PKS, fs); - /* * Make adjustments to possible (nested) virtualization features exposed * to the guest
Hook up the new VM exit codes and handle guest accesses, context switch, and save/restore. At least for now don't allow the guest direct access to the control MSR; this may need changing if guests were to frequently access it (e.g. on their own context switch path). While there, also correct an off-by-one in union ldt_or_tr_instr_info's comment. Signed-off-by: Jan Beulich <jbeulich@suse.com> --- Needing to change two places in hvm.c continues to be unhelpful; I recall I already forgot to also adjust hvm_load_cpu_msrs() for XFD. Considering that MSRs typically arrive in the order the table has it, couldn't we incrementally look up the incoming MSR index there, falling back to a full lookup only when the incremental lookup failed (and thus not normally re-iterating through the initial part of the array)? Said comment in union ldt_or_tr_instr_info is further odd (same for union gdt_or_idt_instr_info's) in that Instruction Information is only a 32-bit field. Hence bits 32-63 aren't undefined, but simply don't exist. RFC: The wee attempt to "deal" with nested is likely wrong, but I'm afraid I simply don't know where such enforcement would be done properly. Returning an error there is also commented out, for domain_cpu_policy_changed() returning void without "x86/xstate: re-size save area when CPUID policy changes" in place. --- v5: Introduce user_msr_gpr(). v4: New. --- a/xen/arch/x86/cpu-policy.c +++ b/xen/arch/x86/cpu-policy.c @@ -XXX,XX +XXX,XX @@ static void __init calculate_hvm_max_pol * situations until someone has cross-checked the behaviour for safety. */ __clear_bit(X86_FEATURE_PKS, fs); + + /* + * Don't expose USER_MSR until it is known how (if at all) it is + * virtualized on SVM. + */ + __clear_bit(X86_FEATURE_USER_MSR, fs); } if ( !cpu_has_vmx_msrlist ) --- a/xen/arch/x86/domain.c +++ b/xen/arch/x86/domain.c @@ -XXX,XX +XXX,XX @@ void domain_cpu_policy_changed(struct do } } + /* Nested doesn't have the necessary processing, yet. */ + if ( nestedhvm_enabled(d) && p->feat.user_msr ) + return /* -EINVAL */; + for_each_vcpu ( d, v ) { cpu_policy_updated(v); --- a/xen/arch/x86/hvm/hvm.c +++ b/xen/arch/x86/hvm/hvm.c @@ -XXX,XX +XXX,XX @@ static int cf_check hvm_load_cpu_xsave_s #define HVM_CPU_MSR_SIZE(cnt) offsetof(struct hvm_msr, msr[cnt]) static const uint32_t msrs_to_send[] = { + MSR_USER_MSR_CTL, MSR_SPEC_CTRL, MSR_INTEL_MISC_FEATURES_ENABLES, MSR_PKRS, @@ -XXX,XX +XXX,XX @@ static int cf_check hvm_load_cpu_msrs(st { int rc; + case MSR_USER_MSR_CTL: + case MSR_SPEC_CTRL: case MSR_INTEL_MISC_FEATURES_ENABLES: case MSR_PKRS: --- a/xen/arch/x86/hvm/vmx/vmx.c +++ b/xen/arch/x86/hvm/vmx/vmx.c @@ -XXX,XX +XXX,XX @@ static void cf_check vmx_vcpu_destroy(st } /* - * To avoid MSR save/restore at every VM exit/entry time, we restore - * the x86_64 specific MSRs at domain switch time. Since these MSRs - * are not modified once set for para domains, we don't save them, - * but simply reset them to values set in percpu_traps_init(). + * To avoid MSR save/restore at every VM exit/entry time, we restore the + * x86_64 specific MSRs at vcpu switch time. Since these MSRs are not + * modified once set for para domains, we don't save them, but simply clear + * them or reset them to values set in percpu_traps_init(). */ -static void vmx_restore_host_msrs(void) +static void vmx_restore_host_msrs(const struct vcpu *v) { + const struct vcpu_msrs *msrs = v->arch.msrs; + + if ( msrs->user_msr_ctl.enable ) + wrmsrl(MSR_USER_MSR_CTL, 0); + /* No PV guests? No need to restore host SYSCALL infrastructure. 
*/ if ( !IS_ENABLED(CONFIG_PV) ) return; @@ -XXX,XX +XXX,XX @@ static void vmx_restore_guest_msrs(struc if ( cp->feat.pks ) wrpkrs(msrs->pkrs); + + if ( msrs->user_msr_ctl.enable ) + wrmsrl(MSR_USER_MSR_CTL, msrs->user_msr_ctl.raw); } void vmx_update_cpu_exec_control(struct vcpu *v) @@ -XXX,XX +XXX,XX @@ static void cf_check vmx_ctxt_switch_fro if ( !v->arch.fully_eager_fpu ) vmx_fpu_leave(v); vmx_save_guest_msrs(v); - vmx_restore_host_msrs(); + vmx_restore_host_msrs(v); vmx_save_dr(v); if ( v->domain->arch.hvm.pi_ops.flags & PI_CSW_FROM ) @@ -XXX,XX +XXX,XX @@ static int vmx_handle_apic_write(void) return vlapic_apicv_write(current, exit_qualification & 0xfff); } +static unsigned int user_msr_gpr(void) +{ + user_msr_instr_info_t info; + + __vmread(VMX_INSTRUCTION_INFO, &info.raw); + return info.gpr; +} + static void undo_nmis_unblocked_by_iret(void) { unsigned long guest_info; @@ -XXX,XX +XXX,XX @@ void asmlinkage vmx_vmexit_handler(struc hvm_inject_hw_exception(X86_EXC_GP, 0); break; + case EXIT_REASON_URDMSR: + { + uint64_t msr_content = 0; + + __vmread(EXIT_QUALIFICATION, &exit_qualification); + switch ( hvm_msr_read_intercept(exit_qualification, &msr_content) ) + { + case X86EMUL_OKAY: + *decode_gpr(regs, user_msr_gpr()) = msr_content; + update_guest_eip(); /* Safe: URDMSR */ + break; + + case X86EMUL_EXCEPTION: + hvm_inject_hw_exception(X86_EXC_GP, 0); + break; + } + break; + } + + case EXIT_REASON_UWRMSR: + __vmread(EXIT_QUALIFICATION, &exit_qualification); + switch ( hvm_msr_write_intercept(exit_qualification, + *decode_gpr(regs, user_msr_gpr()), + true) ) + { + case X86EMUL_OKAY: + update_guest_eip(); /* Safe: UWRMSR */ + break; + + case X86EMUL_EXCEPTION: + hvm_inject_hw_exception(X86_EXC_GP, 0); + break; + } + break; + case EXIT_REASON_VMXOFF: case EXIT_REASON_VMXON: case EXIT_REASON_VMCLEAR: --- a/xen/arch/x86/include/asm/hvm/vmx/vmx.h +++ b/xen/arch/x86/include/asm/hvm/vmx/vmx.h @@ -XXX,XX +XXX,XX @@ static inline void pi_clear_sn(struct pi #define EXIT_REASON_NOTIFY 75 #define EXIT_REASON_RDMSRLIST 78 #define EXIT_REASON_WRMSRLIST 79 +#define EXIT_REASON_URDMSR 80 +#define EXIT_REASON_UWRMSR 81 /* Remember to also update VMX_PERF_EXIT_REASON_SIZE! */ /* @@ -XXX,XX +XXX,XX @@ typedef union ldt_or_tr_instr_info { base_reg_invalid :1, /* bit 27 - Base register invalid */ instr_identity :1, /* bit 28 - 0:LDT, 1:TR */ instr_write :1, /* bit 29 - 0:store, 1:load */ - :34; /* bits 31:63 - Undefined */ + :34; /* bits 30:63 - Undefined */ }; } ldt_or_tr_instr_info_t; +/* VM-Exit instruction info for URDMSR and UWRMSR */ +typedef union user_msr_instr_info { + unsigned long raw; + struct { + unsigned int :3, /* Bits 0:2 - Undefined */ + gpr :4, /* Bits 3:6 - Source/Destination register */ + :25; /* bits 7:31 - Undefined */ + }; +} user_msr_instr_info_t; + #endif /* __ASM_X86_HVM_VMX_VMX_H__ */ --- a/xen/arch/x86/include/asm/msr.h +++ b/xen/arch/x86/include/asm/msr.h @@ -XXX,XX +XXX,XX @@ uint64_t msr_spec_ctrl_valid_bits(const struct vcpu_msrs { /* + * 0x0000001c - MSR_USER_MSR_CTL + * + * Value is guest chosen, and always loaded in vcpu context. 
+ */ + union { + uint64_t raw; + struct { + bool enable:1; + unsigned int :11; + unsigned long bitmap:52; + }; + } user_msr_ctl; + + /* * 0x00000048 - MSR_SPEC_CTRL * 0xc001011f - MSR_VIRT_SPEC_CTRL (if X86_FEATURE_AMD_SSBD) * --- a/xen/arch/x86/include/asm/perfc_defn.h +++ b/xen/arch/x86/include/asm/perfc_defn.h @@ -XXX,XX +XXX,XX @@ PERFCOUNTER_ARRAY(exceptions, #ifdef CONFIG_HVM -#define VMX_PERF_EXIT_REASON_SIZE 80 +#define VMX_PERF_EXIT_REASON_SIZE 82 #define VMEXIT_NPF_PERFC 143 #define SVM_PERF_EXIT_REASON_SIZE (VMEXIT_NPF_PERFC + 1) PERFCOUNTER_ARRAY(vmexits, "vmexits", --- a/xen/arch/x86/msr.c +++ b/xen/arch/x86/msr.c @@ -XXX,XX +XXX,XX @@ int guest_rdmsr(struct vcpu *v, uint32_t *val = msrs->xss.raw; break; + case MSR_USER_MSR_CTL: + if ( !cp->feat.user_msr ) + goto gp_fault; + *val = msrs->user_msr_ctl.raw; + break; + case 0x40000000 ... 0x400001ff: if ( is_viridian_domain(d) ) { @@ -XXX,XX +XXX,XX @@ int guest_wrmsr(struct vcpu *v, uint32_t msrs->xss.raw = val; break; + case MSR_USER_MSR_CTL: + if ( !cp->feat.user_msr ) + goto gp_fault; + + if ( (val & ~(USER_MSR_ENABLE | USER_MSR_ADDR_MASK)) || + !is_canonical_address(val) ) + goto gp_fault; + + msrs->user_msr_ctl.raw = val; + if ( v == curr ) + wrmsrl(MSR_USER_MSR_CTL, val); + break; + case 0x40000000 ... 0x400001ff: if ( is_viridian_domain(d) ) { --- a/xen/include/public/arch-x86/cpufeatureset.h +++ b/xen/include/public/arch-x86/cpufeatureset.h @@ -XXX,XX +XXX,XX @@ XEN_CPUFEATURE(AVX_NE_CONVERT, 15*32 XEN_CPUFEATURE(AMX_COMPLEX, 15*32+ 8) /* AMX Complex Instructions */ XEN_CPUFEATURE(AVX_VNNI_INT16, 15*32+10) /*A AVX-VNNI-INT16 Instructions */ XEN_CPUFEATURE(PREFETCHI, 15*32+14) /*A PREFETCHIT{0,1} Instructions */ -XEN_CPUFEATURE(USER_MSR, 15*32+15) /* U{RD,WR}MSR Instructions */ +XEN_CPUFEATURE(USER_MSR, 15*32+15) /*s U{RD,WR}MSR Instructions */ XEN_CPUFEATURE(CET_SSS, 15*32+18) /* CET Supervisor Shadow Stacks safe to use */ /* Intel-defined CPU features, MSR_ARCH_CAPS 0x10a.eax, word 16 */
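Lastly, the user_msr_instr_info_t union introduced above reduces to a single shift-and-mask. A stand-alone sketch (not Xen code; user_msr_gpr_model() is a hypothetical name) of how the register operand is recovered on URDMSR/UWRMSR exits, with the MSR index itself arriving via the exit qualification:

#include <assert.h>
#include <stdint.h>

/* Bits 6:3 of the VM-exit instruction information name the GPR operand. */
static unsigned int user_msr_gpr_model(uint64_t instr_info)
{
    return (instr_info >> 3) & 0xf;
}

int main(void)
{
    assert(user_msr_gpr_model(2 << 3) == 2);    /* %rdx */
    assert(user_msr_gpr_model(0xf << 3) == 15); /* %r15 */
    return 0;
}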