These are "compound" instructions to issue a series of RDMSR / WRMSR
respectively. In the emulator we can therefore implement them by using
the existing msr_{read,write}() hooks. The memory accesses rely on the
HVM ->read() / ->write() hooks already being linear-address
(x86_seg_none) aware (by way of hvmemul_virtual_to_linear() handling
this case).
Preemption is checked for in WRMSRLIST handling only, as only MSR
writes are expected to possibly take a long time.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
RFC: In vmx_vmexit_handler() handling is forwarded to the emulator
blindly. Alternatively we could consult the exit qualification and
process just a single MSR at a time (without involving the
emulator), exiting back to the guest after every iteration. (I
don't think a mix of both models makes a lot of sense.)
The precise behavior of MSR_BARRIER is still not spelled out in ISE 050,
so the (minimal) implementation continues to be a guess for now.
Wouldn't it be better for calculate_hvm_max_policy() to treat MPX the
same way as done here, at least from an abstract perspective (assuming
that AMD won't add such functionality now that Intel have deprecated it)?
---
v6: Use MSR constants in test harness. Re-base.
v5: Add missing vmx_init_vmcs_config() and construct_vmcs() adjustments.
Avoid unnecessary uses of r(). Re-base.
v3: Add dependency on LM. Limit exposure to HVM. Utilize new info from
ISE 050. Re-base.
v2: Use X86_EXC_*. Add preemption checking to WRMSRLIST handling. Remove
the feature from "max" when the VMX counterpart isn't available.
--- a/tools/tests/x86_emulator/predicates.c
+++ b/tools/tests/x86_emulator/predicates.c
@@ -342,6 +342,8 @@ static const struct {
{ { 0x01, 0xc4 }, { 2, 2 }, F, N }, /* vmxoff */
{ { 0x01, 0xc5 }, { 2, 2 }, F, N }, /* pconfig */
{ { 0x01, 0xc6 }, { 2, 2 }, F, N }, /* wrmsrns */
+ { { 0x01, 0xc6 }, { 0, 2 }, F, W, pfx_f2 }, /* rdmsrlist */
+ { { 0x01, 0xc6 }, { 0, 2 }, F, R, pfx_f3 }, /* wrmsrlist */
{ { 0x01, 0xc8 }, { 2, 2 }, F, N }, /* monitor */
{ { 0x01, 0xc9 }, { 2, 2 }, F, N }, /* mwait */
{ { 0x01, 0xca }, { 2, 2 }, F, N }, /* clac */
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -625,7 +625,7 @@ static int write(
if ( verbose )
printf("** %s(%u, %p,, %u,)\n", __func__, seg, (void *)offset, bytes);
- if ( !is_x86_user_segment(seg) )
+ if ( !is_x86_user_segment(seg) && seg != x86_seg_none )
return X86EMUL_UNHANDLEABLE;
memcpy((void *)offset, p_data, bytes);
return X86EMUL_OKAY;
@@ -717,6 +717,10 @@ static int read_msr(
{
switch ( reg )
{
+ case MSR_BARRIER:
+ *val = 0;
+ return X86EMUL_OKAY;
+
case MSR_EFER:
*val = ctxt->addr_size > 32 ? EFER_LME | EFER_LMA : 0;
return X86EMUL_OKAY;
@@ -1434,9 +1438,53 @@ int main(int argc, char **argv)
(gs_base != 0x0000111122224444UL) ||
gs_base_shadow )
goto fail;
+ printf("okay\n");
cpu_policy.extd.nscb = i;
emulops.write_segment = NULL;
+
+ printf("%-40s", "Testing rdmsrlist...");
+ instr[0] = 0xf2; instr[1] = 0x0f; instr[2] = 0x01; instr[3] = 0xc6;
+ regs.rip = (unsigned long)&instr[0];
+ regs.rsi = (unsigned long)(res + 0x80);
+ regs.rdi = (unsigned long)(res + 0x80 + 0x40 * 2);
+ regs.rcx = 0x0002000100008000UL;
+ gs_base_shadow = 0x0000222244446666UL;
+ memset(res + 0x80, ~0, 0x40 * 8 * 2);
+ res[0x80 + 0x0f * 2] = MSR_GS_BASE;
+ res[0x80 + 0x0f * 2 + 1] = 0;
+ res[0x80 + 0x20 * 2] = MSR_SHADOW_GS_BASE;
+ res[0x80 + 0x20 * 2 + 1] = 0;
+ res[0x80 + 0x31 * 2] = MSR_BARRIER;
+ res[0x80 + 0x31 * 2 + 1] = 0;
+ rc = x86_emulate(&ctxt, &emulops);
+ if ( (rc != X86EMUL_OKAY) ||
+ (regs.rip != (unsigned long)&instr[4]) ||
+ regs.rcx ||
+ (res[0x80 + (0x40 + 0x0f) * 2] != (unsigned int)gs_base) ||
+ (res[0x80 + (0x40 + 0x0f) * 2 + 1] != (gs_base >> (8 * sizeof(int)))) ||
+ (res[0x80 + (0x40 + 0x20) * 2] != (unsigned int)gs_base_shadow) ||
+ (res[0x80 + (0x40 + 0x20) * 2 + 1] != (gs_base_shadow >> (8 * sizeof(int)))) ||
+ res[0x80 + (0x40 + 0x31) * 2] || res[0x80 + (0x40 + 0x31) * 2 + 1] )
+ goto fail;
+ printf("okay\n");
+
+ printf("%-40s", "Testing wrmsrlist...");
+ instr[0] = 0xf3; instr[1] = 0x0f; instr[2] = 0x01; instr[3] = 0xc6;
+ regs.eip = (unsigned long)&instr[0];
+ regs.rsi -= 0x11 * 8;
+ regs.rdi -= 0x11 * 8;
+ regs.rcx = 0x0002000100000000UL;
+ res[0x80 + 0x0f * 2] = MSR_SHADOW_GS_BASE;
+ res[0x80 + 0x20 * 2] = MSR_GS_BASE;
+ rc = x86_emulate(&ctxt, &emulops);
+ if ( (rc != X86EMUL_OKAY) ||
+ (regs.rip != (unsigned long)&instr[4]) ||
+ regs.rcx ||
+ (gs_base != 0x0000222244446666UL) ||
+ (gs_base_shadow != 0x0000111122224444UL) )
+ goto fail;
+
emulops.write_msr = NULL;
#endif
printf("okay\n");
--- a/tools/tests/x86_emulator/x86-emulate.c
+++ b/tools/tests/x86_emulator/x86-emulate.c
@@ -87,6 +87,7 @@ bool emul_test_init(void)
cpu_policy.feat.rdpid = true;
cpu_policy.feat.lkgs = true;
cpu_policy.feat.wrmsrns = true;
+ cpu_policy.feat.msrlist = true;
cpu_policy.extd.clzero = true;
if ( cpu_has_xsave )
--- a/xen/arch/x86/cpu-policy.c
+++ b/xen/arch/x86/cpu-policy.c
@@ -745,6 +745,9 @@ static void __init calculate_hvm_max_pol
__clear_bit(X86_FEATURE_XSAVES, fs);
}
+ if ( !cpu_has_vmx_msrlist )
+ __clear_bit(X86_FEATURE_MSRLIST, fs);
+
/*
* Xen doesn't use PKS, so the guest support for it has opted to not use
* the VMCS load/save controls for efficiency reasons. This depends on
--- a/xen/arch/x86/hvm/vmx/vmcs.c
+++ b/xen/arch/x86/hvm/vmx/vmcs.c
@@ -366,7 +366,8 @@ static int vmx_init_vmcs_config(bool bsp
if ( _vmx_cpu_based_exec_control & CPU_BASED_ACTIVATE_TERTIARY_CONTROLS )
{
- uint64_t opt = TERTIARY_EXEC_VIRT_SPEC_CTRL;
+ uint64_t opt = TERTIARY_EXEC_ENABLE_MSRLIST |
+ TERTIARY_EXEC_VIRT_SPEC_CTRL;
_vmx_tertiary_exec_control = adjust_vmx_controls2(
"Tertiary Exec Control", 0, opt,
@@ -1119,7 +1120,8 @@ static int construct_vmcs(struct vcpu *v
v->arch.hvm.vmx.exec_control |= CPU_BASED_RDTSC_EXITING;
v->arch.hvm.vmx.secondary_exec_control = vmx_secondary_exec_control;
- v->arch.hvm.vmx.tertiary_exec_control = vmx_tertiary_exec_control;
+ v->arch.hvm.vmx.tertiary_exec_control = vmx_tertiary_exec_control &
+ ~TERTIARY_EXEC_ENABLE_MSRLIST;
/*
* Disable features which we don't want active by default:
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -871,6 +871,20 @@ static void cf_check vmx_cpuid_policy_ch
else
vmx_set_msr_intercept(v, MSR_PKRS, VMX_MSR_RW);
+ if ( cp->feat.msrlist )
+ {
+ vmx_clear_msr_intercept(v, MSR_BARRIER, VMX_MSR_RW);
+ v->arch.hvm.vmx.tertiary_exec_control |= TERTIARY_EXEC_ENABLE_MSRLIST;
+ vmx_update_tertiary_exec_control(v);
+ }
+ else if ( v->arch.hvm.vmx.tertiary_exec_control &
+ TERTIARY_EXEC_ENABLE_MSRLIST )
+ {
+ vmx_set_msr_intercept(v, MSR_BARRIER, VMX_MSR_RW);
+ v->arch.hvm.vmx.tertiary_exec_control &= ~TERTIARY_EXEC_ENABLE_MSRLIST;
+ vmx_update_tertiary_exec_control(v);
+ }
+
out:
vmx_vmcs_exit(v);
@@ -3732,6 +3746,22 @@ gp_fault:
return X86EMUL_EXCEPTION;
}
+static bool cf_check is_msrlist(
+    const struct x86_emulate_state *state, const struct x86_emulate_ctxt *ctxt)
+{
+    /* Match only 0f 01 encodings whose ModRM decodes to mode 3, reg 0, rm 6. */
+    if ( ctxt->opcode == X86EMUL_OPC(0x0f, 0x01) )
+    {
+        unsigned int rm, reg;
+        int mode = x86_insn_modrm(state, &rm, &reg);
+
+        /* This also includes WRMSRNS; should be okay. */
+        return mode == 3 && rm == 6 && !reg;
+    }
+
+    return false;
+}
+
static void vmx_do_extint(struct cpu_user_regs *regs)
{
unsigned long vector;
@@ -4539,6 +4569,17 @@ void asmlinkage vmx_vmexit_handler(struc
}
break;
+ case EXIT_REASON_RDMSRLIST:
+ case EXIT_REASON_WRMSRLIST:
+ if ( vmx_guest_x86_mode(v) != 8 || !currd->arch.cpuid->feat.msrlist )
+ {
+ ASSERT_UNREACHABLE();
+ hvm_inject_hw_exception(X86_EXC_UD, X86_EVENT_NO_EC);
+ }
+ else if ( !hvm_emulate_one_insn(is_msrlist, "MSR list") )
+ hvm_inject_hw_exception(X86_EXC_GP, 0);
+ break;
+
case EXIT_REASON_VMXOFF:
case EXIT_REASON_VMXON:
case EXIT_REASON_VMCLEAR:
--- a/xen/arch/x86/include/asm/hvm/vmx/vmcs.h
+++ b/xen/arch/x86/include/asm/hvm/vmx/vmcs.h
@@ -267,6 +267,7 @@ extern u32 vmx_secondary_exec_control;
#define TERTIARY_EXEC_EPT_PAGING_WRITE BIT(2, UL)
#define TERTIARY_EXEC_GUEST_PAGING_VERIFY BIT(3, UL)
#define TERTIARY_EXEC_IPI_VIRT BIT(4, UL)
+#define TERTIARY_EXEC_ENABLE_MSRLIST BIT(6, UL)
#define TERTIARY_EXEC_VIRT_SPEC_CTRL BIT(7, UL)
extern uint64_t vmx_tertiary_exec_control;
@@ -391,6 +392,9 @@ extern u64 vmx_ept_vpid_cap;
#define cpu_has_vmx_notify_vm_exiting \
(IS_ENABLED(CONFIG_INTEL_VMX) && \
vmx_secondary_exec_control & SECONDARY_EXEC_NOTIFY_VM_EXITING)
+#define cpu_has_vmx_msrlist \
+ (IS_ENABLED(CONFIG_INTEL_VMX) && \
+ (vmx_tertiary_exec_control & TERTIARY_EXEC_ENABLE_MSRLIST))
#define VMCS_RID_TYPE_MASK 0x80000000U
--- a/xen/arch/x86/include/asm/hvm/vmx/vmx.h
+++ b/xen/arch/x86/include/asm/hvm/vmx/vmx.h
@@ -201,6 +201,8 @@ static inline void pi_clear_sn(struct pi
#define EXIT_REASON_XRSTORS 64
#define EXIT_REASON_BUS_LOCK 74
#define EXIT_REASON_NOTIFY 75
+#define EXIT_REASON_RDMSRLIST 78
+#define EXIT_REASON_WRMSRLIST 79
/* Remember to also update VMX_PERF_EXIT_REASON_SIZE! */
/*
--- a/xen/arch/x86/include/asm/msr-index.h
+++ b/xen/arch/x86/include/asm/msr-index.h
@@ -24,6 +24,8 @@
#define APIC_BASE_ENABLE (_AC(1, ULL) << 11)
#define APIC_BASE_ADDR_MASK _AC(0x000ffffffffff000, ULL)
+#define MSR_BARRIER 0x0000002f
+
#define MSR_TEST_CTRL 0x00000033
#define TEST_CTRL_SPLITLOCK_DETECT (_AC(1, ULL) << 29)
#define TEST_CTRL_SPLITLOCK_DISABLE (_AC(1, ULL) << 31)
--- a/xen/arch/x86/include/asm/perfc_defn.h
+++ b/xen/arch/x86/include/asm/perfc_defn.h
@@ -6,7 +6,7 @@ PERFCOUNTER_ARRAY(exceptions,
#ifdef CONFIG_HVM
-#define VMX_PERF_EXIT_REASON_SIZE 76
+#define VMX_PERF_EXIT_REASON_SIZE 80
#define VMEXIT_NPF_PERFC 143
#define SVM_PERF_EXIT_REASON_SIZE (VMEXIT_NPF_PERFC + 1)
PERFCOUNTER_ARRAY(vmexits, "vmexits",
--- a/xen/arch/x86/msr.c
+++ b/xen/arch/x86/msr.c
@@ -74,6 +74,12 @@ int guest_rdmsr(struct vcpu *v, uint32_t
case MSR_AMD_PPIN:
goto gp_fault;
+ case MSR_BARRIER:
+ if ( !cp->feat.msrlist )
+ goto gp_fault;
+ *val = 0;
+ break;
+
case MSR_IA32_FEATURE_CONTROL:
/*
* Architecturally, availability of this MSR is enumerated by the
@@ -347,6 +353,7 @@ int guest_wrmsr(struct vcpu *v, uint32_t
uint64_t rsvd;
/* Read-only */
+ case MSR_BARRIER:
case MSR_IA32_PLATFORM_ID:
case MSR_CORE_CAPABILITIES:
case MSR_INTEL_CORE_THREAD_COUNT:
--- a/xen/arch/x86/x86_emulate/0f01.c
+++ b/xen/arch/x86/x86_emulate/0f01.c
@@ -11,6 +11,7 @@
#include "private.h"
#ifdef __XEN__
+#include <xen/event.h>
#include <asm/prot-key.h>
#endif
@@ -28,6 +29,7 @@ int x86emul_0f01(struct x86_emulate_stat
switch ( s->modrm )
{
unsigned long base, limit, cr0, cr0w, cr4;
+ unsigned int n;
struct segment_register sreg;
uint64_t msr_val;
@@ -42,6 +44,64 @@ int x86emul_0f01(struct x86_emulate_stat
((uint64_t)regs->r(dx) << 32) | regs->eax,
ctxt);
goto done;
+
+ case vex_f3: /* wrmsrlist */
+ vcpu_must_have(msrlist);
+ generate_exception_if(!mode_64bit(), X86_EXC_UD);
+ generate_exception_if(!mode_ring0() || (regs->esi & 7) ||
+ (regs->edi & 7),
+ X86_EXC_GP, 0);
+ fail_if(!ops->write_msr);
+ while ( regs->r(cx) )
+ {
+ n = __builtin_ffsl(regs->r(cx)) - 1;
+ if ( (rc = ops->read(x86_seg_none, regs->r(si) + n * 8,
+ &msr_val, 8, ctxt)) != X86EMUL_OKAY )
+ break;
+ generate_exception_if(msr_val != (uint32_t)msr_val,
+ X86_EXC_GP, 0);
+ base = msr_val;
+ if ( (rc = ops->read(x86_seg_none, regs->r(di) + n * 8,
+ &msr_val, 8, ctxt)) != X86EMUL_OKAY ||
+ (rc = ops->write_msr(base, msr_val, ctxt)) != X86EMUL_OKAY )
+ break;
+ regs->r(cx) &= ~(1UL << n);
+
+#ifdef __XEN__
+ if ( regs->r(cx) && local_events_need_delivery() )
+ {
+ rc = X86EMUL_RETRY;
+ break;
+ }
+#endif
+ }
+ goto done;
+
+ case vex_f2: /* rdmsrlist */
+ vcpu_must_have(msrlist);
+ generate_exception_if(!mode_64bit(), X86_EXC_UD);
+ generate_exception_if(!mode_ring0() || (regs->esi & 7) ||
+ (regs->edi & 7),
+ X86_EXC_GP, 0);
+ fail_if(!ops->read_msr || !ops->write);
+ while ( regs->r(cx) )
+ {
+ n = __builtin_ffsl(regs->r(cx)) - 1;
+ if ( (rc = ops->read(x86_seg_none, regs->r(si) + n * 8,
+ &msr_val, 8, ctxt)) != X86EMUL_OKAY )
+ break;
+ generate_exception_if(msr_val != (uint32_t)msr_val,
+ X86_EXC_GP, 0);
+ if ( (rc = ops->read_msr(msr_val, &msr_val,
+ ctxt)) != X86EMUL_OKAY ||
+ (rc = ops->write(x86_seg_none, regs->r(di) + n * 8,
+ &msr_val, 8, ctxt)) != X86EMUL_OKAY )
+ break;
+ regs->r(cx) &= ~(1UL << n);
+ }
+ if ( rc != X86EMUL_OKAY )
+ ctxt->regs->r(cx) = regs->r(cx);
+ goto done;
}
generate_exception(X86_EXC_UD);
--- a/xen/arch/x86/x86_emulate/private.h
+++ b/xen/arch/x86/x86_emulate/private.h
@@ -597,6 +597,7 @@ amd_like(const struct x86_emulate_ctxt *
#define vcpu_has_lkgs() (ctxt->cpuid->feat.lkgs)
#define vcpu_has_wrmsrns() (ctxt->cpuid->feat.wrmsrns)
#define vcpu_has_avx_ifma() (ctxt->cpuid->feat.avx_ifma)
+#define vcpu_has_msrlist() (ctxt->cpuid->feat.msrlist)
#define vcpu_has_avx_vnni_int8() (ctxt->cpuid->feat.avx_vnni_int8)
#define vcpu_has_avx_ne_convert() (ctxt->cpuid->feat.avx_ne_convert)
#define vcpu_has_avx_vnni_int16() (ctxt->cpuid->feat.avx_vnni_int16)
--- a/xen/arch/x86/x86_emulate/util.c
+++ b/xen/arch/x86/x86_emulate/util.c
@@ -100,6 +100,9 @@ bool cf_check x86_insn_is_mem_access(con
break;
case X86EMUL_OPC(0x0f, 0x01):
+ /* {RD,WR}MSRLIST */
+ if ( mode_64bit() && s->modrm == 0xc6 )
+ return s->vex.pfx >= vex_f3;
/* Cover CLZERO. */
return (s->modrm_rm & 7) == 4 && (s->modrm_reg & 7) == 7;
}
@@ -160,7 +163,11 @@ bool cf_check x86_insn_is_mem_write(cons
case 0xff: /* Grp5 */
break;
- case X86EMUL_OPC(0x0f, 0x01): /* CLZERO is the odd one. */
+ case X86EMUL_OPC(0x0f, 0x01):
+ /* RDMSRLIST */
+ if ( mode_64bit() && s->modrm == 0xc6 )
+ return s->vex.pfx == vex_f2;
+ /* CLZERO is another odd one. */
return (s->modrm_rm & 7) == 4 && (s->modrm_reg & 7) == 7;
default:
--- a/xen/include/public/arch-x86/cpufeatureset.h
+++ b/xen/include/public/arch-x86/cpufeatureset.h
@@ -313,6 +313,7 @@ XEN_CPUFEATURE(WRMSRNS, 10*32+19) /
XEN_CPUFEATURE(NMI_SRC, 10*32+20) /* NMI-source reporting */
XEN_CPUFEATURE(AMX_FP16, 10*32+21) /* AMX FP16 instruction */
XEN_CPUFEATURE(AVX_IFMA, 10*32+23) /*A AVX-IFMA Instructions */
+XEN_CPUFEATURE(MSRLIST, 10*32+27) /*s MSR list instructions */
/* AMD-defined CPU features, CPUID level 0x80000021.eax, word 11 */
XEN_CPUFEATURE(NO_NEST_BP, 11*32+ 0) /*A No Nested Data Breakpoints */
--- a/xen/tools/gen-cpuid.py
+++ b/xen/tools/gen-cpuid.py
@@ -275,7 +275,7 @@ def crunch_numbers(state):
# NO_LMSL indicates the absense of Long Mode Segment Limits, which
# have been dropped in hardware.
LM: [CX16, PCID, LAHF_LM, PAGE1GB, PKU, NO_LMSL, AMX_TILE, CMPCCXADD,
- LKGS],
+ LKGS, MSRLIST],
# AMD K6-2+ and K6-III processors shipped with 3DNow+, beyond the
# standard 3DNow in the earlier K6 processors.
© 2016 - 2024 Red Hat, Inc.