xen/arch/x86/hvm/vmx/vmcs.c | 10 ++++++- xen/arch/x86/hvm/vmx/vmx.c | 40 ++++++++++++++++++++----- xen/arch/x86/include/asm/hvm/vmx/vmcs.h | 5 ++++ xen/arch/x86/include/asm/msr.h | 9 ++++-- 4 files changed, 54 insertions(+), 10 deletions(-)
The feature is defined in the tertiary exec control, and is available starting
from Sapphire Rapids and Alder Lake CPUs.
When enabled, two extra VMCS fields are used: SPEC_CTRL mask and shadow. Bits
set in mask are not allowed to be toggled by the guest (either set or clear)
and the value in the shadow field is the value the guest expects to be in the
SPEC_CTRL register.
By using it the hypervisor can force the value of SPEC_CTRL bits behind the
guest's back without having to trap all accesses to SPEC_CTRL. It also allows
getting rid of SPEC_CTRL in the guest MSR load list, since the value in the
shadow field will be loaded by the hardware on vmentry.
Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
---
Applies on top of "VMX: tertiary execution control infrastructure"
---
xen/arch/x86/hvm/vmx/vmcs.c | 10 ++++++-
xen/arch/x86/hvm/vmx/vmx.c | 40 ++++++++++++++++++++-----
xen/arch/x86/include/asm/hvm/vmx/vmcs.h | 5 ++++
xen/arch/x86/include/asm/msr.h | 9 ++++--
4 files changed, 54 insertions(+), 10 deletions(-)
diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
index 9e016634ab5c..adcbd014123a 100644
--- a/xen/arch/x86/hvm/vmx/vmcs.c
+++ b/xen/arch/x86/hvm/vmx/vmcs.c
@@ -202,6 +202,7 @@ static void __init vmx_display_features(void)
P(cpu_has_vmx_tsc_scaling, "TSC Scaling");
P(cpu_has_vmx_bus_lock_detection, "Bus Lock Detection");
P(cpu_has_vmx_notify_vm_exiting, "Notify VM Exit");
+ P(cpu_has_vmx_virt_spec_ctrl, "Virtualize SPEC_CTRL");
#undef P
if ( !printed )
@@ -365,7 +366,7 @@ static int vmx_init_vmcs_config(bool bsp)
if ( _vmx_cpu_based_exec_control & CPU_BASED_ACTIVATE_TERTIARY_CONTROLS )
{
- uint64_t opt = 0;
+ uint64_t opt = TERTIARY_EXEC_VIRT_SPEC_CTRL;
_vmx_tertiary_exec_control = adjust_vmx_controls2(
"Tertiary Exec Control", 0, opt,
@@ -1378,6 +1379,10 @@ static int construct_vmcs(struct vcpu *v)
rc = vmx_add_msr(v, MSR_PRED_CMD, PRED_CMD_IBPB,
VMX_MSR_HOST);
+ /* Set any bits we don't allow toggling in the mask field. */
+ if ( cpu_has_vmx_virt_spec_ctrl && v->arch.msrs->spec_ctrl.raw )
+ __vmwrite(SPEC_CTRL_MASK, v->arch.msrs->spec_ctrl.raw);
+
out:
vmx_vmcs_exit(v);
@@ -2086,6 +2091,9 @@ void vmcs_dump_vcpu(struct vcpu *v)
if ( v->arch.hvm.vmx.secondary_exec_control &
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY )
printk("InterruptStatus = %04x\n", vmr16(GUEST_INTR_STATUS));
+ if ( cpu_has_vmx_virt_spec_ctrl )
+ printk("SPEC_CTRL mask = %#016lx shadow = %#016lx\n",
+ vmr(SPEC_CTRL_MASK), vmr(SPEC_CTRL_SHADOW));
printk("*** Host State ***\n");
printk("RIP = 0x%016lx (%ps) RSP = 0x%016lx\n",
diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index 48376cc32751..3911e4ecb0d6 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -823,18 +823,28 @@ static void cf_check vmx_cpuid_policy_changed(struct vcpu *v)
{
vmx_clear_msr_intercept(v, MSR_SPEC_CTRL, VMX_MSR_RW);
- rc = vmx_add_guest_msr(v, MSR_SPEC_CTRL, 0);
- if ( rc )
- goto out;
+ if ( !cpu_has_vmx_virt_spec_ctrl )
+ {
+ rc = vmx_add_guest_msr(v, MSR_SPEC_CTRL, 0);
+ if ( rc )
+ goto out;
+ }
}
else
{
vmx_set_msr_intercept(v, MSR_SPEC_CTRL, VMX_MSR_RW);
- rc = vmx_del_msr(v, MSR_SPEC_CTRL, VMX_MSR_GUEST);
- if ( rc && rc != -ESRCH )
- goto out;
- rc = 0; /* Tolerate -ESRCH */
+ /*
+ * NB: there's no need to clear the virtualize SPEC_CTRL control, as
+ * the MSR intercept takes precedence.
+ */
+ if ( !cpu_has_vmx_virt_spec_ctrl )
+ {
+ rc = vmx_del_msr(v, MSR_SPEC_CTRL, VMX_MSR_GUEST);
+ if ( rc && rc != -ESRCH )
+ goto out;
+ rc = 0; /* Tolerate -ESRCH */
+ }
}
/* MSR_PRED_CMD is safe to pass through if the guest knows about it. */
@@ -2629,6 +2639,9 @@ static uint64_t cf_check vmx_get_reg(struct vcpu *v, unsigned int reg)
switch ( reg )
{
case MSR_SPEC_CTRL:
+ if ( cpu_has_vmx_virt_spec_ctrl )
+ /* Requires remote VMCS loaded - fetched below. */
+ break;
rc = vmx_read_guest_msr(v, reg, &val);
if ( rc )
{
@@ -2652,6 +2665,11 @@ static uint64_t cf_check vmx_get_reg(struct vcpu *v, unsigned int reg)
vmx_vmcs_enter(v);
switch ( reg )
{
+ case MSR_SPEC_CTRL:
+ ASSERT(cpu_has_vmx_virt_spec_ctrl);
+ __vmread(SPEC_CTRL_SHADOW, &val);
+ break;
+
case MSR_IA32_BNDCFGS:
__vmread(GUEST_BNDCFGS, &val);
break;
@@ -2678,6 +2696,9 @@ static void cf_check vmx_set_reg(struct vcpu *v, unsigned int reg, uint64_t val)
switch ( reg )
{
case MSR_SPEC_CTRL:
+ if ( cpu_has_vmx_virt_spec_ctrl )
+ /* Requires remote VMCS loaded - written below. */
+ break;
rc = vmx_write_guest_msr(v, reg, val);
if ( rc )
{
@@ -2698,6 +2719,11 @@ static void cf_check vmx_set_reg(struct vcpu *v, unsigned int reg, uint64_t val)
vmx_vmcs_enter(v);
switch ( reg )
{
+ case MSR_SPEC_CTRL:
+ ASSERT(cpu_has_vmx_virt_spec_ctrl);
+ __vmwrite(SPEC_CTRL_SHADOW, val);
+ break;
+
case MSR_IA32_BNDCFGS:
__vmwrite(GUEST_BNDCFGS, val);
break;
diff --git a/xen/arch/x86/include/asm/hvm/vmx/vmcs.h b/xen/arch/x86/include/asm/hvm/vmx/vmcs.h
index a7dd2eeffcad..58140af69153 100644
--- a/xen/arch/x86/include/asm/hvm/vmx/vmcs.h
+++ b/xen/arch/x86/include/asm/hvm/vmx/vmcs.h
@@ -270,6 +270,9 @@ extern u32 vmx_secondary_exec_control;
#define TERTIARY_EXEC_VIRT_SPEC_CTRL BIT(7, UL)
extern uint64_t vmx_tertiary_exec_control;
+#define cpu_has_vmx_virt_spec_ctrl \
+ (vmx_tertiary_exec_control & TERTIARY_EXEC_VIRT_SPEC_CTRL)
+
#define VMX_EPT_EXEC_ONLY_SUPPORTED 0x00000001
#define VMX_EPT_WALK_LENGTH_4_SUPPORTED 0x00000040
#define VMX_EPT_MEMORY_TYPE_UC 0x00000100
@@ -436,6 +439,8 @@ enum vmcs_field {
XSS_EXIT_BITMAP = 0x0000202c,
TSC_MULTIPLIER = 0x00002032,
TERTIARY_VM_EXEC_CONTROL = 0x00002034,
+ SPEC_CTRL_MASK = 0x0000204a,
+ SPEC_CTRL_SHADOW = 0x0000204c,
GUEST_PHYSICAL_ADDRESS = 0x00002400,
VMCS_LINK_POINTER = 0x00002800,
GUEST_IA32_DEBUGCTL = 0x00002802,
diff --git a/xen/arch/x86/include/asm/msr.h b/xen/arch/x86/include/asm/msr.h
index 1d8ea9f26faa..eed7b36cd992 100644
--- a/xen/arch/x86/include/asm/msr.h
+++ b/xen/arch/x86/include/asm/msr.h
@@ -302,8 +302,13 @@ struct vcpu_msrs
* For PV guests, this holds the guest kernel value. It is accessed on
* every entry/exit path.
*
- * For VT-x guests, the guest value is held in the MSR guest load/save
- * list.
+ * For VT-x guests, the guest value is held in the MSR guest load/save list
+ * if there's no support for virtualized SPEC_CTRL. If virtualized
+ * SPEC_CTRL is enabled the value here signals which bits in SPEC_CTRL the
+ * guest is not able to modify. Note that the value for those bits used in
+ * Xen context is also used in the guest context. Setting a bit here
+ * doesn't force that bit to be set in the guest context unless it is also
+ * set in Xen's selection of SPEC_CTRL.
*
* For SVM, the guest value lives in the VMCB, and hardware saves/restores
* the host value automatically. However, guests run with the OR of the
--
2.43.0
On 06.02.2024 15:25, Roger Pau Monne wrote: > @@ -2086,6 +2091,9 @@ void vmcs_dump_vcpu(struct vcpu *v) > if ( v->arch.hvm.vmx.secondary_exec_control & > SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY ) > printk("InterruptStatus = %04x\n", vmr16(GUEST_INTR_STATUS)); > + if ( cpu_has_vmx_virt_spec_ctrl ) > + printk("SPEC_CTRL mask = %#016lx shadow = %#016lx\n", > + vmr(SPEC_CTRL_MASK), vmr(SPEC_CTRL_SHADOW)); #0... doesn't make a lot of sense; only e.g. %#lx does. Seeing context there's no 0x prefix there anyway. Having looked at the function the other day, I know though that there's a fair mix of 0x-prefixed and unprefixed hex numbers that are output. Personally I'd prefer if all 0x prefixes were omitted here. If you and Andrew think otherwise, I can live with that, so long as we're at least striving towards consistent output (I may be able to get to doing a conversion patch, once I know which way the conversion should be). > --- a/xen/arch/x86/hvm/vmx/vmx.c > +++ b/xen/arch/x86/hvm/vmx/vmx.c > @@ -823,18 +823,28 @@ static void cf_check vmx_cpuid_policy_changed(struct vcpu *v) > { > vmx_clear_msr_intercept(v, MSR_SPEC_CTRL, VMX_MSR_RW); > > - rc = vmx_add_guest_msr(v, MSR_SPEC_CTRL, 0); > - if ( rc ) > - goto out; > + if ( !cpu_has_vmx_virt_spec_ctrl ) > + { > + rc = vmx_add_guest_msr(v, MSR_SPEC_CTRL, 0); > + if ( rc ) > + goto out; > + } I'm certainly okay with you doing it this way, but generally I'd prefer if code churn was limited whjere possible. Here leveraging that rc is 0 on entry, a smaller change would be to if ( !cpu_has_vmx_virt_spec_ctrl ) rc = vmx_add_guest_msr(v, MSR_SPEC_CTRL, 0); if ( rc ) goto out; (similarly below then). > else > { > vmx_set_msr_intercept(v, MSR_SPEC_CTRL, VMX_MSR_RW); > > - rc = vmx_del_msr(v, MSR_SPEC_CTRL, VMX_MSR_GUEST); > - if ( rc && rc != -ESRCH ) > - goto out; > - rc = 0; /* Tolerate -ESRCH */ > + /* > + * NB: there's no need to clear the virtualize SPEC_CTRL control, as > + * the MSR intercept takes precedence. 
> + */ The two VMCS values are, aiui, unused during guest entry/exit. Maybe worth mentioning here as well, as that not being the case would also raise correctness questions? > --- a/xen/arch/x86/include/asm/msr.h > +++ b/xen/arch/x86/include/asm/msr.h > @@ -302,8 +302,13 @@ struct vcpu_msrs > * For PV guests, this holds the guest kernel value. It is accessed on > * every entry/exit path. > * > - * For VT-x guests, the guest value is held in the MSR guest load/save > - * list. > + * For VT-x guests, the guest value is held in the MSR guest load/save list > + * if there's no support for virtualized SPEC_CTRL. If virtualized > + * SPEC_CTRL is enabled the value here signals which bits in SPEC_CTRL the > + * guest is not able to modify. Note that the value for those bits used in > + * Xen context is also used in the guest context. Setting a bit here > + * doesn't force such bit to set in the guest context unless also set in > + * Xen selection of SPEC_CTRL. Hmm, this mask value is unlikely to be in need of being vCPU-specific. I'd not even expect it to be per-domain, but simply global. I also can't spot where you set that field; do we really mean to give guests full control now that we have it (rather than e.g. running in IBRS-always-on mode at least under certain conditions)? If intended to be like this for now, this (to me at least) surprising aspect could likely do with mentioning in the description. Jan
On Thu, Feb 08, 2024 at 02:40:53PM +0100, Jan Beulich wrote: > On 06.02.2024 15:25, Roger Pau Monne wrote: > > @@ -2086,6 +2091,9 @@ void vmcs_dump_vcpu(struct vcpu *v) > > if ( v->arch.hvm.vmx.secondary_exec_control & > > SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY ) > > printk("InterruptStatus = %04x\n", vmr16(GUEST_INTR_STATUS)); > > + if ( cpu_has_vmx_virt_spec_ctrl ) > > + printk("SPEC_CTRL mask = %#016lx shadow = %#016lx\n", > > + vmr(SPEC_CTRL_MASK), vmr(SPEC_CTRL_SHADOW)); > > #0... doesn't make a lot of sense; only e.g. %#lx does. Seeing context > there's no 0x prefix there anyway. Having looked at the function the > other day, I know though that there's a fair mix of 0x-prefixed and > unprefixed hex numbers that are output. For consistency with how other MSRs are printed I should use the '0x' prefix. > Personally I'd prefer if all > 0x prefixes were omitted here. If you and Andrew think otherwise, I can > live with that, so long as we're at least striving towards consistent > output (I may be able to get to doing a conversion patch, once I know > which way the conversion should be). I usually prefer the '0x' to avoid ambiguity. However this being all hardware registers, I might be fine with dropping the '0x' on the grounds that all registers are always printed as hex. > > --- a/xen/arch/x86/hvm/vmx/vmx.c > > +++ b/xen/arch/x86/hvm/vmx/vmx.c > > @@ -823,18 +823,28 @@ static void cf_check vmx_cpuid_policy_changed(struct vcpu *v) > > { > > vmx_clear_msr_intercept(v, MSR_SPEC_CTRL, VMX_MSR_RW); > > > > - rc = vmx_add_guest_msr(v, MSR_SPEC_CTRL, 0); > > - if ( rc ) > > - goto out; > > + if ( !cpu_has_vmx_virt_spec_ctrl ) > > + { > > + rc = vmx_add_guest_msr(v, MSR_SPEC_CTRL, 0); > > + if ( rc ) > > + goto out; > > + } > > I'm certainly okay with you doing it this way, but generally I'd prefer > if code churn was limited whjere possible. 
Here leveraging that rc is 0 > on entry, a smaller change would be to > > if ( !cpu_has_vmx_virt_spec_ctrl ) > rc = vmx_add_guest_msr(v, MSR_SPEC_CTRL, 0); > if ( rc ) > goto out; > > (similarly below then). That looks odd to me, and is not how I would write that code. I can however adjust if you insist. Given it's just a two line difference I think it was worth having the more usual form. > > else > > { > > vmx_set_msr_intercept(v, MSR_SPEC_CTRL, VMX_MSR_RW); > > > > - rc = vmx_del_msr(v, MSR_SPEC_CTRL, VMX_MSR_GUEST); > > - if ( rc && rc != -ESRCH ) > > - goto out; > > - rc = 0; /* Tolerate -ESRCH */ > > + /* > > + * NB: there's no need to clear the virtualize SPEC_CTRL control, as > > + * the MSR intercept takes precedence. > > + */ > > The two VMCS values are, aiui, unused during guest entry/exit. Maybe > worth mentioning here as well, as that not being the case would also > raise correctness questions? Hm, yes indeed, I've double checked and the value is not loaded, so will expand the message. > > --- a/xen/arch/x86/include/asm/msr.h > > +++ b/xen/arch/x86/include/asm/msr.h > > @@ -302,8 +302,13 @@ struct vcpu_msrs > > * For PV guests, this holds the guest kernel value. It is accessed on > > * every entry/exit path. > > * > > - * For VT-x guests, the guest value is held in the MSR guest load/save > > - * list. > > + * For VT-x guests, the guest value is held in the MSR guest load/save list > > + * if there's no support for virtualized SPEC_CTRL. If virtualized > > + * SPEC_CTRL is enabled the value here signals which bits in SPEC_CTRL the > > + * guest is not able to modify. Note that the value for those bits used in > > + * Xen context is also used in the guest context. Setting a bit here > > + * doesn't force such bit to set in the guest context unless also set in > > + * Xen selection of SPEC_CTRL. > > Hmm, this mask value is unlikely to be in need of being vCPU-specific. > I'd not even expect it to be per-domain, but simply global. 
This is mostly to keep the logic in-sync with the one used on AMD. > I also can't spot where you set that field; do we really mean to give > guests full control now that we have it (rather than e.g. running in > IBRS-always-on mode at least under certain conditions)? If intended to > be like this for now, this (to me at least) surprising aspect could > likely do with mentioning in the description. Yes, so far I didn't set any bit behind the guest's back; that should be done in a separate patch. Thanks, Roger.
On 09.02.2024 11:45, Roger Pau Monné wrote: > On Thu, Feb 08, 2024 at 02:40:53PM +0100, Jan Beulich wrote: >> On 06.02.2024 15:25, Roger Pau Monne wrote: >>> @@ -2086,6 +2091,9 @@ void vmcs_dump_vcpu(struct vcpu *v) >>> if ( v->arch.hvm.vmx.secondary_exec_control & >>> SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY ) >>> printk("InterruptStatus = %04x\n", vmr16(GUEST_INTR_STATUS)); >>> + if ( cpu_has_vmx_virt_spec_ctrl ) >>> + printk("SPEC_CTRL mask = %#016lx shadow = %#016lx\n", >>> + vmr(SPEC_CTRL_MASK), vmr(SPEC_CTRL_SHADOW)); >> >> #0... doesn't make a lot of sense; only e.g. %#lx does. Seeing context >> there's no 0x prefix there anyway. Having looked at the function the >> other day, I know though that there's a fair mix of 0x-prefixed and >> unprefixed hex numbers that are output. > > For consistency with how other MSRs are printed I should use the '0x' > prefix. MSRs? It's VMCS fields which are printed here. >> Personally I'd prefer if all >> 0x prefixes were omitted here. If you and Andrew think otherwise, I can >> live with that, so long as we're at least striving towards consistent >> output (I may be able to get to doing a conversion patch, once I know >> which way the conversion should be). > > I usually prefer the '0x' to avoid ambiguity. However this being all > hardware registers, I might be fine with dropping the '0x' on the > grounds that all registers are always printed as hex. > >>> --- a/xen/arch/x86/hvm/vmx/vmx.c >>> +++ b/xen/arch/x86/hvm/vmx/vmx.c >>> @@ -823,18 +823,28 @@ static void cf_check vmx_cpuid_policy_changed(struct vcpu *v) >>> { >>> vmx_clear_msr_intercept(v, MSR_SPEC_CTRL, VMX_MSR_RW); >>> >>> - rc = vmx_add_guest_msr(v, MSR_SPEC_CTRL, 0); >>> - if ( rc ) >>> - goto out; >>> + if ( !cpu_has_vmx_virt_spec_ctrl ) >>> + { >>> + rc = vmx_add_guest_msr(v, MSR_SPEC_CTRL, 0); >>> + if ( rc ) >>> + goto out; >>> + } >> >> I'm certainly okay with you doing it this way, but generally I'd prefer >> if code churn was limited whjere possible. 
Here leveraging that rc is 0 >> on entry, a smaller change would be to >> >> if ( !cpu_has_vmx_virt_spec_ctrl ) >> rc = vmx_add_guest_msr(v, MSR_SPEC_CTRL, 0); >> if ( rc ) >> goto out; >> >> (similarly below then). > > That looks odd to me, and is not how I would write that code. I can > however adjust if you insist. Given it's just a two line difference I > think it was worth having the more usual form. As said, I'm okay with the change as is. Jan
On Fri, Feb 09, 2024 at 12:51:41PM +0100, Jan Beulich wrote: > On 09.02.2024 11:45, Roger Pau Monné wrote: > > On Thu, Feb 08, 2024 at 02:40:53PM +0100, Jan Beulich wrote: > >> On 06.02.2024 15:25, Roger Pau Monne wrote: > >>> @@ -2086,6 +2091,9 @@ void vmcs_dump_vcpu(struct vcpu *v) > >>> if ( v->arch.hvm.vmx.secondary_exec_control & > >>> SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY ) > >>> printk("InterruptStatus = %04x\n", vmr16(GUEST_INTR_STATUS)); > >>> + if ( cpu_has_vmx_virt_spec_ctrl ) > >>> + printk("SPEC_CTRL mask = %#016lx shadow = %#016lx\n", > >>> + vmr(SPEC_CTRL_MASK), vmr(SPEC_CTRL_SHADOW)); > >> > >> #0... doesn't make a lot of sense; only e.g. %#lx does. Seeing context > >> there's no 0x prefix there anyway. Having looked at the function the > >> other day, I know though that there's a fair mix of 0x-prefixed and > >> unprefixed hex numbers that are output. > > > > For consistency with how other MSRs are printed I should use the '0x' > > prefix. > > MSRs? It's VMCS fields which are printed here. Well, yes, but it represents a MSR value. Thanks, Roger.
© 2016 - 2024 Red Hat, Inc.