:p
atchew
Login
This series lays the groundwork for revamp of the AMD nested virt functionality. The first five patches are clean-ups or reorganizations of existing code. The final patch is the first major step towards making the feature supportable: allowing Xen to refuse nested virt support if certain hardware features are not present. George Dunlap (6): xen/hvm: Convert hap_capabilities into a bitfield svm: Improve type of cpu_has_svm_feature xen/hvm: Move other hvm_function_table booleans into the caps bitfield nestedsvm: Disable TscRateMSR nestedsvm: Remove bogus debug message from nestedsvm_check_intercepts svm/nestedvm: Introduce nested capabilities bit docs/designs/nested-svm-cpu-features.md | 110 +++++++++++++++++++ xen/arch/x86/cpu-policy.c | 3 +- xen/arch/x86/domain.c | 6 + xen/arch/x86/hvm/hvm.c | 14 +-- xen/arch/x86/hvm/svm/nestedhvm.h | 1 + xen/arch/x86/hvm/svm/nestedsvm.c | 22 +++- xen/arch/x86/hvm/svm/svm.c | 65 +---------- xen/arch/x86/hvm/vlapic.c | 4 +- xen/arch/x86/hvm/vmx/vmcs.c | 6 +- xen/arch/x86/hvm/vmx/vmx.c | 19 ++-- xen/arch/x86/include/asm/hvm/hvm.h | 47 ++++---- xen/arch/x86/include/asm/hvm/svm/nestedsvm.h | 5 - xen/arch/x86/include/asm/hvm/svm/svm.h | 5 +- 13 files changed, 191 insertions(+), 116 deletions(-) create mode 100644 docs/designs/nested-svm-cpu-features.md -- 2.25.1
hvm_function_table is an internal structure; rather than manually |-ing and &-ing bits, just make it a boolean bitfield and let the compiler do all the work. This makes everything easier to read, and presumably allows the compiler more flexibility in producing efficient code. No functional change intended. Signed-off-by: George Dunlap <george.dunlap@cloud.com> --- Questions: * Should hap_superpage_2m really be set unconditionally, or should we condition it on cpu_has_svm_npt? * Do we really need to "!!cpu_has_svm_npt"? If so, wouldn't it be better to put the "!!" in the #define, rather than requiring the user to know that it's needed? CC: Jan Beulich <jbeulich@suse.com> CC: Andrew Cooper <andrew.cooper3@citrix.com> CC: "Roger Pau Monné" <roger.pau@citrix.com> CC: Wei Liu <wl@xen.org> CC: Jun Nakajima <jun.nakajima@intel.com> CC: Kevin Tian <kevin.tian@intel.com> --- xen/arch/x86/hvm/hvm.c | 8 ++++---- xen/arch/x86/hvm/svm/svm.c | 4 ++-- xen/arch/x86/hvm/vmx/vmcs.c | 4 ++-- xen/arch/x86/hvm/vmx/vmx.c | 8 ++------ xen/arch/x86/include/asm/hvm/hvm.h | 19 +++++++------------ 5 files changed, 17 insertions(+), 26 deletions(-) diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c index XXXXXXX..XXXXXXX 100644 --- a/xen/arch/x86/hvm/hvm.c +++ b/xen/arch/x86/hvm/hvm.c @@ -XXX,XX +XXX,XX @@ static int __init cf_check hvm_enable(void) { printk("HVM: Hardware Assisted Paging (HAP) detected\n"); printk("HVM: HAP page sizes: 4kB"); - if ( fns->hap_capabilities & HVM_HAP_SUPERPAGE_2MB ) + if ( fns->caps.hap_superpage_2mb ) { printk(", 2MB%s", opt_hap_2mb ? "" : " [disabled]"); if ( !opt_hap_2mb ) - hvm_funcs.hap_capabilities &= ~HVM_HAP_SUPERPAGE_2MB; + hvm_funcs.caps.hap_superpage_2mb = false; } - if ( fns->hap_capabilities & HVM_HAP_SUPERPAGE_1GB ) + if ( fns->caps.hap_superpage_1gb ) { printk(", 1GB%s", opt_hap_1gb ? "" : " [disabled]"); if ( !opt_hap_1gb ) - hvm_funcs.hap_capabilities &= ~HVM_HAP_SUPERPAGE_1GB; + hvm_funcs.caps.hap_superpage_1gb = false; } printk("\n"); } diff --git a/xen/arch/x86/hvm/svm/svm.c b/xen/arch/x86/hvm/svm/svm.c index XXXXXXX..XXXXXXX 100644 --- a/xen/arch/x86/hvm/svm/svm.c +++ b/xen/arch/x86/hvm/svm/svm.c @@ -XXX,XX +XXX,XX @@ const struct hvm_function_table * __init start_svm(void) printk(" - none\n"); svm_function_table.hap_supported = !!cpu_has_svm_npt; - svm_function_table.hap_capabilities = HVM_HAP_SUPERPAGE_2MB | - (cpu_has_page1gb ? HVM_HAP_SUPERPAGE_1GB : 0); + svm_function_table.caps.hap_superpage_2mb = true; + svm_function_table.caps.hap_superpage_1gb = cpu_has_page1gb; return &svm_function_table; } diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c index XXXXXXX..XXXXXXX 100644 --- a/xen/arch/x86/hvm/vmx/vmcs.c +++ b/xen/arch/x86/hvm/vmx/vmcs.c @@ -XXX,XX +XXX,XX @@ static int cf_check parse_ept_param_runtime(const char *s) int val; if ( !cpu_has_vmx_ept || !hvm_funcs.hap_supported || - !(hvm_funcs.hap_capabilities & - (HVM_HAP_SUPERPAGE_2MB | HVM_HAP_SUPERPAGE_1GB)) ) + !(hvm_funcs.caps.hap_superpage_2mb || + hvm_funcs.caps.hap_superpage_1gb) ) { printk("VMX: EPT not available, or not in use - ignoring\n"); return 0; diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c index XXXXXXX..XXXXXXX 100644 --- a/xen/arch/x86/hvm/vmx/vmx.c +++ b/xen/arch/x86/hvm/vmx/vmx.c @@ -XXX,XX +XXX,XX @@ const struct hvm_function_table * __init start_vmx(void) vmx_function_table.hap_supported = 1; vmx_function_table.altp2m_supported = 1; - vmx_function_table.hap_capabilities = 0; - - if ( cpu_has_vmx_ept_2mb ) - vmx_function_table.hap_capabilities |= HVM_HAP_SUPERPAGE_2MB; - if ( cpu_has_vmx_ept_1gb ) - vmx_function_table.hap_capabilities |= HVM_HAP_SUPERPAGE_1GB; + vmx_function_table.caps.hap_superpage_2mb = cpu_has_vmx_ept_2mb; + vmx_function_table.caps.hap_superpage_1gb = cpu_has_vmx_ept_1gb; setup_ept_dump(); } diff --git a/xen/arch/x86/include/asm/hvm/hvm.h b/xen/arch/x86/include/asm/hvm/hvm.h index XXXXXXX..XXXXXXX 100644 --- a/xen/arch/x86/include/asm/hvm/hvm.h +++ b/xen/arch/x86/include/asm/hvm/hvm.h @@ -XXX,XX +XXX,XX @@ enum hvm_intblk { #define HVM_INTR_SHADOW_SMI 0x00000004 #define HVM_INTR_SHADOW_NMI 0x00000008 -/* - * HAP super page capabilities: - * bit0: if 2MB super page is allowed? - * bit1: if 1GB super page is allowed? - */ -#define HVM_HAP_SUPERPAGE_2MB 0x00000001 -#define HVM_HAP_SUPERPAGE_1GB 0x00000002 - #define HVM_EVENT_VECTOR_UNSET (-1) #define HVM_EVENT_VECTOR_UPDATING (-2) @@ -XXX,XX +XXX,XX @@ struct hvm_function_table { /* Hardware virtual interrupt delivery enable? */ bool virtual_intr_delivery_enabled; - /* Indicate HAP capabilities. */ - unsigned int hap_capabilities; + struct { + /* Indicate HAP capabilities. */ + bool hap_superpage_1gb:1, + hap_superpage_2mb:1; + } caps; /* * Initialise/destroy HVM domain/vcpu resources @@ -XXX,XX +XXX,XX @@ int hvm_get_param(struct domain *d, uint32_t index, uint64_t *value); (hvm_paging_enabled(v) && ((v)->arch.hvm.guest_cr[4] & X86_CR4_PKS)) /* Can we use superpages in the HAP p2m table? */ -#define hap_has_1gb (!!(hvm_funcs.hap_capabilities & HVM_HAP_SUPERPAGE_1GB)) -#define hap_has_2mb (!!(hvm_funcs.hap_capabilities & HVM_HAP_SUPERPAGE_2MB)) +#define hap_has_1gb hvm_funcs.caps.hap_superpage_1gb +#define hap_has_2mb hvm_funcs.caps.hap_superpage_2mb #define hvm_long_mode_active(v) (!!((v)->arch.hvm.guest_efer & EFER_LMA)) -- 2.25.1
The "effective type" of the cpu_has_svm_feature macro is effectively an unsigned log with one bit set (or not); at least one place someone felt compelled to do a !! to make sure that they got a boolean out of it. Ideally the whole of this would be folded into the cpufeature.h infrastructure. But for now, duplicate the more type-safe static inlines in that file, and remove the !!. Signed-off-by: George Dunlap <george.dunlap@cloud.com> --- CC: Jan Beulich <jbeulich@suse.com> CC: Andrew Cooper <andrew.cooper3@citrix.com> CC: "Roger Pau Monné" <roger.pau@citrix.com> CC: Wei Liu <wl@xen.org> --- xen/arch/x86/hvm/svm/svm.c | 2 +- xen/arch/x86/include/asm/hvm/svm/svm.h | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/xen/arch/x86/hvm/svm/svm.c b/xen/arch/x86/hvm/svm/svm.c index XXXXXXX..XXXXXXX 100644 --- a/xen/arch/x86/hvm/svm/svm.c +++ b/xen/arch/x86/hvm/svm/svm.c @@ -XXX,XX +XXX,XX @@ const struct hvm_function_table * __init start_svm(void) if ( !printed ) printk(" - none\n"); - svm_function_table.hap_supported = !!cpu_has_svm_npt; + svm_function_table.hap_supported = cpu_has_svm_npt; svm_function_table.caps.hap_superpage_2mb = true; svm_function_table.caps.hap_superpage_1gb = cpu_has_page1gb; diff --git a/xen/arch/x86/include/asm/hvm/svm/svm.h b/xen/arch/x86/include/asm/hvm/svm/svm.h index XXXXXXX..XXXXXXX 100644 --- a/xen/arch/x86/include/asm/hvm/svm/svm.h +++ b/xen/arch/x86/include/asm/hvm/svm/svm.h @@ -XXX,XX +XXX,XX @@ extern u32 svm_feature_flags; #define SVM_FEATURE_SSS 19 /* NPT Supervisor Shadow Stacks */ #define SVM_FEATURE_SPEC_CTRL 20 /* MSR_SPEC_CTRL virtualisation */ -#define cpu_has_svm_feature(f) (svm_feature_flags & (1u << (f))) +static inline bool cpu_has_svm_feature(unsigned int feat) +{ + return svm_feature_flags & (1u << (feat)); +} #define cpu_has_svm_npt cpu_has_svm_feature(SVM_FEATURE_NPT) #define cpu_has_svm_lbrv cpu_has_svm_feature(SVM_FEATURE_LBRV) #define cpu_has_svm_svml cpu_has_svm_feature(SVM_FEATURE_SVML) -- 2.25.1
Moving them all together has several advantages: * Collects them all in one part of the struct * The `caps` field means that we can drop the "_supported" suffix, as it's clear what is meant. Signed-off-by: George Dunlap <george.dunlap@cloud.com> --- CC: Jan Beulich <jbeulich@suse.com> CC: Andrew Cooper <andrew.cooper3@citrix.com> CC: "Roger Pau Monné" <roger.pau@citrix.com> CC: Wei Liu <wl@xen.org> CC: Jun Nakajima <jun.nakajima@intel.com> CC: Kevin Tian <kevin.tian@intel.com> --- xen/arch/x86/hvm/hvm.c | 6 +++--- xen/arch/x86/hvm/svm/svm.c | 2 +- xen/arch/x86/hvm/vlapic.c | 4 ++-- xen/arch/x86/hvm/vmx/vmcs.c | 2 +- xen/arch/x86/hvm/vmx/vmx.c | 8 ++++---- xen/arch/x86/include/asm/hvm/hvm.h | 29 ++++++++++++++--------------- 6 files changed, 25 insertions(+), 26 deletions(-) diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c index XXXXXXX..XXXXXXX 100644 --- a/xen/arch/x86/hvm/hvm.c +++ b/xen/arch/x86/hvm/hvm.c @@ -XXX,XX +XXX,XX @@ static struct notifier_block cpu_nfb = { static bool __init hap_supported(struct hvm_function_table *fns) { - if ( !fns->hap_supported ) + if ( !fns->caps.hap ) { printk("HVM: Hardware Assisted Paging (HAP) not detected\n"); return false; @@ -XXX,XX +XXX,XX @@ static bool __init hap_supported(struct hvm_function_table *fns) if ( !opt_hap_enabled ) { - fns->hap_supported = 0; + fns->caps.hap = 0; printk("HVM: Hardware Assisted Paging (HAP) detected but disabled\n"); return false; } @@ -XXX,XX +XXX,XX @@ static int __init cf_check hvm_enable(void) } if ( !opt_altp2m_enabled ) - hvm_funcs.altp2m_supported = 0; + hvm_funcs.caps.altp2m = 0; if ( opt_hvm_fep ) warning_add(warning_hvm_fep); diff --git a/xen/arch/x86/hvm/svm/svm.c b/xen/arch/x86/hvm/svm/svm.c index XXXXXXX..XXXXXXX 100644 --- a/xen/arch/x86/hvm/svm/svm.c +++ b/xen/arch/x86/hvm/svm/svm.c @@ -XXX,XX +XXX,XX @@ const struct hvm_function_table * __init start_svm(void) if ( !printed ) printk(" - none\n"); - svm_function_table.hap_supported = cpu_has_svm_npt; + svm_function_table.caps.hap = cpu_has_svm_npt; svm_function_table.caps.hap_superpage_2mb = true; svm_function_table.caps.hap_superpage_1gb = cpu_has_page1gb; diff --git a/xen/arch/x86/hvm/vlapic.c b/xen/arch/x86/hvm/vlapic.c index XXXXXXX..XXXXXXX 100644 --- a/xen/arch/x86/hvm/vlapic.c +++ b/xen/arch/x86/hvm/vlapic.c @@ -XXX,XX +XXX,XX @@ int vlapic_has_pending_irq(struct vcpu *v) if ( irr == -1 ) return -1; - if ( hvm_funcs.virtual_intr_delivery_enabled && + if ( hvm_funcs.caps.virtual_intr_delivery && !nestedhvm_vcpu_in_guestmode(v) ) return irr; @@ -XXX,XX +XXX,XX @@ int vlapic_ack_pending_irq(struct vcpu *v, int vector, bool force_ack) int isr; if ( !force_ack && - hvm_funcs.virtual_intr_delivery_enabled ) + hvm_funcs.caps.virtual_intr_delivery ) return 1; /* If there's no chance of using APIC assist then bail now. */ diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c index XXXXXXX..XXXXXXX 100644 --- a/xen/arch/x86/hvm/vmx/vmcs.c +++ b/xen/arch/x86/hvm/vmx/vmcs.c @@ -XXX,XX +XXX,XX @@ static int cf_check parse_ept_param_runtime(const char *s) struct domain *d; int val; - if ( !cpu_has_vmx_ept || !hvm_funcs.hap_supported || + if ( !cpu_has_vmx_ept || !hvm_funcs.caps.hap || !(hvm_funcs.caps.hap_superpage_2mb || hvm_funcs.caps.hap_superpage_1gb) ) { diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c index XXXXXXX..XXXXXXX 100644 --- a/xen/arch/x86/hvm/vmx/vmx.c +++ b/xen/arch/x86/hvm/vmx/vmx.c @@ -XXX,XX +XXX,XX @@ const struct hvm_function_table * __init start_vmx(void) return NULL; } - vmx_function_table.singlestep_supported = cpu_has_monitor_trap_flag; + vmx_function_table.caps.singlestep = cpu_has_monitor_trap_flag; if ( cpu_has_vmx_dt_exiting ) vmx_function_table.set_descriptor_access_exiting = @@ -XXX,XX +XXX,XX @@ const struct hvm_function_table * __init start_vmx(void) printk("VMX: Disabling executable EPT superpages due to CVE-2018-12207\n"); } - vmx_function_table.hap_supported = 1; - vmx_function_table.altp2m_supported = 1; + vmx_function_table.caps.hap = 1; + vmx_function_table.caps.altp2m = 1; vmx_function_table.caps.hap_superpage_2mb = cpu_has_vmx_ept_2mb; vmx_function_table.caps.hap_superpage_1gb = cpu_has_vmx_ept_1gb; @@ -XXX,XX +XXX,XX @@ const struct hvm_function_table * __init start_vmx(void) vmx_function_table.update_eoi_exit_bitmap = vmx_update_eoi_exit_bitmap; vmx_function_table.process_isr = vmx_process_isr; vmx_function_table.handle_eoi = vmx_handle_eoi; - vmx_function_table.virtual_intr_delivery_enabled = true; + vmx_function_table.caps.virtual_intr_delivery = true; } if ( cpu_has_vmx_posted_intr_processing ) diff --git a/xen/arch/x86/include/asm/hvm/hvm.h b/xen/arch/x86/include/asm/hvm/hvm.h index XXXXXXX..XXXXXXX 100644 --- a/xen/arch/x86/include/asm/hvm/hvm.h +++ b/xen/arch/x86/include/asm/hvm/hvm.h @@ -XXX,XX +XXX,XX @@ struct hvm_vcpu_nonreg_state { struct hvm_function_table { const char *name; - /* Support Hardware-Assisted Paging? */ - bool hap_supported; - - /* Necessary hardware support for alternate p2m's? */ - bool altp2m_supported; - bool singlestep_supported; - - /* Hardware virtual interrupt delivery enable? */ - bool virtual_intr_delivery_enabled; - struct { /* Indicate HAP capabilities. */ - bool hap_superpage_1gb:1, - hap_superpage_2mb:1; + bool hap:1, + hap_superpage_1gb:1, + hap_superpage_2mb:1, + + /* Altp2m capabilities */ + altp2m:1, + singlestep:1, + + /* Hardware virtual interrupt delivery enable? */ + virtual_intr_delivery; + } caps; /* @@ -XXX,XX +XXX,XX @@ static inline void hvm_enable_msr_interception(struct domain *d, uint32_t msr) static inline bool hvm_is_singlestep_supported(void) { - return hvm_funcs.singlestep_supported; + return hvm_funcs.caps.singlestep; } static inline bool hvm_hap_supported(void) { - return hvm_funcs.hap_supported; + return hvm_funcs.caps.hap; } /* returns true if hardware supports alternate p2m's */ static inline bool hvm_altp2m_supported(void) { - return hvm_funcs.altp2m_supported; + return hvm_funcs.caps.altp2m; } /* updates the current hardware p2m */ -- 2.25.1
The primary purpose of TSC scaling, from our perspective, is to maintain the fiction of an "invariant TSC" across migrates between platforms with different clock speeds. On AMD, the TscRateMSR CPUID bit is unconditionally enabled in the "host cpuid", even if the hardware doesn't actually support it. According to c/s fd14a1943c4 ("nestedsvm: Support TSC Rate MSR"), testing showed that emulating TSC scaling in an L1 was more expensive than emulating TSC scaling on an L0 (due to extra sets of vmexit / vmenter). However, the current implementation seems to be broken. First of all, the final L2 scaling ratio should be a composition of the L0 scaling ratio and the L1 scaling ratio; there's no indication this is being done anywhere. Secondly, it's not clear that the L1 tsc scaling ratio actually affects the L0 tsc scaling ratio. The stored value (ns_tscratio) is used to affect the tsc *offset*, but doesn't seem to actually be factored into d->hvm.tsc_scaling_ratio. (Which shouldn't be per-domain anyway, but per-vcpu.) Having the *offset* scaled according to the nested scaling without the actual RDTSC itself also being scaled has got to produce inconsistent results. For now, just disable the functionality entirely until we can implement it properly: - Don't set TSCRATEMSR in the host CPUID policy - Remove MSR_AMD64_TSC_RATIO emulation handling, so that the guest guests a #GP if it tries to access them (as it should when TSCRATEMSR is clear) - Remove ns_tscratio from struct nestedhvm, and all code that touches it Unfortunately this means ripping out the scaling calculation stuff as well, since it's only used in the nested case; it's there in the git tree if we need it for reference when we re-introduce it. Signed-off-by: George Dunlap <george.dunlap@cloud.com> --- CC: Jan Beulich <jbeulich@suse.com> CC: Andrew Cooper <andrew.cooper3@citrix.com> CC: "Roger Pau Monné" <roger.pau@citrix.com> CC: Wei Liu <wl@xen.org> --- xen/arch/x86/cpu-policy.c | 3 +- xen/arch/x86/hvm/svm/nestedsvm.c | 2 - xen/arch/x86/hvm/svm/svm.c | 57 -------------------- xen/arch/x86/include/asm/hvm/svm/nestedsvm.h | 5 -- 4 files changed, 1 insertion(+), 66 deletions(-) diff --git a/xen/arch/x86/cpu-policy.c b/xen/arch/x86/cpu-policy.c index XXXXXXX..XXXXXXX 100644 --- a/xen/arch/x86/cpu-policy.c +++ b/xen/arch/x86/cpu-policy.c @@ -XXX,XX +XXX,XX @@ static void __init calculate_host_policy(void) (1u << SVM_FEATURE_PAUSEFILTER) | (1u << SVM_FEATURE_DECODEASSISTS)); /* Enable features which are always emulated. */ - p->extd.raw[0xa].d |= ((1u << SVM_FEATURE_VMCBCLEAN) | - (1u << SVM_FEATURE_TSCRATEMSR)); + p->extd.raw[0xa].d |= (1u << SVM_FEATURE_VMCBCLEAN); } /* 0x000000ce MSR_INTEL_PLATFORM_INFO */ diff --git a/xen/arch/x86/hvm/svm/nestedsvm.c b/xen/arch/x86/hvm/svm/nestedsvm.c index XXXXXXX..XXXXXXX 100644 --- a/xen/arch/x86/hvm/svm/nestedsvm.c +++ b/xen/arch/x86/hvm/svm/nestedsvm.c @@ -XXX,XX +XXX,XX @@ int cf_check nsvm_vcpu_reset(struct vcpu *v) svm->ns_msr_hsavepa = INVALID_PADDR; svm->ns_ovvmcb_pa = INVALID_PADDR; - svm->ns_tscratio = DEFAULT_TSC_RATIO; - svm->ns_cr_intercepts = 0; svm->ns_dr_intercepts = 0; svm->ns_exception_intercepts = 0; diff --git a/xen/arch/x86/hvm/svm/svm.c b/xen/arch/x86/hvm/svm/svm.c index XXXXXXX..XXXXXXX 100644 --- a/xen/arch/x86/hvm/svm/svm.c +++ b/xen/arch/x86/hvm/svm/svm.c @@ -XXX,XX +XXX,XX @@ static int cf_check svm_get_guest_pat(struct vcpu *v, u64 *gpat) return 1; } -static uint64_t scale_tsc(uint64_t host_tsc, uint64_t ratio) -{ - uint64_t mult, frac, scaled_host_tsc; - - if ( ratio == DEFAULT_TSC_RATIO ) - return host_tsc; - - /* - * Suppose the most significant 32 bits of host_tsc and ratio are - * tsc_h and mult, and the least 32 bits of them are tsc_l and frac, - * then - * host_tsc * ratio * 2^-32 - * = host_tsc * (mult * 2^32 + frac) * 2^-32 - * = host_tsc * mult + (tsc_h * 2^32 + tsc_l) * frac * 2^-32 - * = host_tsc * mult + tsc_h * frac + ((tsc_l * frac) >> 32) - * - * Multiplications in the last two terms are between 32-bit integers, - * so both of them can fit in 64-bit integers. - * - * Because mult is usually less than 10 in practice, it's very rare - * that host_tsc * mult can overflow a 64-bit integer. - */ - mult = ratio >> 32; - frac = ratio & ((1ULL << 32) - 1); - scaled_host_tsc = host_tsc * mult; - scaled_host_tsc += (host_tsc >> 32) * frac; - scaled_host_tsc += ((host_tsc & ((1ULL << 32) - 1)) * frac) >> 32; - - return scaled_host_tsc; -} - -static uint64_t svm_get_tsc_offset(uint64_t host_tsc, uint64_t guest_tsc, - uint64_t ratio) -{ - return guest_tsc - scale_tsc(host_tsc, ratio); -} - static void cf_check svm_set_tsc_offset(struct vcpu *v, u64 offset, u64 at_tsc) { struct vmcb_struct *vmcb = v->arch.hvm.svm.vmcb; @@ -XXX,XX +XXX,XX @@ static void cf_check svm_set_tsc_offset(struct vcpu *v, u64 offset, u64 at_tsc) if ( nestedhvm_vcpu_in_guestmode(v) ) { - struct nestedsvm *svm = &vcpu_nestedsvm(v); - n2_tsc_offset = vmcb_get_tsc_offset(n2vmcb) - vmcb_get_tsc_offset(n1vmcb); - if ( svm->ns_tscratio != DEFAULT_TSC_RATIO ) - { - uint64_t guest_tsc = hvm_get_guest_tsc_fixed(v, at_tsc); - - n2_tsc_offset = svm_get_tsc_offset(guest_tsc, - guest_tsc + n2_tsc_offset, - svm->ns_tscratio); - } vmcb_set_tsc_offset(n1vmcb, offset); } @@ -XXX,XX +XXX,XX @@ static int cf_check svm_msr_read_intercept( *msr_content = nsvm->ns_msr_hsavepa; break; - case MSR_AMD64_TSC_RATIO: - *msr_content = nsvm->ns_tscratio; - break; - case MSR_AMD_OSVW_ID_LENGTH: case MSR_AMD_OSVW_STATUS: if ( !d->arch.cpuid->extd.osvw ) @@ -XXX,XX +XXX,XX @@ static int cf_check svm_msr_write_intercept( goto gpf; break; - case MSR_AMD64_TSC_RATIO: - if ( msr_content & TSC_RATIO_RSVD_BITS ) - goto gpf; - nsvm->ns_tscratio = msr_content; - break; - case MSR_IA32_MCx_MISC(4): /* Threshold register */ case MSR_F10_MC4_MISC1 ... MSR_F10_MC4_MISC3: /* diff --git a/xen/arch/x86/include/asm/hvm/svm/nestedsvm.h b/xen/arch/x86/include/asm/hvm/svm/nestedsvm.h index XXXXXXX..XXXXXXX 100644 --- a/xen/arch/x86/include/asm/hvm/svm/nestedsvm.h +++ b/xen/arch/x86/include/asm/hvm/svm/nestedsvm.h @@ -XXX,XX +XXX,XX @@ struct nestedsvm { */ uint64_t ns_ovvmcb_pa; - /* virtual tscratio holding the value l1 guest writes to the - * MSR_AMD64_TSC_RATIO MSR. - */ - uint64_t ns_tscratio; - /* Cached real intercepts of the l2 guest */ uint32_t ns_cr_intercepts; uint32_t ns_dr_intercepts; -- 2.25.1
Changeset ef3e8db8068 ("x86/hvm: Corrections and improvements to unhandled vmexit logging") introduced a printk to the default path of the switch statement in nestedsvm_check_intercepts(), complaining of an unknown exit reason. Unfortunately, the "core" switch statement which is meant to handle all vmexit reasons is in nsvm_vmcb_guest_intercepts_exitcode(); the switch statement in nestedsvm_check_intercepts() is only meant to superimpose on top of that some special-casing for how to interaction between L1 and L0 vmexits. Remove the printk, and add a comment to prevent future confusion. Signed-off-by: George Dunlap <george.dunlap@cloud.com> --- xen/arch/x86/hvm/svm/nestedsvm.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/xen/arch/x86/hvm/svm/nestedsvm.c b/xen/arch/x86/hvm/svm/nestedsvm.c index XXXXXXX..XXXXXXX 100644 --- a/xen/arch/x86/hvm/svm/nestedsvm.c +++ b/xen/arch/x86/hvm/svm/nestedsvm.c @@ -XXX,XX +XXX,XX @@ nestedsvm_check_intercepts(struct vcpu *v, struct cpu_user_regs *regs, ASSERT(vcpu_nestedhvm(v).nv_vmexit_pending == 0); is_intercepted = nsvm_vmcb_guest_intercepts_exitcode(v, regs, exitcode); + /* + * Handle specific interactions between things the guest and host + * may both want to intercept + */ switch ( exitcode ) { case VMEXIT_INVALID: @@ -XXX,XX +XXX,XX @@ nestedsvm_check_intercepts(struct vcpu *v, struct cpu_user_regs *regs, /* Always let the guest handle VMMCALL/VMCALL */ return NESTEDHVM_VMEXIT_INJECT; default: - gprintk(XENLOG_ERR, "Unexpected nested vmexit: reason %#"PRIx64"\n", - exitcode); break; } -- 2.25.1
In order to make implementation and testing tractable, we will require specific host functionality. Add a nested_virt bit to hvm_funcs.caps, and return an error if a domain is created with nested virt and this bit isn't set. For VMX, start with always enabling it if HAP is present; this shouldn't change current behvior. For SVM, require some basic functionality, adding a document explaining the rationale. NB that only SVM CPUID bits 0-7 have been considered. Bits 10-16 may be considered in a follow-up patch. Signed-off-by: George Dunlap <george.dunlap@cloud.com> --- CC: Andrew Cooper <andrew.cooper3@citrix.com> CC: George Dunlap <george.dunlap@citrix.com> CC: Jan Beulich <jbeulich@suse.com> CC: Julien Grall <julien@xen.org> CC: Stefano Stabellini <sstabellini@kernel.org> CC: Wei Liu <wl@xen.org> CC: "Roger Pau Monné" <roger.pau@citrix.com> CC: Jun Nakajima <jun.nakajima@intel.com> CC: Kevin Tian <kevin.tian@intel.com> --- docs/designs/nested-svm-cpu-features.md | 110 ++++++++++++++++++++++++ xen/arch/x86/domain.c | 6 ++ xen/arch/x86/hvm/svm/nestedhvm.h | 1 + xen/arch/x86/hvm/svm/nestedsvm.c | 14 +++ xen/arch/x86/hvm/svm/svm.c | 2 + xen/arch/x86/hvm/vmx/vmx.c | 3 + xen/arch/x86/include/asm/hvm/hvm.h | 11 ++- 7 files changed, 146 insertions(+), 1 deletion(-) diff --git a/docs/designs/nested-svm-cpu-features.md b/docs/designs/nested-svm-cpu-features.md new file mode 100644 index XXXXXXX..XXXXXXX --- /dev/null +++ b/docs/designs/nested-svm-cpu-features.md @@ -XXX,XX +XXX,XX @@ +# Nested SVM (AMD) CPUID requirements + +The first step in making nested SVM production-ready is to make sure +that all features are implemented and well-tested. To make this +tractable, we will initially be limiting the "supported" range of +nested virt to a specific subset of host and guest features. This +document describes the criteria for deciding on features, and the +rationale behind each feature. + +For AMD, all virtualization-related features can be found in CPUID +leaf 8000000A:edx + +# Criteria + +- Processor support: At a minimum we want to support processors from + the last 5 years. All things being equal, older processors are + better. Bits 0:7 were available in the very earliest processors; + and even through bit 15 we should be pretty good support-wise. + +- Faithfulness to hardware: We need the behavior of the "virtual cpu" + from the L1 hypervisor's perspective to be as close as possible to + the original hardware. In particular, the behavior of the hardware + on error paths 1) is not easy to understand or test, 2) can be the + source of surprising vulnerabiliies. (See XSA-7 for an example of a + case where subtle error-handling differences can open up a privilege + escalation.) We should avoid emulating any bit of the hardware with + complex error paths if we can at all help it. + +- Cost of implementation: We want to minimize the cost of + implementation (where this includes bringing an existing sub-par + implementation up to speed). All things being equal, we'll favor a + configuration which does not require any new implementation. + +- Performance: All things being equal, we'd prefer to choose a set of + L0 / L1 CPUID bits that are faster than slower. + + +# Bits + +- 0 `NP` *Nested Paging*: Required both for L0 and L1. + + Based primarily on faithfulness and performance, as well as + potential cost of implementation. Available on earliest hardware, + so no compatibility issues. + +- 1 `LbrVirt` *LBR / debugging virtualization*: Require for L0 and L1. + + For L0 this is required for performance: There's no way to tell the + guests not to use the LBR-related registers; and if the guest does, + then you have to save and restore all LBR-related registers on + context switch, which is prohibitive. Furthermore, the additional + emulation risks a security-relevant difference to come up. + + Providing it to L1 when we have it in L0 is basically free, and + already implemented. + + Just require it and provide it. + +- 2 `SVML` *SVM Lock*: Not required for L0, not provided to L1 + + Seems to be aboult enabling an operating system to prevent "blue + pill" attacks against itself. + + Xen doesn't use it, nor provide it; so it would need to be + implementend. The best way to protect a guest OS is to leave nested + virt disabled in the tools. + +- 3 `NRIPS` NRIP Save: Require for both L0 and L1 + + If NRIPS is not present, the software interrupt injection + functionality can't be used; and Xen has to emulate it. That's + another source of potential security issues. If hardware supports + it, then providing it to guest is basically free. + +- 4 `TscRateMsr`: Not required by L0, not provided to L1 + + The main putative use for this would be trying to maintain an + invariant TSC across cores with different clock speeds, or after a + migrate. Unlike others, this doesn't have an error path to worry + about compatibility-wise; and according to tests done when nestedSVM + was first implemented, it's actually faster to emliate TscRateMSR in + the L0 hypervisor than for L1 to attempt to emulate it itself. + + However, using this properly in L0 will take some implementation + effort; and composing it properly with L1 will take even more + effort. Just leave it off for now. + + - 5 `VmcbClean`: VMCB Clean Bits: Not required by L0, provide to L1 + + This is a pure optimization, both on the side of the L0 and L1. The + implementaiton for L1 is entirely Xen-side, so can be provided even + on hardware that doesn't provide it. And it's purely an + optimization, so could be "implemented" by ignoring the bits + entirely. + + As such, we don't need to require it for L0; and as it's already + implemented, no reason not to provide it to L1. Before this feature + was available those bits were marked SBZ ("should be zero"); setting + them was already advertised to cause unpredictable behavior. + +- 6 `FlushByAsid`: Require for L0, provide to L1 + + This is cheap and easy to use for L0 and to provide to the L1; + there's no reson not to just pass it through. + +- 7 `DecodeAssists`: Require for L0, provide to L1 + + Using it in L0 reduces the chance that we'll make some sort of error + in the decode path. And if hardware supports it, it's easy enough + to provide to the L1. diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c index XXXXXXX..XXXXXXX 100644 --- a/xen/arch/x86/domain.c +++ b/xen/arch/x86/domain.c @@ -XXX,XX +XXX,XX @@ int arch_sanitise_domain_config(struct xen_domctl_createdomain *config) */ config->flags |= XEN_DOMCTL_CDF_oos_off; + if ( nested_virt && !hvm_nested_virt_supported() ) + { + dprintk(XENLOG_INFO, "Nested virt requested but not available\n"); + return -EINVAL; + } + if ( nested_virt && !hap ) { dprintk(XENLOG_INFO, "Nested virt not supported without HAP\n"); diff --git a/xen/arch/x86/hvm/svm/nestedhvm.h b/xen/arch/x86/hvm/svm/nestedhvm.h index XXXXXXX..XXXXXXX 100644 --- a/xen/arch/x86/hvm/svm/nestedhvm.h +++ b/xen/arch/x86/hvm/svm/nestedhvm.h @@ -XXX,XX +XXX,XX @@ enum nestedhvm_vmexits nestedsvm_check_intercepts(struct vcpu *v, struct cpu_user_regs *regs, uint64_t exitcode); void svm_nested_features_on_efer_update(struct vcpu *v); +void __init start_nested_svm(struct hvm_function_table *svm_function_table); /* Interface methods */ void cf_check nsvm_vcpu_destroy(struct vcpu *v); diff --git a/xen/arch/x86/hvm/svm/nestedsvm.c b/xen/arch/x86/hvm/svm/nestedsvm.c index XXXXXXX..XXXXXXX 100644 --- a/xen/arch/x86/hvm/svm/nestedsvm.c +++ b/xen/arch/x86/hvm/svm/nestedsvm.c @@ -XXX,XX +XXX,XX @@ void svm_nested_features_on_efer_update(struct vcpu *v) } } } + +void __init start_nested_svm(struct hvm_function_table *svm_function_table) +{ + /* + * Required host functionality to support nested virt. See + * docs/designs/nested-svm-cpu-features.md for rationale. + */ + svm_function_table->caps.nested_virt = + cpu_has_svm_nrips && + cpu_has_svm_lbrv && + cpu_has_svm_nrips && + cpu_has_svm_flushbyasid && + cpu_has_svm_decode; +} diff --git a/xen/arch/x86/hvm/svm/svm.c b/xen/arch/x86/hvm/svm/svm.c index XXXXXXX..XXXXXXX 100644 --- a/xen/arch/x86/hvm/svm/svm.c +++ b/xen/arch/x86/hvm/svm/svm.c @@ -XXX,XX +XXX,XX @@ const struct hvm_function_table * __init start_svm(void) svm_function_table.caps.hap_superpage_2mb = true; svm_function_table.caps.hap_superpage_1gb = cpu_has_page1gb; + start_nested_svm(&svm_function_table); + return &svm_function_table; } diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c index XXXXXXX..XXXXXXX 100644 --- a/xen/arch/x86/hvm/vmx/vmx.c +++ b/xen/arch/x86/hvm/vmx/vmx.c @@ -XXX,XX +XXX,XX @@ const struct hvm_function_table * __init start_vmx(void) if ( cpu_has_vmx_tsc_scaling ) vmx_function_table.tsc_scaling.ratio_frac_bits = 48; + /* TODO: Require hardware support before enabling nested virt */ + vmx_function_table.caps.nested_virt = vmx_function_table.caps.hap; + model_specific_lbr = get_model_specific_lbr(); lbr_tsx_fixup_check(); ler_to_fixup_check(); diff --git a/xen/arch/x86/include/asm/hvm/hvm.h b/xen/arch/x86/include/asm/hvm/hvm.h index XXXXXXX..XXXXXXX 100644 --- a/xen/arch/x86/include/asm/hvm/hvm.h +++ b/xen/arch/x86/include/asm/hvm/hvm.h @@ -XXX,XX +XXX,XX @@ struct hvm_function_table { singlestep:1, /* Hardware virtual interrupt delivery enable? */ - virtual_intr_delivery; + virtual_intr_delivery, + + /* Nested virt capabilities */ + nested_virt:1; } caps; @@ -XXX,XX +XXX,XX @@ static inline bool hvm_altp2m_supported(void) return hvm_funcs.caps.altp2m; } +/* Returns true if we have the minimum hardware requirements for nested virt */ +static inline bool hvm_nested_virt_supported(void) +{ + return hvm_funcs.caps.nested_virt; +} + /* updates the current hardware p2m */ static inline void altp2m_vcpu_update_p2m(struct vcpu *v) { -- 2.25.1
This series lays the groundwork for revamp of the AMD nested virt functionality. The first two patches are clean-ups or reorganizations of existing code. The final patch is the first major step towards making the feature supportable: allowing Xen to refuse nested virt support if certain hardware features are not present. George Dunlap (3): x86: Move SVM features exposed to guest into hvm_max_cpu_policy nestedsvm: Disable TscRateMSR svm/nestedsvm: Introduce nested capabilities bit docs/designs/nested-svm-cpu-features.md | 111 +++++++++++++++++++ xen/arch/x86/cpu-policy.c | 29 ++--- xen/arch/x86/domain.c | 6 + xen/arch/x86/hvm/nestedhvm.c | 10 ++ xen/arch/x86/hvm/svm/nestedsvm.c | 16 ++- xen/arch/x86/hvm/svm/svm.c | 57 ---------- xen/arch/x86/hvm/vmx/vvmx.c | 8 ++ xen/arch/x86/include/asm/hvm/hvm.h | 16 ++- xen/arch/x86/include/asm/hvm/nestedhvm.h | 4 + xen/arch/x86/include/asm/hvm/svm/nestedsvm.h | 5 - 10 files changed, 184 insertions(+), 78 deletions(-) create mode 100644 docs/designs/nested-svm-cpu-features.md -- 2.25.1
Currently (nested) SVM features we're willing to expose to the guest are defined in calculate_host_policy, and stored in host_cpu_policy. This is the wrong place for this; move it into calculate_hvm_max_policy(), and store it in hvm_max_cpu_policy. Signed-off-by: George Dunlap <george.dunlap@cloud.com> --- v2: - New --- xen/arch/x86/cpu-policy.c | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/xen/arch/x86/cpu-policy.c b/xen/arch/x86/cpu-policy.c index XXXXXXX..XXXXXXX 100644 --- a/xen/arch/x86/cpu-policy.c +++ b/xen/arch/x86/cpu-policy.c @@ -XXX,XX +XXX,XX @@ static void __init calculate_host_policy(void) if ( vpmu_mode == XENPMU_MODE_OFF ) p->basic.raw[0xa] = EMPTY_LEAF; - if ( p->extd.svm ) - { - /* Clamp to implemented features which require hardware support. */ - p->extd.raw[0xa].d &= ((1u << SVM_FEATURE_NPT) | - (1u << SVM_FEATURE_LBRV) | - (1u << SVM_FEATURE_NRIPS) | - (1u << SVM_FEATURE_PAUSEFILTER) | - (1u << SVM_FEATURE_DECODEASSISTS)); - /* Enable features which are always emulated. */ - p->extd.raw[0xa].d |= ((1u << SVM_FEATURE_VMCBCLEAN) | - (1u << SVM_FEATURE_TSCRATEMSR)); - } - /* 0x000000ce MSR_INTEL_PLATFORM_INFO */ /* probe_cpuid_faulting() sanity checks presence of MISC_FEATURES_ENABLES */ p->platform_info.cpuid_faulting = cpu_has_cpuid_faulting; @@ -XXX,XX +XXX,XX @@ static void __init calculate_hvm_max_policy(void) if ( !cpu_has_vmx ) __clear_bit(X86_FEATURE_PKS, fs); + /* + * Make adjustments to possible (nested) virtualization features exposed + * to the guest + */ + if ( p->extd.svm ) + { + /* Clamp to implemented features which require hardware support. */ + p->extd.raw[0xa].d &= ((1u << SVM_FEATURE_NPT) | + (1u << SVM_FEATURE_LBRV) | + (1u << SVM_FEATURE_NRIPS) | + (1u << SVM_FEATURE_PAUSEFILTER) | + (1u << SVM_FEATURE_DECODEASSISTS)); + /* Enable features which are always emulated. */ + p->extd.raw[0xa].d |= ((1u << SVM_FEATURE_VMCBCLEAN) | + (1u << SVM_FEATURE_TSCRATEMSR)); + } + guest_common_max_feature_adjustments(fs); guest_common_feature_adjustments(fs); -- 2.25.1
The primary purpose of TSC scaling, from our perspective, is to maintain the fiction of an "invariant TSC" across migrates between platforms with different clock speeds. On AMD, the TscRateMSR CPUID bit is unconditionally enabled in the "host cpuid", even if the hardware doesn't actually support it. According to c/s fd14a1943c4 ("nestedsvm: Support TSC Rate MSR"), testing showed that emulating TSC scaling in an L1 was more expensive than emulating TSC scaling on an L0 (due to extra sets of vmexit / vmenter). However, the current implementation seems to be broken. First of all, the final L2 scaling ratio should be a composition of the L0 scaling ratio and the L1 scaling ratio; there's no indication this is being done anywhere. Secondly, it's not clear that the L1 tsc scaling ratio actually affects the L0 tsc scaling ratio. The stored value (ns_tscratio) is used to affect the tsc *offset*, but doesn't seem to actually be factored into d->hvm.tsc_scaling_ratio. (Which shouldn't be per-domain anyway, but per-vcpu.) Having the *offset* scaled according to the nested scaling without the actual RDTSC itself also being scaled has got to produce inconsistent results. For now, just disable the functionality entirely until we can implement it properly: - Don't set TSCRATEMSR in the host CPUID policy - Remove MSR_AMD64_TSC_RATIO emulation handling, so that the guest guests a #GP if it tries to access them (as it should when TSCRATEMSR is clear) - Remove ns_tscratio from struct nestedhvm, and all code that touches it Unfortunately this means ripping out the scaling calculation stuff as well, since it's only used in the nested case; it's there in the git tree if we need it for reference when we re-introduce it. Signed-off-by: George Dunlap <george.dunlap@cloud.com> --- v2: - Port over move to hvm_max_cpu_policy CC: Jan Beulich <jbeulich@suse.com> CC: Andrew Cooper <andrew.cooper3@citrix.com> CC: "Roger Pau Monné" <roger.pau@citrix.com> CC: Wei Liu <wl@xen.org> --- xen/arch/x86/cpu-policy.c | 3 +- xen/arch/x86/hvm/svm/nestedsvm.c | 2 - xen/arch/x86/hvm/svm/svm.c | 57 -------------------- xen/arch/x86/include/asm/hvm/svm/nestedsvm.h | 5 -- 4 files changed, 1 insertion(+), 66 deletions(-) diff --git a/xen/arch/x86/cpu-policy.c b/xen/arch/x86/cpu-policy.c index XXXXXXX..XXXXXXX 100644 --- a/xen/arch/x86/cpu-policy.c +++ b/xen/arch/x86/cpu-policy.c @@ -XXX,XX +XXX,XX @@ static void __init calculate_hvm_max_policy(void) (1u << SVM_FEATURE_PAUSEFILTER) | (1u << SVM_FEATURE_DECODEASSISTS)); /* Enable features which are always emulated. */ - p->extd.raw[0xa].d |= ((1u << SVM_FEATURE_VMCBCLEAN) | - (1u << SVM_FEATURE_TSCRATEMSR)); + p->extd.raw[0xa].d |= (1u << SVM_FEATURE_VMCBCLEAN); } guest_common_max_feature_adjustments(fs); diff --git a/xen/arch/x86/hvm/svm/nestedsvm.c b/xen/arch/x86/hvm/svm/nestedsvm.c index XXXXXXX..XXXXXXX 100644 --- a/xen/arch/x86/hvm/svm/nestedsvm.c +++ b/xen/arch/x86/hvm/svm/nestedsvm.c @@ -XXX,XX +XXX,XX @@ int cf_check nsvm_vcpu_reset(struct vcpu *v) svm->ns_msr_hsavepa = INVALID_PADDR; svm->ns_ovvmcb_pa = INVALID_PADDR; - svm->ns_tscratio = DEFAULT_TSC_RATIO; - svm->ns_cr_intercepts = 0; svm->ns_dr_intercepts = 0; svm->ns_exception_intercepts = 0; diff --git a/xen/arch/x86/hvm/svm/svm.c b/xen/arch/x86/hvm/svm/svm.c index XXXXXXX..XXXXXXX 100644 --- a/xen/arch/x86/hvm/svm/svm.c +++ b/xen/arch/x86/hvm/svm/svm.c @@ -XXX,XX +XXX,XX @@ static int cf_check svm_get_guest_pat(struct vcpu *v, u64 *gpat) return 1; } -static uint64_t scale_tsc(uint64_t host_tsc, uint64_t ratio) -{ - uint64_t mult, frac, scaled_host_tsc; - - if ( ratio == DEFAULT_TSC_RATIO ) - return host_tsc; - - /* - * Suppose the most significant 32 bits of host_tsc and ratio are - * tsc_h and mult, and the least 32 bits of them are tsc_l and frac, - * then - * host_tsc * ratio * 2^-32 - * = host_tsc * (mult * 2^32 + frac) * 2^-32 - * = host_tsc * mult + (tsc_h * 2^32 + tsc_l) * frac * 2^-32 - * = host_tsc * mult + tsc_h * frac + ((tsc_l * frac) >> 32) - * - * Multiplications in the last two terms are between 32-bit integers, - * so both of them can fit in 64-bit integers. - * - * Because mult is usually less than 10 in practice, it's very rare - * that host_tsc * mult can overflow a 64-bit integer. - */ - mult = ratio >> 32; - frac = ratio & ((1ULL << 32) - 1); - scaled_host_tsc = host_tsc * mult; - scaled_host_tsc += (host_tsc >> 32) * frac; - scaled_host_tsc += ((host_tsc & ((1ULL << 32) - 1)) * frac) >> 32; - - return scaled_host_tsc; -} - -static uint64_t svm_get_tsc_offset(uint64_t host_tsc, uint64_t guest_tsc, - uint64_t ratio) -{ - return guest_tsc - scale_tsc(host_tsc, ratio); -} - static void cf_check svm_set_tsc_offset(struct vcpu *v, u64 offset, u64 at_tsc) { struct vmcb_struct *vmcb = v->arch.hvm.svm.vmcb; @@ -XXX,XX +XXX,XX @@ static void cf_check svm_set_tsc_offset(struct vcpu *v, u64 offset, u64 at_tsc) if ( nestedhvm_vcpu_in_guestmode(v) ) { - struct nestedsvm *svm = &vcpu_nestedsvm(v); - n2_tsc_offset = vmcb_get_tsc_offset(n2vmcb) - vmcb_get_tsc_offset(n1vmcb); - if ( svm->ns_tscratio != DEFAULT_TSC_RATIO ) - { - uint64_t guest_tsc = hvm_get_guest_tsc_fixed(v, at_tsc); - - n2_tsc_offset = svm_get_tsc_offset(guest_tsc, - guest_tsc + n2_tsc_offset, - svm->ns_tscratio); - } vmcb_set_tsc_offset(n1vmcb, offset); } @@ -XXX,XX +XXX,XX @@ static int cf_check svm_msr_read_intercept( *msr_content = nsvm->ns_msr_hsavepa; break; - case MSR_AMD64_TSC_RATIO: - *msr_content = nsvm->ns_tscratio; - break; - case MSR_AMD_OSVW_ID_LENGTH: case MSR_AMD_OSVW_STATUS: if ( !d->arch.cpuid->extd.osvw ) @@ -XXX,XX +XXX,XX @@ static int cf_check svm_msr_write_intercept( goto gpf; break; - case MSR_AMD64_TSC_RATIO: - if ( msr_content & TSC_RATIO_RSVD_BITS ) - goto gpf; - nsvm->ns_tscratio = msr_content; - break; - case MSR_IA32_MCx_MISC(4): /* Threshold register */ case MSR_F10_MC4_MISC1 ... MSR_F10_MC4_MISC3: /* diff --git a/xen/arch/x86/include/asm/hvm/svm/nestedsvm.h b/xen/arch/x86/include/asm/hvm/svm/nestedsvm.h index XXXXXXX..XXXXXXX 100644 --- a/xen/arch/x86/include/asm/hvm/svm/nestedsvm.h +++ b/xen/arch/x86/include/asm/hvm/svm/nestedsvm.h @@ -XXX,XX +XXX,XX @@ struct nestedsvm { */ uint64_t ns_ovvmcb_pa; - /* virtual tscratio holding the value l1 guest writes to the - * MSR_AMD64_TSC_RATIO MSR. - */ - uint64_t ns_tscratio; - /* Cached real intercepts of the l2 guest */ uint32_t ns_cr_intercepts; uint32_t ns_dr_intercepts; -- 2.25.1
In order to make implementation and testing tractable, we will require specific host functionality. Add a nested_virt bit to hvm_funcs.caps, and return an error if a domain is created with nested virt and this bit isn't set. Create VMX and SVM callbacks to be executed from start_nested_svm(), which is guaranteed to execute after all command-line options have been procesed. For VMX, start with always enabling it if HAP is present; this shouldn't change current behvior. For SVM, require some basic functionality, adding a document explaining the rationale. NB that only SVM CPUID bits 0-7 have been considered. Bits 10-16 may be considered in a follow-up patch. Signed-off-by: George Dunlap <george.dunlap@cloud.com> --- v2: - Fixed typo in title - Added hvm_nested_virt_supported() def for !CONFIG_HVM - Rebased over previous changes - Tweak some wording in document - Require npt rather than nrips twice - Remove stray __init from header - Set caps.nested_virt from callback from nestedhvm_setup() CC: Andrew Cooper <andrew.cooper3@citrix.com> CC: George Dunlap <george.dunlap@citrix.com> CC: Jan Beulich <jbeulich@suse.com> CC: Julien Grall <julien@xen.org> CC: Stefano Stabellini <sstabellini@kernel.org> CC: Wei Liu <wl@xen.org> CC: "Roger Pau Monné" <roger.pau@citrix.com> CC: Jun Nakajima <jun.nakajima@intel.com> CC: Kevin Tian <kevin.tian@intel.com> --- docs/designs/nested-svm-cpu-features.md | 111 +++++++++++++++++++++++ xen/arch/x86/domain.c | 6 ++ xen/arch/x86/hvm/nestedhvm.c | 10 ++ xen/arch/x86/hvm/svm/nestedsvm.c | 14 +++ xen/arch/x86/hvm/vmx/vvmx.c | 8 ++ xen/arch/x86/include/asm/hvm/hvm.h | 16 +++- xen/arch/x86/include/asm/hvm/nestedhvm.h | 4 + 7 files changed, 168 insertions(+), 1 deletion(-) diff --git a/docs/designs/nested-svm-cpu-features.md b/docs/designs/nested-svm-cpu-features.md new file mode 100644 index XXXXXXX..XXXXXXX --- /dev/null +++ b/docs/designs/nested-svm-cpu-features.md @@ -XXX,XX +XXX,XX @@ +# Nested SVM (AMD) CPUID requirements + +The first step in making nested SVM production-ready is to make sure +that all features are implemented and well-tested. To make this +tractable, we will initially be limiting the "supported" range of +nested virt to a specific subset of host and guest features. This +document describes the criteria for deciding on features, and the +rationale behind each feature. + +For AMD, all virtualization-related features can be found in CPUID +leaf 8000000A:edx + +# Criteria + +- Processor support: At a minimum we want to support processors from + the last 5 years. All things being equal, we'd prefer to cover + older processors than not. Bits 0:7 were available in the very + earliest processors; and even through bit 15 we should be pretty + good support-wise. + +- Faithfulness to hardware: We need the behavior of the "virtual cpu" + from the L1 hypervisor's perspective to be as close as possible to + the original hardware. In particular, the behavior of the hardware + on error paths 1) is not easy to understand or test, 2) can be the + source of surprising vulnerabiliies. (See XSA-7 for an example of a + case where subtle error-handling differences can open up a privilege + escalation.) We should avoid emulating any bit of the hardware with + complex error paths if we can at all help it. + +- Cost of implementation: We want to minimize the cost of + implementation (where this includes bringing an existing sub-par + implementation up to speed). All things being equal, we'll favor a + configuration which does not require any new implementation. + +- Performance: All things being equal, we'd prefer to choose a set of + L0 / L1 CPUID bits that are faster than slower. + + +# Bits + +- 0 `NP` *Nested Paging*: Required both for L0 and L1. + + Based primarily on faithfulness and performance, as well as + potential cost of implementation. Available on earliest hardware, + so no compatibility issues. + +- 1 `LbrVirt` *LBR / debugging virtualization*: Require for L0 and L1. + + For L0 this is required for performance: There's no way to tell the + guests not to use the LBR-related registers; and if the guest does, + then you have to save and restore all LBR-related registers on + context switch, which is prohibitive. Furthermore, the additional + emulation risks a security-relevant difference to come up. + + Providing it to L1 when we have it in L0 is basically free, and + already implemented. + + Just require it and provide it. + +- 2 `SVML` *SVM Lock*: Not required for L0, not provided to L1 + + Seems to be aboult enabling an operating system to prevent "blue + pill" attacks against itself. + + Xen doesn't use it, nor provide it; so it would need to be + implementend. The best way to protect a guest OS is to leave nested + virt disabled in the tools. + +- 3 `NRIPS` NRIP Save: Require for both L0 and L1 + + If NRIPS is not present, the software interrupt injection + functionality can't be used; and Xen has to emulate it. That's + another source of potential security issues. If hardware supports + it, then providing it to guest is basically free. + +- 4 `TscRateMsr`: Not required by L0, not provided to L1 + + The main putative use for this would be trying to maintain an + invariant TSC across cores with different clock speeds, or after a + migrate. Unlike others, this doesn't have an error path to worry + about compatibility-wise; and according to tests done when nestedSVM + was first implemented, it's actually faster to emliate TscRateMSR in + the L0 hypervisor than for L1 to attempt to emulate it itself. + + However, using this properly in L0 will take some implementation + effort; and composing it properly with L1 will take even more + effort. Just leave it off for now. + + - 5 `VmcbClean`: VMCB Clean Bits: Not required by L0, provide to L1 + + This is a pure optimization, both on the side of the L0 and L1. The + implementaiton for L1 is entirely Xen-side, so can be provided even + on hardware that doesn't provide it. And it's purely an + optimization, so could be "implemented" by ignoring the bits + entirely. + + As such, we don't need to require it for L0; and as it's already + implemented, no reason not to provide it to L1. Before this feature + was available those bits were marked SBZ ("should be zero"); setting + them was already advertised to cause unpredictable behavior. + +- 6 `FlushByAsid`: Require for L0, provide to L1 + + This is cheap and easy to use for L0 and to provide to the L1; + there's no reson not to just pass it through. + +- 7 `DecodeAssists`: Require for L0, provide to L1 + + Using it in L0 reduces the chance that we'll make some sort of error + in the decode path. And if hardware supports it, it's easy enough + to provide to the L1. diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c index XXXXXXX..XXXXXXX 100644 --- a/xen/arch/x86/domain.c +++ b/xen/arch/x86/domain.c @@ -XXX,XX +XXX,XX @@ int arch_sanitise_domain_config(struct xen_domctl_createdomain *config) */ config->flags |= XEN_DOMCTL_CDF_oos_off; + if ( nested_virt && !hvm_nested_virt_supported() ) + { + dprintk(XENLOG_INFO, "Nested virt requested but not available\n"); + return -EINVAL; + } + if ( nested_virt && !hap ) { dprintk(XENLOG_INFO, "Nested virt not supported without HAP\n"); diff --git a/xen/arch/x86/hvm/nestedhvm.c b/xen/arch/x86/hvm/nestedhvm.c index XXXXXXX..XXXXXXX 100644 --- a/xen/arch/x86/hvm/nestedhvm.c +++ b/xen/arch/x86/hvm/nestedhvm.c @@ -XXX,XX +XXX,XX @@ static int __init cf_check nestedhvm_setup(void) __clear_bit(0x80, shadow_io_bitmap[0]); __clear_bit(0xed, shadow_io_bitmap[1]); + /* + * NB this must be called after all command-line processing has been + * done, so that if (for example) HAP is disabled, nested virt is + * disabled as well. + */ + if ( cpu_has_vmx ) + start_nested_vmx(&hvm_funcs); + else if ( cpu_has_svm ) + start_nested_svm(&hvm_funcs); + return 0; } __initcall(nestedhvm_setup); diff --git a/xen/arch/x86/hvm/svm/nestedsvm.c b/xen/arch/x86/hvm/svm/nestedsvm.c index XXXXXXX..XXXXXXX 100644 --- a/xen/arch/x86/hvm/svm/nestedsvm.c +++ b/xen/arch/x86/hvm/svm/nestedsvm.c @@ -XXX,XX +XXX,XX @@ void svm_nested_features_on_efer_update(struct vcpu *v) } } } + +void __init start_nested_svm(struct hvm_function_table *hvm_function_table) +{ + /* + * Required host functionality to support nested virt. See + * docs/designs/nested-svm-cpu-features.md for rationale. + */ + hvm_function_table->caps.nested_virt = + hvm_function_table->caps.hap && + cpu_has_svm_lbrv && + cpu_has_svm_nrips && + cpu_has_svm_flushbyasid && + cpu_has_svm_decode; +} diff --git a/xen/arch/x86/hvm/vmx/vvmx.c b/xen/arch/x86/hvm/vmx/vvmx.c index XXXXXXX..XXXXXXX 100644 --- a/xen/arch/x86/hvm/vmx/vvmx.c +++ b/xen/arch/x86/hvm/vmx/vvmx.c @@ -XXX,XX +XXX,XX @@ void nvmx_set_cr_read_shadow(struct vcpu *v, unsigned int cr) __vmwrite(read_shadow_field, v->arch.hvm.nvcpu.guest_cr[cr]); } +void __init start_nested_vmx(struct hvm_function_table *hvm_function_table) +{ + /* TODO: Require hardware support before enabling nested virt */ + hvm_function_table->caps.nested_virt = + hvm_function_table->caps.hap; +} + + /* * Local variables: * mode: C diff --git a/xen/arch/x86/include/asm/hvm/hvm.h b/xen/arch/x86/include/asm/hvm/hvm.h index XXXXXXX..XXXXXXX 100644 --- a/xen/arch/x86/include/asm/hvm/hvm.h +++ b/xen/arch/x86/include/asm/hvm/hvm.h @@ -XXX,XX +XXX,XX @@ struct hvm_function_table { singlestep:1, /* Hardware virtual interrupt delivery enable? */ - virtual_intr_delivery:1; + virtual_intr_delivery:1, + + /* Nested virt capabilities */ + nested_virt:1; } caps; /* @@ -XXX,XX +XXX,XX @@ static inline bool hvm_altp2m_supported(void) return hvm_funcs.caps.altp2m; } +/* Returns true if we have the minimum hardware requirements for nested virt */ +static inline bool hvm_nested_virt_supported(void) +{ + return hvm_funcs.caps.nested_virt; +} + /* updates the current hardware p2m */ static inline void altp2m_vcpu_update_p2m(struct vcpu *v) { @@ -XXX,XX +XXX,XX @@ static inline bool hvm_hap_supported(void) return false; } +static inline bool hvm_nested_virt_supported(void) +{ + return false; +} + static inline bool nhvm_vmcx_hap_enabled(const struct vcpu *v) { ASSERT_UNREACHABLE(); diff --git a/xen/arch/x86/include/asm/hvm/nestedhvm.h b/xen/arch/x86/include/asm/hvm/nestedhvm.h index XXXXXXX..XXXXXXX 100644 --- a/xen/arch/x86/include/asm/hvm/nestedhvm.h +++ b/xen/arch/x86/include/asm/hvm/nestedhvm.h @@ -XXX,XX +XXX,XX @@ static inline bool vvmcx_valid(const struct vcpu *v) return vcpu_nestedhvm(v).nv_vvmcxaddr != INVALID_PADDR; } + +void start_nested_svm(struct hvm_function_table *); +void start_nested_vmx(struct hvm_function_table *); + #endif /* _HVM_NESTEDHVM_H */ -- 2.25.1