This series lays the groundwork for a revamp of the AMD nested virt functionality. The first five patches are clean-ups or reorganizations of existing code. The final patch is the first major step towards making the feature supportable: allowing Xen to refuse nested virt support if certain hardware features are not present. George Dunlap (6): xen/hvm: Convert hap_capabilities into a bitfield svm: Improve type of cpu_has_svm_feature xen/hvm: Move other hvm_function_table booleans into the caps bitfield nestedsvm: Disable TscRateMSR nestedsvm: Remove bogus debug message from nestedsvm_check_intercepts svm/nestedvm: Introduce nested capabilities bit docs/designs/nested-svm-cpu-features.md | 110 +++++++++++++++++++ xen/arch/x86/cpu-policy.c | 3 +- xen/arch/x86/domain.c | 6 + xen/arch/x86/hvm/hvm.c | 14 +-- xen/arch/x86/hvm/svm/nestedhvm.h | 1 + xen/arch/x86/hvm/svm/nestedsvm.c | 22 +++- xen/arch/x86/hvm/svm/svm.c | 65 +---------- xen/arch/x86/hvm/vlapic.c | 4 +- xen/arch/x86/hvm/vmx/vmcs.c | 6 +- xen/arch/x86/hvm/vmx/vmx.c | 19 ++-- xen/arch/x86/include/asm/hvm/hvm.h | 47 ++++---- xen/arch/x86/include/asm/hvm/svm/nestedsvm.h | 5 - xen/arch/x86/include/asm/hvm/svm/svm.h | 5 +- 13 files changed, 191 insertions(+), 116 deletions(-) create mode 100644 docs/designs/nested-svm-cpu-features.md -- 2.25.1
hvm_function_table is an internal structure; rather than manually |-ing and &-ing bits, just make it a boolean bitfield and let the compiler do all the work. This makes everything easier to read, and presumably allows the compiler more flexibility in producing efficient code. No functional change intended. Signed-off-by: George Dunlap <george.dunlap@cloud.com> --- Questions: * Should hap_superpage_2mb really be set unconditionally, or should we condition it on cpu_has_svm_npt? * Do we really need to "!!cpu_has_svm_npt"? If so, wouldn't it be better to put the "!!" in the #define, rather than requiring the user to know that it's needed? CC: Jan Beulich <jbeulich@suse.com> CC: Andrew Cooper <andrew.cooper3@citrix.com> CC: "Roger Pau Monné" <roger.pau@citrix.com> CC: Wei Liu <wl@xen.org> CC: Jun Nakajima <jun.nakajima@intel.com> CC: Kevin Tian <kevin.tian@intel.com> --- xen/arch/x86/hvm/hvm.c | 8 ++++---- xen/arch/x86/hvm/svm/svm.c | 4 ++-- xen/arch/x86/hvm/vmx/vmcs.c | 4 ++-- xen/arch/x86/hvm/vmx/vmx.c | 8 ++------ xen/arch/x86/include/asm/hvm/hvm.h | 19 +++++++------------ 5 files changed, 17 insertions(+), 26 deletions(-) diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c index XXXXXXX..XXXXXXX 100644 --- a/xen/arch/x86/hvm/hvm.c +++ b/xen/arch/x86/hvm/hvm.c @@ -XXX,XX +XXX,XX @@ static int __init cf_check hvm_enable(void) { printk("HVM: Hardware Assisted Paging (HAP) detected\n"); printk("HVM: HAP page sizes: 4kB"); - if ( fns->hap_capabilities & HVM_HAP_SUPERPAGE_2MB ) + if ( fns->caps.hap_superpage_2mb ) { printk(", 2MB%s", opt_hap_2mb ? "" : " [disabled]"); if ( !opt_hap_2mb ) - hvm_funcs.hap_capabilities &= ~HVM_HAP_SUPERPAGE_2MB; + hvm_funcs.caps.hap_superpage_2mb = false; } - if ( fns->hap_capabilities & HVM_HAP_SUPERPAGE_1GB ) + if ( fns->caps.hap_superpage_1gb ) { printk(", 1GB%s", opt_hap_1gb ? "" : " [disabled]"); if ( !opt_hap_1gb ) - hvm_funcs.hap_capabilities &= ~HVM_HAP_SUPERPAGE_1GB; + hvm_funcs.caps.hap_superpage_1gb = false; } printk("\n"); } diff --git a/xen/arch/x86/hvm/svm/svm.c b/xen/arch/x86/hvm/svm/svm.c index XXXXXXX..XXXXXXX 100644 --- a/xen/arch/x86/hvm/svm/svm.c +++ b/xen/arch/x86/hvm/svm/svm.c @@ -XXX,XX +XXX,XX @@ const struct hvm_function_table * __init start_svm(void) printk(" - none\n"); svm_function_table.hap_supported = !!cpu_has_svm_npt; - svm_function_table.hap_capabilities = HVM_HAP_SUPERPAGE_2MB | - (cpu_has_page1gb ?
HVM_HAP_SUPERPAGE_1GB : 0); + svm_function_table.caps.hap_superpage_2mb = true; + svm_function_table.caps.hap_superpage_1gb = cpu_has_page1gb; return &svm_function_table; } diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c index XXXXXXX..XXXXXXX 100644 --- a/xen/arch/x86/hvm/vmx/vmcs.c +++ b/xen/arch/x86/hvm/vmx/vmcs.c @@ -XXX,XX +XXX,XX @@ static int cf_check parse_ept_param_runtime(const char *s) int val; if ( !cpu_has_vmx_ept || !hvm_funcs.hap_supported || - !(hvm_funcs.hap_capabilities & - (HVM_HAP_SUPERPAGE_2MB | HVM_HAP_SUPERPAGE_1GB)) ) + !(hvm_funcs.caps.hap_superpage_2mb || + hvm_funcs.caps.hap_superpage_1gb) ) { printk("VMX: EPT not available, or not in use - ignoring\n"); return 0; diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c index XXXXXXX..XXXXXXX 100644 --- a/xen/arch/x86/hvm/vmx/vmx.c +++ b/xen/arch/x86/hvm/vmx/vmx.c @@ -XXX,XX +XXX,XX @@ const struct hvm_function_table * __init start_vmx(void) vmx_function_table.hap_supported = 1; vmx_function_table.altp2m_supported = 1; - vmx_function_table.hap_capabilities = 0; - - if ( cpu_has_vmx_ept_2mb ) - vmx_function_table.hap_capabilities |= HVM_HAP_SUPERPAGE_2MB; - if ( cpu_has_vmx_ept_1gb ) - vmx_function_table.hap_capabilities |= HVM_HAP_SUPERPAGE_1GB; + vmx_function_table.caps.hap_superpage_2mb = cpu_has_vmx_ept_2mb; + vmx_function_table.caps.hap_superpage_1gb = cpu_has_vmx_ept_1gb; setup_ept_dump(); } diff --git a/xen/arch/x86/include/asm/hvm/hvm.h b/xen/arch/x86/include/asm/hvm/hvm.h index XXXXXXX..XXXXXXX 100644 --- a/xen/arch/x86/include/asm/hvm/hvm.h +++ b/xen/arch/x86/include/asm/hvm/hvm.h @@ -XXX,XX +XXX,XX @@ enum hvm_intblk { #define HVM_INTR_SHADOW_SMI 0x00000004 #define HVM_INTR_SHADOW_NMI 0x00000008 -/* - * HAP super page capabilities: - * bit0: if 2MB super page is allowed? - * bit1: if 1GB super page is allowed? - */ -#define HVM_HAP_SUPERPAGE_2MB 0x00000001 -#define HVM_HAP_SUPERPAGE_1GB 0x00000002 - #define HVM_EVENT_VECTOR_UNSET (-1) #define HVM_EVENT_VECTOR_UPDATING (-2) @@ -XXX,XX +XXX,XX @@ struct hvm_function_table { /* Hardware virtual interrupt delivery enable? */ bool virtual_intr_delivery_enabled; - /* Indicate HAP capabilities. */ - unsigned int hap_capabilities; + struct { + /* Indicate HAP capabilities. */ + bool hap_superpage_1gb:1, + hap_superpage_2mb:1; + } caps; /* * Initialise/destroy HVM domain/vcpu resources @@ -XXX,XX +XXX,XX @@ int hvm_get_param(struct domain *d, uint32_t index, uint64_t *value); (hvm_paging_enabled(v) && ((v)->arch.hvm.guest_cr[4] & X86_CR4_PKS)) /* Can we use superpages in the HAP p2m table? */ -#define hap_has_1gb (!!(hvm_funcs.hap_capabilities & HVM_HAP_SUPERPAGE_1GB)) -#define hap_has_2mb (!!(hvm_funcs.hap_capabilities & HVM_HAP_SUPERPAGE_2MB)) +#define hap_has_1gb hvm_funcs.caps.hap_superpage_1gb +#define hap_has_2mb hvm_funcs.caps.hap_superpage_2mb #define hvm_long_mode_active(v) (!!((v)->arch.hvm.guest_efer & EFER_LMA)) -- 2.25.1
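On the "!!" question above: conversion to bool (_Bool) tests against zero rather than truncating, so assigning any non-zero mask to a "bool ...:1" bitfield already stores true; a "!!" would only be needed if the field were a plain "unsigned int ...:1". A minimal standalone sketch illustrating this (not Xen code):

    #include <assert.h>
    #include <stdbool.h>

    struct caps {
        bool superpage_2mb:1;          /* _Bool bitfield: tested against zero */
        unsigned int superpage_1gb:1;  /* plain bitfield: keeps only bit 0 */
    };

    int main(void)
    {
        struct caps c = { 0 };
        unsigned int mask = 0x4;   /* non-zero, but bit 0 is clear */

        c.superpage_2mb = mask;    /* stores 1: any non-zero converts to true */
        c.superpage_1gb = mask;    /* stores 0: truncates to bit 0 */

        assert(c.superpage_2mb && !c.superpage_1gb);
        return 0;
    }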
The "effective type" of the cpu_has_svm_feature macro is effectively an unsigned log with one bit set (or not); at least one place someone felt compelled to do a !! to make sure that they got a boolean out of it. Ideally the whole of this would be folded into the cpufeature.h infrastructure. But for now, duplicate the more type-safe static inlines in that file, and remove the !!. Signed-off-by: George Dunlap <george.dunlap@cloud.com> --- CC: Jan Beulich <jbeulich@suse.com> CC: Andrew Cooper <andrew.cooper3@citrix.com> CC: "Roger Pau Monné" <roger.pau@citrix.com> CC: Wei Liu <wl@xen.org> --- xen/arch/x86/hvm/svm/svm.c | 2 +- xen/arch/x86/include/asm/hvm/svm/svm.h | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/xen/arch/x86/hvm/svm/svm.c b/xen/arch/x86/hvm/svm/svm.c index XXXXXXX..XXXXXXX 100644 --- a/xen/arch/x86/hvm/svm/svm.c +++ b/xen/arch/x86/hvm/svm/svm.c @@ -XXX,XX +XXX,XX @@ const struct hvm_function_table * __init start_svm(void) if ( !printed ) printk(" - none\n"); - svm_function_table.hap_supported = !!cpu_has_svm_npt; + svm_function_table.hap_supported = cpu_has_svm_npt; svm_function_table.caps.hap_superpage_2mb = true; svm_function_table.caps.hap_superpage_1gb = cpu_has_page1gb; diff --git a/xen/arch/x86/include/asm/hvm/svm/svm.h b/xen/arch/x86/include/asm/hvm/svm/svm.h index XXXXXXX..XXXXXXX 100644 --- a/xen/arch/x86/include/asm/hvm/svm/svm.h +++ b/xen/arch/x86/include/asm/hvm/svm/svm.h @@ -XXX,XX +XXX,XX @@ extern u32 svm_feature_flags; #define SVM_FEATURE_SSS 19 /* NPT Supervisor Shadow Stacks */ #define SVM_FEATURE_SPEC_CTRL 20 /* MSR_SPEC_CTRL virtualisation */ -#define cpu_has_svm_feature(f) (svm_feature_flags & (1u << (f))) +static inline bool cpu_has_svm_feature(unsigned int feat) +{ + return svm_feature_flags & (1u << (feat)); +} #define cpu_has_svm_npt cpu_has_svm_feature(SVM_FEATURE_NPT) #define cpu_has_svm_lbrv cpu_has_svm_feature(SVM_FEATURE_LBRV) #define cpu_has_svm_svml cpu_has_svm_feature(SVM_FEATURE_SVML) -- 2.25.1
Moving them all together has several advantages: * Collects them all in one part of the struct * The `caps` field means that we can drop the "_supported" suffix, as it's clear what is meant. Signed-off-by: George Dunlap <george.dunlap@cloud.com> --- CC: Jan Beulich <jbeulich@suse.com> CC: Andrew Cooper <andrew.cooper3@citrix.com> CC: "Roger Pau Monné" <roger.pau@citrix.com> CC: Wei Liu <wl@xen.org> CC: Jun Nakajima <jun.nakajima@intel.com> CC: Kevin Tian <kevin.tian@intel.com> --- xen/arch/x86/hvm/hvm.c | 6 +++--- xen/arch/x86/hvm/svm/svm.c | 2 +- xen/arch/x86/hvm/vlapic.c | 4 ++-- xen/arch/x86/hvm/vmx/vmcs.c | 2 +- xen/arch/x86/hvm/vmx/vmx.c | 8 ++++---- xen/arch/x86/include/asm/hvm/hvm.h | 29 ++++++++++++++--------------- 6 files changed, 25 insertions(+), 26 deletions(-) diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c index XXXXXXX..XXXXXXX 100644 --- a/xen/arch/x86/hvm/hvm.c +++ b/xen/arch/x86/hvm/hvm.c @@ -XXX,XX +XXX,XX @@ static struct notifier_block cpu_nfb = { static bool __init hap_supported(struct hvm_function_table *fns) { - if ( !fns->hap_supported ) + if ( !fns->caps.hap ) { printk("HVM: Hardware Assisted Paging (HAP) not detected\n"); return false; @@ -XXX,XX +XXX,XX @@ static bool __init hap_supported(struct hvm_function_table *fns) if ( !opt_hap_enabled ) { - fns->hap_supported = 0; + fns->caps.hap = 0; printk("HVM: Hardware Assisted Paging (HAP) detected but disabled\n"); return false; } @@ -XXX,XX +XXX,XX @@ static int __init cf_check hvm_enable(void) } if ( !opt_altp2m_enabled ) - hvm_funcs.altp2m_supported = 0; + hvm_funcs.caps.altp2m = 0; if ( opt_hvm_fep ) warning_add(warning_hvm_fep); diff --git a/xen/arch/x86/hvm/svm/svm.c b/xen/arch/x86/hvm/svm/svm.c index XXXXXXX..XXXXXXX 100644 --- a/xen/arch/x86/hvm/svm/svm.c +++ b/xen/arch/x86/hvm/svm/svm.c @@ -XXX,XX +XXX,XX @@ const struct hvm_function_table * __init start_svm(void) if ( !printed ) printk(" - none\n"); - svm_function_table.hap_supported = cpu_has_svm_npt; + svm_function_table.caps.hap = cpu_has_svm_npt; svm_function_table.caps.hap_superpage_2mb = true; svm_function_table.caps.hap_superpage_1gb = cpu_has_page1gb; diff --git a/xen/arch/x86/hvm/vlapic.c b/xen/arch/x86/hvm/vlapic.c index XXXXXXX..XXXXXXX 100644 --- a/xen/arch/x86/hvm/vlapic.c +++ b/xen/arch/x86/hvm/vlapic.c @@ -XXX,XX +XXX,XX @@ int vlapic_has_pending_irq(struct vcpu *v) if ( irr == -1 ) return -1; - if ( hvm_funcs.virtual_intr_delivery_enabled && + if ( hvm_funcs.caps.virtual_intr_delivery && !nestedhvm_vcpu_in_guestmode(v) ) return irr; @@ -XXX,XX +XXX,XX @@ int vlapic_ack_pending_irq(struct vcpu *v, int vector, bool force_ack) int isr; if ( !force_ack && - hvm_funcs.virtual_intr_delivery_enabled ) + hvm_funcs.caps.virtual_intr_delivery ) return 1; /* If there's no chance of using APIC assist then bail now. 
*/ diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c index XXXXXXX..XXXXXXX 100644 --- a/xen/arch/x86/hvm/vmx/vmcs.c +++ b/xen/arch/x86/hvm/vmx/vmcs.c @@ -XXX,XX +XXX,XX @@ static int cf_check parse_ept_param_runtime(const char *s) struct domain *d; int val; - if ( !cpu_has_vmx_ept || !hvm_funcs.hap_supported || + if ( !cpu_has_vmx_ept || !hvm_funcs.caps.hap || !(hvm_funcs.caps.hap_superpage_2mb || hvm_funcs.caps.hap_superpage_1gb) ) { diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c index XXXXXXX..XXXXXXX 100644 --- a/xen/arch/x86/hvm/vmx/vmx.c +++ b/xen/arch/x86/hvm/vmx/vmx.c @@ -XXX,XX +XXX,XX @@ const struct hvm_function_table * __init start_vmx(void) return NULL; } - vmx_function_table.singlestep_supported = cpu_has_monitor_trap_flag; + vmx_function_table.caps.singlestep = cpu_has_monitor_trap_flag; if ( cpu_has_vmx_dt_exiting ) vmx_function_table.set_descriptor_access_exiting = @@ -XXX,XX +XXX,XX @@ const struct hvm_function_table * __init start_vmx(void) printk("VMX: Disabling executable EPT superpages due to CVE-2018-12207\n"); } - vmx_function_table.hap_supported = 1; - vmx_function_table.altp2m_supported = 1; + vmx_function_table.caps.hap = 1; + vmx_function_table.caps.altp2m = 1; vmx_function_table.caps.hap_superpage_2mb = cpu_has_vmx_ept_2mb; vmx_function_table.caps.hap_superpage_1gb = cpu_has_vmx_ept_1gb; @@ -XXX,XX +XXX,XX @@ const struct hvm_function_table * __init start_vmx(void) vmx_function_table.update_eoi_exit_bitmap = vmx_update_eoi_exit_bitmap; vmx_function_table.process_isr = vmx_process_isr; vmx_function_table.handle_eoi = vmx_handle_eoi; - vmx_function_table.virtual_intr_delivery_enabled = true; + vmx_function_table.caps.virtual_intr_delivery = true; } if ( cpu_has_vmx_posted_intr_processing ) diff --git a/xen/arch/x86/include/asm/hvm/hvm.h b/xen/arch/x86/include/asm/hvm/hvm.h index XXXXXXX..XXXXXXX 100644 --- a/xen/arch/x86/include/asm/hvm/hvm.h +++ b/xen/arch/x86/include/asm/hvm/hvm.h @@ -XXX,XX +XXX,XX @@ struct hvm_vcpu_nonreg_state { struct hvm_function_table { const char *name; - /* Support Hardware-Assisted Paging? */ - bool hap_supported; - - /* Necessary hardware support for alternate p2m's? */ - bool altp2m_supported; - bool singlestep_supported; - - /* Hardware virtual interrupt delivery enable? */ - bool virtual_intr_delivery_enabled; - struct { /* Indicate HAP capabilities. */ - bool hap_superpage_1gb:1, - hap_superpage_2mb:1; + bool hap:1, + hap_superpage_1gb:1, + hap_superpage_2mb:1, + + /* Altp2m capabilities */ + altp2m:1, + singlestep:1, + + /* Hardware virtual interrupt delivery enable? */ + virtual_intr_delivery:1; + } caps; /* @@ -XXX,XX +XXX,XX @@ static inline void hvm_enable_msr_interception(struct domain *d, uint32_t msr) static inline bool hvm_is_singlestep_supported(void) { - return hvm_funcs.singlestep_supported; + return hvm_funcs.caps.singlestep; } static inline bool hvm_hap_supported(void) { - return hvm_funcs.hap_supported; + return hvm_funcs.caps.hap; } /* returns true if hardware supports alternate p2m's */ static inline bool hvm_altp2m_supported(void) { - return hvm_funcs.altp2m_supported; + return hvm_funcs.caps.altp2m; } /* updates the current hardware p2m */ -- 2.25.1
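One design consequence of the bitfield packing, noted here for completeness (an observation, not part of the patch): bitfield members have no address, so any future code wanting to pass a capability by pointer must copy it out by value first.

    #include <stdbool.h>

    struct hvm_caps {
        bool hap:1, altp2m:1, singlestep:1;
    };

    static bool query_cap(const struct hvm_caps *caps)
    {
        return caps->altp2m;   /* fine: read by value */
        /* &caps->altp2m would not compile: bitfields are not addressable */
    }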
The primary purpose of TSC scaling, from our perspective, is to maintain the fiction of an "invariant TSC" across migrates between platforms with different clock speeds. On AMD, the TscRateMSR CPUID bit is unconditionally enabled in the "host cpuid", even if the hardware doesn't actually support it. According to c/s fd14a1943c4 ("nestedsvm: Support TSC Rate MSR"), testing showed that emulating TSC scaling in an L1 was more expensive than emulating TSC scaling on an L0 (due to extra sets of vmexit / vmenter). However, the current implementation seems to be broken. First of all, the final L2 scaling ratio should be a composition of the L0 scaling ratio and the L1 scaling ratio; there's no indication this is being done anywhere. Secondly, it's not clear that the L1 tsc scaling ratio actually affects the L0 tsc scaling ratio. The stored value (ns_tscratio) is used to affect the tsc *offset*, but doesn't seem to actually be factored into d->hvm.tsc_scaling_ratio. (Which shouldn't be per-domain anyway, but per-vcpu.) Having the *offset* scaled according to the nested scaling without the actual RDTSC itself also being scaled has got to produce inconsistent results. For now, just disable the functionality entirely until we can implement it properly: - Don't set TSCRATEMSR in the host CPUID policy - Remove MSR_AMD64_TSC_RATIO emulation handling, so that the guest gets a #GP if it tries to access it (as it should when TSCRATEMSR is clear) - Remove ns_tscratio from struct nestedsvm, and all code that touches it Unfortunately this means ripping out the scaling calculation stuff as well, since it's only used in the nested case; it's there in the git tree if we need it for reference when we re-introduce it. Signed-off-by: George Dunlap <george.dunlap@cloud.com> --- CC: Jan Beulich <jbeulich@suse.com> CC: Andrew Cooper <andrew.cooper3@citrix.com> CC: "Roger Pau Monné" <roger.pau@citrix.com> CC: Wei Liu <wl@xen.org> --- xen/arch/x86/cpu-policy.c | 3 +- xen/arch/x86/hvm/svm/nestedsvm.c | 2 - xen/arch/x86/hvm/svm/svm.c | 57 -------------------- xen/arch/x86/include/asm/hvm/svm/nestedsvm.h | 5 -- 4 files changed, 1 insertion(+), 66 deletions(-) diff --git a/xen/arch/x86/cpu-policy.c b/xen/arch/x86/cpu-policy.c index XXXXXXX..XXXXXXX 100644 --- a/xen/arch/x86/cpu-policy.c +++ b/xen/arch/x86/cpu-policy.c @@ -XXX,XX +XXX,XX @@ static void __init calculate_host_policy(void) (1u << SVM_FEATURE_PAUSEFILTER) | (1u << SVM_FEATURE_DECODEASSISTS)); /* Enable features which are always emulated.
*/ - p->extd.raw[0xa].d |= ((1u << SVM_FEATURE_VMCBCLEAN) | - (1u << SVM_FEATURE_TSCRATEMSR)); + p->extd.raw[0xa].d |= (1u << SVM_FEATURE_VMCBCLEAN); } /* 0x000000ce MSR_INTEL_PLATFORM_INFO */ diff --git a/xen/arch/x86/hvm/svm/nestedsvm.c b/xen/arch/x86/hvm/svm/nestedsvm.c index XXXXXXX..XXXXXXX 100644 --- a/xen/arch/x86/hvm/svm/nestedsvm.c +++ b/xen/arch/x86/hvm/svm/nestedsvm.c @@ -XXX,XX +XXX,XX @@ int cf_check nsvm_vcpu_reset(struct vcpu *v) svm->ns_msr_hsavepa = INVALID_PADDR; svm->ns_ovvmcb_pa = INVALID_PADDR; - svm->ns_tscratio = DEFAULT_TSC_RATIO; - svm->ns_cr_intercepts = 0; svm->ns_dr_intercepts = 0; svm->ns_exception_intercepts = 0; diff --git a/xen/arch/x86/hvm/svm/svm.c b/xen/arch/x86/hvm/svm/svm.c index XXXXXXX..XXXXXXX 100644 --- a/xen/arch/x86/hvm/svm/svm.c +++ b/xen/arch/x86/hvm/svm/svm.c @@ -XXX,XX +XXX,XX @@ static int cf_check svm_get_guest_pat(struct vcpu *v, u64 *gpat) return 1; } -static uint64_t scale_tsc(uint64_t host_tsc, uint64_t ratio) -{ - uint64_t mult, frac, scaled_host_tsc; - - if ( ratio == DEFAULT_TSC_RATIO ) - return host_tsc; - - /* - * Suppose the most significant 32 bits of host_tsc and ratio are - * tsc_h and mult, and the least 32 bits of them are tsc_l and frac, - * then - * host_tsc * ratio * 2^-32 - * = host_tsc * (mult * 2^32 + frac) * 2^-32 - * = host_tsc * mult + (tsc_h * 2^32 + tsc_l) * frac * 2^-32 - * = host_tsc * mult + tsc_h * frac + ((tsc_l * frac) >> 32) - * - * Multiplications in the last two terms are between 32-bit integers, - * so both of them can fit in 64-bit integers. - * - * Because mult is usually less than 10 in practice, it's very rare - * that host_tsc * mult can overflow a 64-bit integer. - */ - mult = ratio >> 32; - frac = ratio & ((1ULL << 32) - 1); - scaled_host_tsc = host_tsc * mult; - scaled_host_tsc += (host_tsc >> 32) * frac; - scaled_host_tsc += ((host_tsc & ((1ULL << 32) - 1)) * frac) >> 32; - - return scaled_host_tsc; -} - -static uint64_t svm_get_tsc_offset(uint64_t host_tsc, uint64_t guest_tsc, - uint64_t ratio) -{ - return guest_tsc - scale_tsc(host_tsc, ratio); -} - static void cf_check svm_set_tsc_offset(struct vcpu *v, u64 offset, u64 at_tsc) { struct vmcb_struct *vmcb = v->arch.hvm.svm.vmcb; @@ -XXX,XX +XXX,XX @@ static void cf_check svm_set_tsc_offset(struct vcpu *v, u64 offset, u64 at_tsc) if ( nestedhvm_vcpu_in_guestmode(v) ) { - struct nestedsvm *svm = &vcpu_nestedsvm(v); - n2_tsc_offset = vmcb_get_tsc_offset(n2vmcb) - vmcb_get_tsc_offset(n1vmcb); - if ( svm->ns_tscratio != DEFAULT_TSC_RATIO ) - { - uint64_t guest_tsc = hvm_get_guest_tsc_fixed(v, at_tsc); - - n2_tsc_offset = svm_get_tsc_offset(guest_tsc, - guest_tsc + n2_tsc_offset, - svm->ns_tscratio); - } vmcb_set_tsc_offset(n1vmcb, offset); } @@ -XXX,XX +XXX,XX @@ static int cf_check svm_msr_read_intercept( *msr_content = nsvm->ns_msr_hsavepa; break; - case MSR_AMD64_TSC_RATIO: - *msr_content = nsvm->ns_tscratio; - break; - case MSR_AMD_OSVW_ID_LENGTH: case MSR_AMD_OSVW_STATUS: if ( !d->arch.cpuid->extd.osvw ) @@ -XXX,XX +XXX,XX @@ static int cf_check svm_msr_write_intercept( goto gpf; break; - case MSR_AMD64_TSC_RATIO: - if ( msr_content & TSC_RATIO_RSVD_BITS ) - goto gpf; - nsvm->ns_tscratio = msr_content; - break; - case MSR_IA32_MCx_MISC(4): /* Threshold register */ case MSR_F10_MC4_MISC1 ... 
MSR_F10_MC4_MISC3: /* diff --git a/xen/arch/x86/include/asm/hvm/svm/nestedsvm.h b/xen/arch/x86/include/asm/hvm/svm/nestedsvm.h index XXXXXXX..XXXXXXX 100644 --- a/xen/arch/x86/include/asm/hvm/svm/nestedsvm.h +++ b/xen/arch/x86/include/asm/hvm/svm/nestedsvm.h @@ -XXX,XX +XXX,XX @@ struct nestedsvm { */ uint64_t ns_ovvmcb_pa; - /* virtual tscratio holding the value l1 guest writes to the - * MSR_AMD64_TSC_RATIO MSR. - */ - uint64_t ns_tscratio; - /* Cached real intercepts of the l2 guest */ uint32_t ns_cr_intercepts; uint32_t ns_dr_intercepts; -- 2.25.1
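For reference when this is reimplemented: in the fixed-point ratio format used here (DEFAULT_TSC_RATIO == 1ULL << 32 representing 1.0, as in the removed scale_tsc()), composing the L0 and L1 ratios is a single widening multiply. A hedged sketch of the missing composition step (hypothetical helper, not existing Xen code):

    #include <stdint.h>

    #define DEFAULT_TSC_RATIO (1ULL << 32)   /* 1.0 in fixed point */

    /*
     * (a * 2^-32) * (b * 2^-32) = (a * b) * 2^-64, renormalised to the
     * 2^-32 format by shifting the 128-bit product right by 32.
     */
    static uint64_t tsc_ratio_compose(uint64_t l0_ratio, uint64_t l1_ratio)
    {
        return (uint64_t)(((unsigned __int128)l0_ratio * l1_ratio) >> 32);
    }

E.g. composing 2.0 (2ULL << 32) with 0.5 (1ULL << 31) yields DEFAULT_TSC_RATIO, as expected.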
Changeset ef3e8db8068 ("x86/hvm: Corrections and improvements to unhandled vmexit logging") introduced a printk to the default path of the switch statement in nestedsvm_check_intercepts(), complaining of an unknown exit reason. Unfortunately, the "core" switch statement which is meant to handle all vmexit reasons is in nsvm_vmcb_guest_intercepts_exitcode(); the switch statement in nestedsvm_check_intercepts() is only meant to superimpose on top of that some special-casing for interactions between L1 and L0 vmexits. Remove the printk, and add a comment to prevent future confusion. Signed-off-by: George Dunlap <george.dunlap@cloud.com> --- xen/arch/x86/hvm/svm/nestedsvm.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/xen/arch/x86/hvm/svm/nestedsvm.c b/xen/arch/x86/hvm/svm/nestedsvm.c index XXXXXXX..XXXXXXX 100644 --- a/xen/arch/x86/hvm/svm/nestedsvm.c +++ b/xen/arch/x86/hvm/svm/nestedsvm.c @@ -XXX,XX +XXX,XX @@ nestedsvm_check_intercepts(struct vcpu *v, struct cpu_user_regs *regs, ASSERT(vcpu_nestedhvm(v).nv_vmexit_pending == 0); is_intercepted = nsvm_vmcb_guest_intercepts_exitcode(v, regs, exitcode); + /* + * Handle specific interactions between things the guest and host + * may both want to intercept + */ switch ( exitcode ) { case VMEXIT_INVALID: @@ -XXX,XX +XXX,XX @@ nestedsvm_check_intercepts(struct vcpu *v, struct cpu_user_regs *regs, /* Always let the guest handle VMMCALL/VMCALL */ return NESTEDHVM_VMEXIT_INJECT; default: - gprintk(XENLOG_ERR, "Unexpected nested vmexit: reason %#"PRIx64"\n", - exitcode); break; } -- 2.25.1
In order to make implementation and testing tractable, we will require specific host functionality. Add a nested_virt bit to hvm_funcs.caps, and return an error if a domain is created with nested virt and this bit isn't set. For VMX, start with always enabling it if HAP is present; this shouldn't change current behavior. For SVM, require some basic functionality, adding a document explaining the rationale. NB that only SVM CPUID bits 0-7 have been considered. Bits 10-16 may be considered in a follow-up patch. Signed-off-by: George Dunlap <george.dunlap@cloud.com> --- CC: Andrew Cooper <andrew.cooper3@citrix.com> CC: George Dunlap <george.dunlap@citrix.com> CC: Jan Beulich <jbeulich@suse.com> CC: Julien Grall <julien@xen.org> CC: Stefano Stabellini <sstabellini@kernel.org> CC: Wei Liu <wl@xen.org> CC: "Roger Pau Monné" <roger.pau@citrix.com> CC: Jun Nakajima <jun.nakajima@intel.com> CC: Kevin Tian <kevin.tian@intel.com> --- docs/designs/nested-svm-cpu-features.md | 110 ++++++++++++++++++++++++ xen/arch/x86/domain.c | 6 ++ xen/arch/x86/hvm/svm/nestedhvm.h | 1 + xen/arch/x86/hvm/svm/nestedsvm.c | 14 +++ xen/arch/x86/hvm/svm/svm.c | 2 + xen/arch/x86/hvm/vmx/vmx.c | 3 + xen/arch/x86/include/asm/hvm/hvm.h | 11 ++- 7 files changed, 146 insertions(+), 1 deletion(-) diff --git a/docs/designs/nested-svm-cpu-features.md b/docs/designs/nested-svm-cpu-features.md new file mode 100644 index XXXXXXX..XXXXXXX --- /dev/null +++ b/docs/designs/nested-svm-cpu-features.md @@ -XXX,XX +XXX,XX @@ +# Nested SVM (AMD) CPUID requirements + +The first step in making nested SVM production-ready is to make sure +that all features are implemented and well-tested. To make this +tractable, we will initially be limiting the "supported" range of +nested virt to a specific subset of host and guest features. This +document describes the criteria for deciding on features, and the +rationale behind each feature. + +For AMD, all virtualization-related features can be found in CPUID +leaf 8000000A:edx. + +# Criteria + +- Processor support: At a minimum we want to support processors from + the last 5 years. All things being equal, older processors are + better. Bits 0:7 were available in the very earliest processors; + and even through bit 15 we should be pretty good support-wise. + +- Faithfulness to hardware: We need the behavior of the "virtual cpu" + from the L1 hypervisor's perspective to be as close as possible to + the original hardware. In particular, the behavior of the hardware + on error paths 1) is not easy to understand or test, 2) can be the + source of surprising vulnerabilities. (See XSA-7 for an example of a + case where subtle error-handling differences can open up a privilege + escalation.) We should avoid emulating any bit of the hardware with + complex error paths if we can at all help it. + +- Cost of implementation: We want to minimize the cost of + implementation (where this includes bringing an existing sub-par + implementation up to speed). All things being equal, we'll favor a + configuration which does not require any new implementation. + +- Performance: All things being equal, we'd prefer to choose a set of + L0 / L1 CPUID bits that is faster rather than slower. + + +# Bits + +- 0 `NP` *Nested Paging*: Required both for L0 and L1. + + Based primarily on faithfulness and performance, as well as + potential cost of implementation. Available on earliest hardware, + so no compatibility issues. + +- 1 `LbrVirt` *LBR / debugging virtualization*: Require for L0 and L1.
+ + For L0 this is required for performance: There's no way to tell the + guests not to use the LBR-related registers; and if the guest does, + then you have to save and restore all LBR-related registers on + context switch, which is prohibitive. Furthermore, the additional + emulation risks introducing a security-relevant difference. + + Providing it to L1 when we have it in L0 is basically free, and + already implemented. + + Just require it and provide it. + +- 2 `SVML` *SVM Lock*: Not required for L0, not provided to L1 + + Seems to be about enabling an operating system to prevent "blue + pill" attacks against itself. + + Xen doesn't use it, nor provide it; so it would need to be + implemented. The best way to protect a guest OS is to leave nested + virt disabled in the tools. + +- 3 `NRIPS` *NRIP Save*: Require for both L0 and L1 + + If NRIPS is not present, the software interrupt injection + functionality can't be used; and Xen has to emulate it. That's + another source of potential security issues. If hardware supports + it, then providing it to the guest is basically free. + +- 4 `TscRateMsr`: Not required by L0, not provided to L1 + + The main putative use for this would be trying to maintain an + invariant TSC across cores with different clock speeds, or after a + migrate. Unlike others, this doesn't have an error path to worry + about compatibility-wise; and according to tests done when nestedSVM + was first implemented, it's actually faster to emulate TscRateMSR in + the L0 hypervisor than for L1 to attempt to emulate it itself. + + However, using this properly in L0 will take some implementation + effort; and composing it properly with L1 will take even more + effort. Just leave it off for now. + +- 5 `VmcbClean` *VMCB Clean Bits*: Not required by L0, provide to L1 + + This is a pure optimization, both on the side of the L0 and L1. The + implementation for L1 is entirely Xen-side, so can be provided even + on hardware that doesn't provide it. And it's purely an + optimization, so could be "implemented" by ignoring the bits + entirely. + + As such, we don't need to require it for L0; and as it's already + implemented, no reason not to provide it to L1. Before this feature + was available those bits were marked SBZ ("should be zero"); setting + them was already advertised to cause unpredictable behavior. + +- 6 `FlushByAsid`: Require for L0, provide to L1 + + This is cheap and easy to use for L0 and to provide to the L1; + there's no reason not to just pass it through. + +- 7 `DecodeAssists`: Require for L0, provide to L1 + + Using it in L0 reduces the chance that we'll make some sort of error + in the decode path. And if hardware supports it, it's easy enough + to provide to the L1.
diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c index XXXXXXX..XXXXXXX 100644 --- a/xen/arch/x86/domain.c +++ b/xen/arch/x86/domain.c @@ -XXX,XX +XXX,XX @@ int arch_sanitise_domain_config(struct xen_domctl_createdomain *config) */ config->flags |= XEN_DOMCTL_CDF_oos_off; + if ( nested_virt && !hvm_nested_virt_supported() ) + { + dprintk(XENLOG_INFO, "Nested virt requested but not available\n"); + return -EINVAL; + } + if ( nested_virt && !hap ) { dprintk(XENLOG_INFO, "Nested virt not supported without HAP\n"); diff --git a/xen/arch/x86/hvm/svm/nestedhvm.h b/xen/arch/x86/hvm/svm/nestedhvm.h index XXXXXXX..XXXXXXX 100644 --- a/xen/arch/x86/hvm/svm/nestedhvm.h +++ b/xen/arch/x86/hvm/svm/nestedhvm.h @@ -XXX,XX +XXX,XX @@ enum nestedhvm_vmexits nestedsvm_check_intercepts(struct vcpu *v, struct cpu_user_regs *regs, uint64_t exitcode); void svm_nested_features_on_efer_update(struct vcpu *v); +void __init start_nested_svm(struct hvm_function_table *svm_function_table); /* Interface methods */ void cf_check nsvm_vcpu_destroy(struct vcpu *v); diff --git a/xen/arch/x86/hvm/svm/nestedsvm.c b/xen/arch/x86/hvm/svm/nestedsvm.c index XXXXXXX..XXXXXXX 100644 --- a/xen/arch/x86/hvm/svm/nestedsvm.c +++ b/xen/arch/x86/hvm/svm/nestedsvm.c @@ -XXX,XX +XXX,XX @@ void svm_nested_features_on_efer_update(struct vcpu *v) } } } + +void __init start_nested_svm(struct hvm_function_table *svm_function_table) +{ + /* + * Required host functionality to support nested virt. See + * docs/designs/nested-svm-cpu-features.md for rationale. + */ + svm_function_table->caps.nested_virt = + cpu_has_svm_npt && + cpu_has_svm_lbrv && + cpu_has_svm_nrips && + cpu_has_svm_flushbyasid && + cpu_has_svm_decode; +} diff --git a/xen/arch/x86/hvm/svm/svm.c b/xen/arch/x86/hvm/svm/svm.c index XXXXXXX..XXXXXXX 100644 --- a/xen/arch/x86/hvm/svm/svm.c +++ b/xen/arch/x86/hvm/svm/svm.c @@ -XXX,XX +XXX,XX @@ const struct hvm_function_table * __init start_svm(void) svm_function_table.caps.hap_superpage_2mb = true; svm_function_table.caps.hap_superpage_1gb = cpu_has_page1gb; + start_nested_svm(&svm_function_table); + return &svm_function_table; } diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c index XXXXXXX..XXXXXXX 100644 --- a/xen/arch/x86/hvm/vmx/vmx.c +++ b/xen/arch/x86/hvm/vmx/vmx.c @@ -XXX,XX +XXX,XX @@ const struct hvm_function_table * __init start_vmx(void) if ( cpu_has_vmx_tsc_scaling ) vmx_function_table.tsc_scaling.ratio_frac_bits = 48; + /* TODO: Require hardware support before enabling nested virt */ + vmx_function_table.caps.nested_virt = vmx_function_table.caps.hap; + model_specific_lbr = get_model_specific_lbr(); lbr_tsx_fixup_check(); ler_to_fixup_check(); diff --git a/xen/arch/x86/include/asm/hvm/hvm.h b/xen/arch/x86/include/asm/hvm/hvm.h index XXXXXXX..XXXXXXX 100644 --- a/xen/arch/x86/include/asm/hvm/hvm.h +++ b/xen/arch/x86/include/asm/hvm/hvm.h @@ -XXX,XX +XXX,XX @@ struct hvm_function_table { singlestep:1, /* Hardware virtual interrupt delivery enable? */ - virtual_intr_delivery:1; + virtual_intr_delivery:1, + + /* Nested virt capabilities */ + nested_virt:1; } caps; @@ -XXX,XX +XXX,XX @@ static inline bool hvm_altp2m_supported(void) return hvm_funcs.caps.altp2m; } +/* Returns true if we have the minimum hardware requirements for nested virt */ +static inline bool hvm_nested_virt_supported(void) +{ + return hvm_funcs.caps.nested_virt; +} + /* updates the current hardware p2m */ static inline void altp2m_vcpu_update_p2m(struct vcpu *v) { -- 2.25.1
This is my work-in-progress series for getting nested virt working again on AMD. The first patch is an early draft to integrate the SVM bits into the CPUID framework. It will be partially superseded by a series Andrew has posted but which has not yet been checked in. The second patch is a workaround which hides the PCID bit from the L1 when nested HVM is enabled. Long-term, nested support for PCID will be necessary for performance. Most of the rest of the patches involve improving tracing: either fixing what's bitrotted in xenalyze, what's bitrotted specifically on the SVM side, or what could be improved in the upstream tracing. The last patch is a placeholder for future work, commenting a place where behavior is wrong and another where more work needs to be done to assure safety. George Dunlap (14): x86/cpuid-policy: Add AMD SVM CPUID leaf to featureset x86/cpu-policy: HACK Disable PCID when nested virt is enabled xenalyze: Basic nested virt processing xenalyze: Track generic event information when not in summary mode xenalyze: Ignore vmexits where an HVM_HANDLER trace would be redundant xen/svm: Remove redundant HVM_HANDLER trace for EXCEPTION_AC xen/hvm: Don't skip MSR_READ trace record svm: Do NPF trace before calling hvm_hap_nested_page_fault x86/emulate: Don't trace cr reads during emulation xenalyze: Quiet warnings about VMEXIT_IOIO x86/trace: Add trace to xsetbv svm/vmx handler path xenalyze: Basic processing for XSETBV exits and handlers x86/svm: Add a trace for VMEXIT_VMRUN x86/nestedsvm: Note some places for improvement tools/libs/light/libxl_cpuid.c | 1 + tools/misc/xen-cpuid.c | 1 + tools/xentrace/xenalyze.c | 133 +++++++++++++++----- xen/arch/x86/cpu-policy.c | 24 ++-- xen/arch/x86/cpu/common.c | 2 + xen/arch/x86/hvm/emulate.c | 1 - xen/arch/x86/hvm/hvm.c | 4 +- xen/arch/x86/hvm/svm/nestedsvm.c | 13 ++ xen/arch/x86/hvm/svm/svm.c | 7 +- xen/include/public/arch-x86/cpufeatureset.h | 16 +++ xen/include/public/trace.h | 1 + xen/include/xen/lib/x86/cpu-policy.h | 10 +- xen/lib/x86/cpuid.c | 4 +- 13 files changed, 167 insertions(+), 50 deletions(-) -- 2.25.1
NOTE: This patch will be partially superseded by Andrew Cooper's "x86: AMD CPUID handling improvements" series. Currently, the CPUID leaf for SVM features (extd 0xa.edx) is manually twiddled: - hvm_max_policy takes host_policy and clamps it to supported features (with some features unilaterally enabled because they're always emulated) - hvm_default_policy is copied from there - When recalculate_policy() is called for a guest, if SVM is clear, then the entire leaf is zeroed out. Move to a mode where the extended features are off by default, and enabled when nested_virt is enabled. In cpufeatureset.h, define a new featureset word for the AMD SVM features, and declare all of the bits defined in x86/include/asm/hvm/svm/svm.h. Mark the ones we currently pass through to the "max policy" as HAP-only and optional. In cpu-policy.h, define FEATURESET_ead, and convert the un-named space in struct cpu_policy into the appropriate union. FIXME: Do this in a prerequisite patch, and change all references to p->extd.raw[0xa]. Update x86_cpu_X_to_Y and Y_to_X to copy this into and out of the appropriate leaf. Populate this during boot in generic_identify(). Add the new featureset definition into libxl_cpuid.c. Update the code in calculate_hvm_max_policy() to do nothing with the "normal" CPUID bits, and use the feature bit to unconditionally enable VMCBCLEAN. FIXME Move this to a follow-up patch. In recalculate_cpuid_policy(), enable max_fs when nested_hvm() is true. Signed-off-by: George Dunlap <george.dunlap@cloud.com> --- CC: Andrew Cooper <andrew.cooper3@citrix.com> CC: Jan Beulich <jbeulich@suse.com> CC: Roger Pau Monne <roger.pau@cloud.com> --- tools/libs/light/libxl_cpuid.c | 1 + tools/misc/xen-cpuid.c | 1 + xen/arch/x86/cpu-policy.c | 23 ++++++++++----------- xen/arch/x86/cpu/common.c | 2 ++ xen/include/public/arch-x86/cpufeatureset.h | 16 ++++++++++++++ xen/include/xen/lib/x86/cpu-policy.h | 10 ++++++++- xen/lib/x86/cpuid.c | 4 +++- 7 files changed, 43 insertions(+), 14 deletions(-) diff --git a/tools/libs/light/libxl_cpuid.c b/tools/libs/light/libxl_cpuid.c index XXXXXXX..XXXXXXX 100644 --- a/tools/libs/light/libxl_cpuid.c +++ b/tools/libs/light/libxl_cpuid.c @@ -XXX,XX +XXX,XX @@ int libxl_cpuid_parse_config(libxl_cpuid_policy_list *policy, const char* str) CPUID_ENTRY(0x00000007, 1, CPUID_REG_EDX), MSR_ENTRY(0x10a, CPUID_REG_EAX), MSR_ENTRY(0x10a, CPUID_REG_EDX), + CPUID_ENTRY(0x8000000a, NA, CPUID_REG_EDX), #undef MSR_ENTRY #undef CPUID_ENTRY }; diff --git a/tools/misc/xen-cpuid.c b/tools/misc/xen-cpuid.c index XXXXXXX..XXXXXXX 100644 --- a/tools/misc/xen-cpuid.c +++ b/tools/misc/xen-cpuid.c @@ -XXX,XX +XXX,XX @@ static const struct { { "CPUID 0x00000007:1.edx", "7d1" }, { "MSR_ARCH_CAPS.lo", "m10Al" }, { "MSR_ARCH_CAPS.hi", "m10Ah" }, + { "CPUID 0x8000000a.edx", "ead" }, }; #define COL_ALIGN "24" diff --git a/xen/arch/x86/cpu-policy.c b/xen/arch/x86/cpu-policy.c index XXXXXXX..XXXXXXX 100644 --- a/xen/arch/x86/cpu-policy.c +++ b/xen/arch/x86/cpu-policy.c @@ -XXX,XX +XXX,XX @@ static void __init calculate_hvm_max_policy(void) if ( !cpu_has_vmx ) __clear_bit(X86_FEATURE_PKS, fs); - /* + /* * Make adjustments to possible (nested) virtualization features exposed * to the guest */ if ( p->extd.svm ) { - /* Clamp to implemented features which require hardware support. */ - p->extd.raw[0xa].d &= ((1u << SVM_FEATURE_NPT) | - (1u << SVM_FEATURE_LBRV) | - (1u << SVM_FEATURE_NRIPS) | - (1u << SVM_FEATURE_PAUSEFILTER) | - (1u << SVM_FEATURE_DECODEASSISTS)); /* Enable features which are always emulated.
*/ - p->extd.raw[0xa].d |= (1u << SVM_FEATURE_VMCBCLEAN); + __set_bit(X86_FEATURE_VMCBCLEAN, fs); } - + guest_common_max_feature_adjustments(fs); guest_common_feature_adjustments(fs); @@ -XXX,XX +XXX,XX @@ void recalculate_cpuid_policy(struct domain *d) __clear_bit(X86_FEATURE_VMX, max_fs); __clear_bit(X86_FEATURE_SVM, max_fs); } + else + { + /* + * Enable SVM features. This will be empty on VMX + * hosts. + */ + fs[FEATURESET_ead] = max_fs[FEATURESET_ead]; + } } /* @@ -XXX,XX +XXX,XX @@ void recalculate_cpuid_policy(struct domain *d) ((vpmu_mode & XENPMU_MODE_ALL) && !is_hardware_domain(d)) ) p->basic.raw[0xa] = EMPTY_LEAF; - if ( !p->extd.svm ) - p->extd.raw[0xa] = EMPTY_LEAF; - if ( !p->extd.page1gb ) p->extd.raw[0x19] = EMPTY_LEAF; } diff --git a/xen/arch/x86/cpu/common.c b/xen/arch/x86/cpu/common.c index XXXXXXX..XXXXXXX 100644 --- a/xen/arch/x86/cpu/common.c +++ b/xen/arch/x86/cpu/common.c @@ -XXX,XX +XXX,XX @@ static void generic_identify(struct cpuinfo_x86 *c) c->x86_capability[FEATURESET_e7d] = cpuid_edx(0x80000007); if (c->extended_cpuid_level >= 0x80000008) c->x86_capability[FEATURESET_e8b] = cpuid_ebx(0x80000008); + if (c->extended_cpuid_level >= 0x8000000a) + c->x86_capability[FEATURESET_ead] = cpuid_edx(0x8000000a); if (c->extended_cpuid_level >= 0x80000021) c->x86_capability[FEATURESET_e21a] = cpuid_eax(0x80000021); diff --git a/xen/include/public/arch-x86/cpufeatureset.h b/xen/include/public/arch-x86/cpufeatureset.h index XXXXXXX..XXXXXXX 100644 --- a/xen/include/public/arch-x86/cpufeatureset.h +++ b/xen/include/public/arch-x86/cpufeatureset.h @@ -XXX,XX +XXX,XX @@ XEN_CPUFEATURE(RFDS_CLEAR, 16*32+28) /*!A| Register File(s) cleared by V /* Intel-defined CPU features, MSR_ARCH_CAPS 0x10a.edx, word 17 */ +/* AMD-defined CPU features, CPUID level 0x8000000a.edx, word 18 */ +XEN_CPUFEATURE(NPT, 18*32+ 0) /*h Nested page table support */ +XEN_CPUFEATURE(LBRV, 18*32+ 1) /*h LBR virtualization support */ +XEN_CPUFEATURE(SVML, 18*32+ 2) /* SVM locking MSR support */ +XEN_CPUFEATURE(NRIPS, 18*32+ 3) /*h Next RIP save on VMEXIT support */ +XEN_CPUFEATURE(TSCRATEMSR, 18*32+ 4) /* TSC ratio MSR support */ +XEN_CPUFEATURE(VMCBCLEAN, 18*32+ 5) /*h VMCB clean bits support */ +XEN_CPUFEATURE(FLUSHBYASID, 18*32+ 6) /* TLB flush by ASID support */ +XEN_CPUFEATURE(DECODEASSISTS, 18*32+ 7) /*h Decode assists support */ +XEN_CPUFEATURE(PAUSEFILTER, 18*32+10) /*h Pause intercept filter support */ +XEN_CPUFEATURE(PAUSETHRESH, 18*32+12) /* Pause intercept filter threshold */ +XEN_CPUFEATURE(VLOADSAVE, 18*32+15) /* virtual vmload/vmsave */ +XEN_CPUFEATURE(VGIF, 18*32+16) /* Virtual GIF */ +XEN_CPUFEATURE(SSS, 18*32+19) /* NPT Supervisor Shadow Stacks */ +XEN_CPUFEATURE(SPEC_CTRL, 18*32+20) /* MSR_SPEC_CTRL virtualisation */ + #endif /* XEN_CPUFEATURE */ /* Clean up from a default include. Close the enum (for C). */ diff --git a/xen/include/xen/lib/x86/cpu-policy.h b/xen/include/xen/lib/x86/cpu-policy.h index XXXXXXX..XXXXXXX 100644 --- a/xen/include/xen/lib/x86/cpu-policy.h +++ b/xen/include/xen/lib/x86/cpu-policy.h @@ -XXX,XX +XXX,XX @@ #define FEATURESET_7d1 15 /* 0x00000007:1.edx */ #define FEATURESET_m10Al 16 /* 0x0000010a.eax */ #define FEATURESET_m10Ah 17 /* 0x0000010a.edx */ +#define FEATURESET_ead 18 /* 0x8000000a.edx */ struct cpuid_leaf { @@ -XXX,XX +XXX,XX @@ struct cpu_policy uint32_t /* d */:32; uint64_t :64, :64; /* Leaf 0x80000009. */ - uint64_t :64, :64; /* Leaf 0x8000000a - SVM rev and features. */ + + /* Leaf 0x8000000a - SVM rev and features. 
*/ + uint64_t /* a, b */:64, /* c */:32; + union { + uint32_t ead; + struct { DECL_BITFIELD(ead); }; + }; + uint64_t :64, :64; /* Leaf 0x8000000b. */ uint64_t :64, :64; /* Leaf 0x8000000c. */ uint64_t :64, :64; /* Leaf 0x8000000d. */ diff --git a/xen/lib/x86/cpuid.c b/xen/lib/x86/cpuid.c index XXXXXXX..XXXXXXX 100644 --- a/xen/lib/x86/cpuid.c +++ b/xen/lib/x86/cpuid.c @@ -XXX,XX +XXX,XX @@ void x86_cpu_policy_to_featureset( fs[FEATURESET_7d1] = p->feat._7d1; fs[FEATURESET_m10Al] = p->arch_caps.lo; fs[FEATURESET_m10Ah] = p->arch_caps.hi; -} + fs[FEATURESET_ead] = p->extd.ead; + } void x86_cpu_featureset_to_policy( const uint32_t fs[FEATURESET_NR_ENTRIES], struct cpu_policy *p) @@ -XXX,XX +XXX,XX @@ void x86_cpu_featureset_to_policy( p->feat._7d1 = fs[FEATURESET_7d1]; p->arch_caps.lo = fs[FEATURESET_m10Al]; p->arch_caps.hi = fs[FEATURESET_m10Ah]; + p->extd.ead = fs[FEATURESET_ead]; } void x86_cpu_policy_recalc_synth(struct cpu_policy *p) -- 2.25.1
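For background on the indexing this patch relies on: each 32-bit CPUID/MSR word is one entry in the featureset array, and a feature's global bit number is word * 32 + bit, which is exactly what the XEN_CPUFEATURE(NPT, 18*32+ 0) definitions above encode. A simplified sketch of the lookup (illustrative, not the exact Xen helpers):

    #include <stdbool.h>
    #include <stdint.h>

    #define FEATURESET_ead  18                /* CPUID 0x8000000a.edx */
    #define X86_FEATURE_NPT (18 * 32 + 0)     /* bit 0 of word 18 */

    static bool fs_test_bit(const uint32_t fs[], unsigned int feature)
    {
        return fs[feature / 32] & (1u << (feature % 32));
    }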
The non-nested HVM code knows how to provide PCID functionality (non-zero values in the lower 12 bits of CR3 when running in 64-bit mode), but the nested code doesn't. If the L2 decides to use the PCID functionality, the L0 will fail the next L1 VMENTRY. Long term we definitely want to enable this feature, but for now, just hide it from guests when nested HVM is enabled. Signed-off-by: George Dunlap <george.dunlap@cloud.com> --- xen/arch/x86/cpu-policy.c | 1 + 1 file changed, 1 insertion(+) diff --git a/xen/arch/x86/cpu-policy.c b/xen/arch/x86/cpu-policy.c index XXXXXXX..XXXXXXX 100644 --- a/xen/arch/x86/cpu-policy.c +++ b/xen/arch/x86/cpu-policy.c @@ -XXX,XX +XXX,XX @@ void recalculate_cpuid_policy(struct domain *d) * hosts. */ fs[FEATURESET_ead] = max_fs[FEATURESET_ead]; + __clear_bit(X86_FEATURE_PCID, max_fs); } } -- 2.25.1
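For context: when CR4.PCIDE is set, the PCID occupies the low 12 bits of CR3, bits which are otherwise flag/reserved bits, and an unexpected non-zero value there is presumably what trips the L0's VMENTRY consistency checks. A minimal sketch of the decomposition (macro and function names here are illustrative):

    #include <stdint.h>

    #define CR3_PCID_MASK 0xfffULL   /* bits 11:0, when CR4.PCIDE = 1 */

    static uint64_t cr3_pcid(uint64_t cr3)
    {
        return cr3 & CR3_PCID_MASK;
    }

    static uint64_t cr3_page_table_base(uint64_t cr3)
    {
        /* Bit 63 is the MOV-to-CR3 "no flush" hint, not part of the base. */
        return cr3 & ~CR3_PCID_MASK & ~(1ULL << 63);
    }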
On SVM, VMEXITs that occur while an L1 is in non-root mode ("nested exits") are tagged with the TRC_HVM_NESTEDFLAG flag. Expand xenalyze to do basic handling of these records: - Refactor hvm_process to filter out both TRC_HVM_NESTEDFLAG and TRC_64_FLAG when deciding how to process - Create separate "trace volume" sub-categories for them (HVM_VOL_*), tweaking layout so things continue to align - Visually distinguish nested from non-nested vmexits and vmentries in dump mode At the moment, nested VMEXITs which are passed through to the L1 hypervisor won't have any HVM handlers; note in hvm_data when a vmexit was nested, and don't warn when no handlers occur. While here: - Expand one of the error messages with a bit more information - Replace repeated `ri->event & TRC_64_FLAG` instances with `flag64` - Remove a stray whitespace at the end of a line - Add a missing newline to a warning statement Signed-off-by: George Dunlap <george.dunlap@cloud.com> --- tools/xentrace/xenalyze.c | 66 ++++++++++++++++++++++++++------------- 1 file changed, 45 insertions(+), 21 deletions(-) diff --git a/tools/xentrace/xenalyze.c b/tools/xentrace/xenalyze.c index XXXXXXX..XXXXXXX 100644 --- a/tools/xentrace/xenalyze.c +++ b/tools/xentrace/xenalyze.c @@ -XXX,XX +XXX,XX @@ const char * hvm_event_handler_name[HVM_EVENT_HANDLER_MAX] = { enum { HVM_VOL_VMENTRY, + HVM_VOL_VMENTRY_NESTED, HVM_VOL_VMEXIT, + HVM_VOL_VMEXIT_NESTED, HVM_VOL_HANDLER, HVM_VOL_EMUL, HVM_VOL_MAX @@ -XXX,XX +XXX,XX @@ const char *guest_interrupt_case_name[] = { const char *hvm_vol_name[HVM_VOL_MAX] = { [HVM_VOL_VMENTRY]="vmentry", [HVM_VOL_VMEXIT] ="vmexit", + [HVM_VOL_VMENTRY_NESTED]="vmentry(n)", + [HVM_VOL_VMEXIT_NESTED] ="vmexit(n)", [HVM_VOL_HANDLER]="handler", [HVM_VOL_EMUL]="emul", }; @@ -XXX,XX +XXX,XX @@ struct hvm_data { tsc_t exit_tsc, arc_cycles, entry_tsc; unsigned long long rip; unsigned exit_reason, event_handler; - unsigned int short_summary_done:1, prealloc_unpin:1, wrmap_bf:1; + bool nested_vmexit, short_summary_done, prealloc_unpin, wrmap_bf; /* Immediate processing */ void *d; @@ -XXX,XX +XXX,XX @@ void volume_summary(struct trace_volume *vol) case TOPLEVEL_HVM: for(k=0; k<HVM_VOL_MAX; k++) { if(vol->hvm[k]) - printf(" +-%-7s: %10lld\n", + printf(" +-%-10s: %10lld\n", hvm_vol_name[k], vol->hvm[k]); } @@ -XXX,XX +XXX,XX @@ void hvm_generic_postprocess(struct hvm_data *h) /* Some exits we don't expect a handler; just return */ if(opt.svm_mode) { + /* Nested vmexits passed on to L1's have no handlers for now */ + if(h->nested_vmexit) + return; switch(h->exit_reason) { case VMEXIT_VINTR: /* Equivalent of PENDING_VIRT_INTR */ @@ -XXX,XX +XXX,XX @@ void hvm_generic_postprocess(struct hvm_data *h) if ( !warned[h->exit_reason] ) { /* If we aren't a known exception, warn and log results */ - fprintf(warn, "%s: Strange, exit %x(%s) missing a handler\n", - __func__, h->exit_reason, + fprintf(warn, "%s: d%dv%d Strange, exit %x(%s) missing a handler\n", + __func__, + h->v->d->did, h->v->vid, + h->exit_reason, (h->exit_reason > h->exit_reason_max) ?
"[clipped]" : h->exit_reason_name[h->exit_reason]); @@ -XXX,XX +XXX,XX @@ static inline void runstate_update(struct vcpu_data *v, int new_runstate, void hvm_vmexit_process(struct record_info *ri, struct hvm_data *h, struct vcpu_data *v) { + bool flag64 = ri->event & TRC_64_FLAG; + bool nested = ri->event & TRC_HVM_NESTEDFLAG; + struct { union { struct { @@ -XXX,XX +XXX,XX @@ void hvm_vmexit_process(struct record_info *ri, struct hvm_data *h, }; } *r; - if ( ri->event & TRC_64_FLAG ) + if(flag64) { if (check_extra_words(ri, sizeof(r->x64), "vmexit")) return; @@ -XXX,XX +XXX,XX @@ void hvm_vmexit_process(struct record_info *ri, struct hvm_data *h, set_hvm_exit_reason_data(h, ri->event); h->vmexit_valid=1; + h->nested_vmexit=nested; bzero(&h->inflight, sizeof(h->inflight)); - if(ri->event & TRC_64_FLAG) { + if(flag64) { if(v->guest_paging_levels != 4) { if ( verbosity >= 6 ) @@ -XXX,XX +XXX,XX @@ void hvm_vmexit_process(struct record_info *ri, struct hvm_data *h, if(opt.dump_all) { if ( h->exit_reason < h->exit_reason_max && h->exit_reason_name[h->exit_reason] != NULL) - printf("]%s vmexit exit_reason %s eip %llx%s\n", + printf("]%s vmexit%s exit_reason %s eip %llx%s\n", ri->dump_header, + nested ? "(n)" : "", h->exit_reason_name[h->exit_reason], h->rip, find_symbol(h->rip)); else - printf("]%s vmexit exit_reason %x eip %llx%s\n", + printf("]%s vmexit%s exit_reason %x eip %llx%s\n", ri->dump_header, + nested ? "(n)" : "", h->exit_reason, h->rip, find_symbol(h->rip)); @@ -XXX,XX +XXX,XX @@ void hvm_close_vmexit(struct hvm_data *h, tsc_t tsc) { } void hvm_vmentry_process(struct record_info *ri, struct hvm_data *h) { + bool nested = ri->event & TRC_HVM_NESTEDFLAG; if(!h->init) { if(opt.dump_all) - printf("!%s vmentry\n", - ri->dump_header); + printf("!%s vmentry%s\n", + ri->dump_header, + nested ? "(n)" : ""); return; } @@ -XXX,XX +XXX,XX @@ void hvm_vmentry_process(struct record_info *ri, struct hvm_data *h) { if(!h->vmexit_valid) { if(opt.dump_all) - printf("!%s vmentry\n", - ri->dump_header); + printf("!%s vmentry%s\n", + ri->dump_header, + nested ? "(n)" : ""); return; } if(opt.dump_all) { unsigned long long arc_cycles = ri->tsc - h->exit_tsc; - printf("]%s vmentry cycles %lld %s\n", - ri->dump_header, arc_cycles, (arc_cycles>10000)?"!":""); + printf("]%s vmentry%s cycles %lld %s\n", + ri->dump_header, + nested ? 
"(n)" : "", + arc_cycles, (arc_cycles>10000)?"!":""); } hvm_close_vmexit(h, ri->tsc); @@ -XXX,XX +XXX,XX @@ void hvm_process(struct pcpu_info *p) /* FIXME: Collect analysis on this */ break; default: - switch(ri->event) { + switch(ri->event & ~(TRC_HVM_NESTEDFLAG|TRC_64_FLAG)) { case TRC_HVM_VMX_EXIT: - case TRC_HVM_VMX_EXIT64: case TRC_HVM_SVM_EXIT: - case TRC_HVM_SVM_EXIT64: - UPDATE_VOLUME(p, hvm[HVM_VOL_VMEXIT], ri->size); + if (ri->event & TRC_HVM_NESTEDFLAG) + UPDATE_VOLUME(p, hvm[HVM_VOL_VMEXIT_NESTED], ri->size); + else + UPDATE_VOLUME(p, hvm[HVM_VOL_VMEXIT], ri->size); hvm_vmexit_process(ri, h, v); break; case TRC_HVM_VMENTRY: - UPDATE_VOLUME(p, hvm[HVM_VOL_VMENTRY], ri->size); + if (ri->event & TRC_HVM_NESTEDFLAG) + UPDATE_VOLUME(p, hvm[HVM_VOL_VMENTRY_NESTED], ri->size); + else + UPDATE_VOLUME(p, hvm[HVM_VOL_VMENTRY], ri->size); hvm_vmentry_process(ri, &p->current->hvm); break; default: @@ -XXX,XX +XXX,XX @@ void vcpu_start(struct pcpu_info *p, struct vcpu_data *v, /* Change default domain to 'queued' */ runstate_update(p->current, RUNSTATE_QUEUED, p->first_tsc); - /* + /* * Set current to NULL, so that if another vcpu (not in INIT) * is scheduled here, we don't trip over the check in * vcpu_next_update() @@ -XXX,XX +XXX,XX @@ void vcpu_start(struct pcpu_info *p, struct vcpu_data *v, return; } else if ( v->delayed_init ) { - fprintf(warn, "d%dv%d RUNSTATE_RUNNING detected, leaving INIT", + fprintf(warn, "d%dv%d RUNSTATE_RUNNING detected, leaving INIT\n", v->d->did, v->vid); v->delayed_init = 0; } -- 2.25.1
Generally speaking, a VMEXIT/VMENTRY cycle should have at least three trace records: the VMEXIT trace (which contains the processor-specific exit code), a more generic Xen-defined event (an HVM_HANDLER trace record), and a VMENTRY trace; and any given VMEXIT exit reason should only have a single HVM_HANDLER trace type. Having duplicate or missing HVM_HANDLER traces is generally indicative of a problem that's crept into the hypervisor tracing scheme. This property is checked in hvm_generic_postprocess(), and violations are flagged with a warning. In order to do this, when an HVM trace record that doesn't have a specific post-processor happens, information about the HVM trace record is stored in hvm_data->inflight.generic. Unfortunately, while the check was being done in all "modes", the relevant information was only being copied into inflight.generic in summary mode. This resulted in spurious warnings about missing HVM_HANDLER traces when running in dump mode. Since running in dump mode is often critical to understanding how the warnings came about, just collect the information in all modes. That said, the data from the trace doesn't appear to be used by anyone; so to save some time, don't bother copying it. Signed-off-by: George Dunlap <george.dunlap@cloud.com> --- tools/xentrace/xenalyze.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/tools/xentrace/xenalyze.c b/tools/xentrace/xenalyze.c index XXXXXXX..XXXXXXX 100644 --- a/tools/xentrace/xenalyze.c +++ b/tools/xentrace/xenalyze.c @@ -XXX,XX +XXX,XX @@ struct hvm_data { } msr; struct { unsigned int event; - uint32_t d[4]; } generic; struct { unsigned eax; @@ -XXX,XX +XXX,XX @@ void hvm_npf_process(struct record_info *ri, struct hvm_data *h) (unsigned long long)r->gpa, r->qualification, (unsigned long long)r->mfn, r->p2mt); - if ( opt.summary_info ) - hvm_generic_postprocess_init(ri, h); + hvm_generic_postprocess_init(ri, h); } void hvm_rdtsc_process(struct record_info *ri, struct hvm_data *h) @@ -XXX,XX +XXX,XX @@ void hvm_generic_postprocess_init(struct record_info *ri, struct hvm_data *h) fprintf(warn, "%s: Strange, h->postprocess set!\n", __func__); h->inflight.generic.event = ri->event; - bcopy(h->d, h->inflight.generic.d, sizeof(unsigned int) * 4); } void hvm_generic_postprocess(struct hvm_data *h) @@ -XXX,XX +XXX,XX @@ needs_vmexit: default: if(opt.dump_all) hvm_generic_dump(ri, "]"); - if(opt.summary_info) - hvm_generic_postprocess_init(ri, h); + hvm_generic_postprocess_init(ri, h); break; } } @@ -XXX,XX +XXX,XX @@ void shadow_fault_generic_process(struct record_info *ri, struct hvm_data *h) /* pf-case traces, vs others */ h->inflight.generic.event = ri->event; - bcopy(ri->d, h->inflight.generic.d, sizeof(unsigned int) * 4); if(opt.dump_all) - shadow_fault_generic_dump(h->inflight.generic.event, - h->inflight.generic.d, + shadow_fault_generic_dump(ri->event, + ri->d, "]", ri->dump_header); h->inflight.pf_xen.pf_case = sevt.minor; -- 2.25.1
VMX combines all exceptions into a single VMEXIT exit reason, and so needs a separate HVM_HANDLER trace record (HVM_TRAP) to say which exception happened; but for a number of exceptions, no further information is put into the trace log. SVM, by contrast, has a separate VMEXIT exit reason for each exception vector, so HVM_TRAP would be redundant. NB that VMEXIT_EXCEPTION_DB doesn't have an HVM_HANDLER for SVM yet; but for VMX, there's a specific HVM_HANDLER trace record which includes more information; so SVM really should record information as well. Signed-off-by: George Dunlap <george.dunlap@cloud.com> --- tools/xentrace/xenalyze.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tools/xentrace/xenalyze.c b/tools/xentrace/xenalyze.c index XXXXXXX..XXXXXXX 100644 --- a/tools/xentrace/xenalyze.c +++ b/tools/xentrace/xenalyze.c @@ -XXX,XX +XXX,XX @@ void hvm_generic_postprocess(struct hvm_data *h) { case VMEXIT_VINTR: /* Equivalent of PENDING_VIRT_INTR */ case VMEXIT_PAUSE: + /* + * VMX just has TRC_HVM_TRAP for these, which would be + * redundant on SVM + */ + case VMEXIT_EXCEPTION_BP: + case VMEXIT_EXCEPTION_NM: + case VMEXIT_EXCEPTION_AC: + case VMEXIT_EXCEPTION_UD: return; default: break; -- 2.25.1
Adding an HVM_TRAP trace record is redundant for EXCEPTION_AC: it adds trace volume without adding any information, and xenalyze already knows not to expect it. Remove it. Signed-off-by: George Dunlap <george.dunlap@cloud.com> --- xen/arch/x86/hvm/svm/svm.c | 1 - 1 file changed, 1 deletion(-) diff --git a/xen/arch/x86/hvm/svm/svm.c b/xen/arch/x86/hvm/svm/svm.c index XXXXXXX..XXXXXXX 100644 --- a/xen/arch/x86/hvm/svm/svm.c +++ b/xen/arch/x86/hvm/svm/svm.c @@ -XXX,XX +XXX,XX @@ void asmlinkage svm_vmexit_handler(void) } case VMEXIT_EXCEPTION_AC: - TRACE(TRC_HVM_TRAP, X86_EXC_AC); hvm_inject_hw_exception(X86_EXC_AC, vmcb->ei.exc.ec); break; -- 2.25.1
Commit 37f074a3383 ("x86/msr: introduce guest_rdmsr()") introduced a function to combine the MSR_READ handling between PV and HVM. Unfortunately, by returning directly, it skipped the trace generation, leading to gaps in the trace record, as well as xenalyze errors like this: hvm_generic_postprocess: d2v0 Strange, exit 7c(VMEXIT_MSR) missing a handler Replace the `return` with `goto out`. Fixes: 37f074a3383 ("x86/msr: introduce guest_rdmsr()") Signed-off-by: George Dunlap <george.dunlap@cloud.com> --- xen/arch/x86/hvm/hvm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c index XXXXXXX..XXXXXXX 100644 --- a/xen/arch/x86/hvm/hvm.c +++ b/xen/arch/x86/hvm/hvm.c @@ -XXX,XX +XXX,XX @@ int hvm_msr_read_intercept(unsigned int msr, uint64_t *msr_content) fixed_range_base = (uint64_t *)v->arch.hvm.mtrr.fixed_ranges; if ( (ret = guest_rdmsr(v, msr, msr_content)) != X86EMUL_UNHANDLEABLE ) - return ret; + goto out; ret = X86EMUL_OKAY; -- 2.25.1
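The bug class in schematic form (a sketch with hypothetical names, not the actual Xen code): when a function does its tracing on a common exit path, any early return quietly loses the record.

    #include <stdint.h>
    #include <stdio.h>

    #define UNHANDLED (-1)

    /* Hypothetical stand-ins for guest_rdmsr() and the legacy path. */
    static int fast_path(unsigned int msr, uint64_t *val)
    { return msr == 0 ? (*val = 42, 0) : UNHANDLED; }
    static int slow_path(unsigned int msr, uint64_t *val)
    { (void)msr; *val = 0; return 0; }

    static int msr_read_intercept(unsigned int msr, uint64_t *val)
    {
        int ret;

        if ( (ret = fast_path(msr, val)) != UNHANDLED )
            goto out;   /* a plain "return ret;" here skips the trace */

        ret = slow_path(msr, val);

     out:
        /* Tracing on the common exit path runs for every outcome. */
        printf("trace: MSR_READ %#x -> %#llx\n", msr,
               (unsigned long long)*val);
        return ret;
    }

    int main(void)
    {
        uint64_t val;
        msr_read_intercept(0, &val);   /* fast path: trace still emitted */
        msr_read_intercept(1, &val);   /* slow path */
        return 0;
    }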
Unfortunately I've forgotten exactly why I made this change. I suspect that there were other traces (like MMIO traces) which were being put before the NPF trace; but to understand the trace record you'd want to have the NPF information first. Signed-off-by: George Dunlap <george.dunlap@cloud.com> --- xen/arch/x86/hvm/svm/svm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xen/arch/x86/hvm/svm/svm.c b/xen/arch/x86/hvm/svm/svm.c index XXXXXXX..XXXXXXX 100644 --- a/xen/arch/x86/hvm/svm/svm.c +++ b/xen/arch/x86/hvm/svm/svm.c @@ -XXX,XX +XXX,XX @@ static void svm_do_nested_pgfault(struct vcpu *v, else if ( pfec & NPT_PFEC_in_gpt ) npfec.kind = npfec_kind_in_gpt; - ret = hvm_hap_nested_page_fault(gpa, ~0UL, npfec); - if ( tb_init_done ) { struct { @@ -XXX,XX +XXX,XX @@ static void svm_do_nested_pgfault(struct vcpu *v, trace(TRC_HVM_NPF, sizeof(_d), &_d); } + ret = hvm_hap_nested_page_fault(gpa, ~0UL, npfec); + switch ( ret ) { case 1: -- 2.25.1
hvmemul_read_cr ends up being called fairly frequently during emulation; the resulting trace records are generally not necessary for understanding a trace, and they make processing more complicated (because they show up as extra trace records within a vmexit/entry "arc" that must be classified). Leave the trace in write_cr, as any hypothetical writes to CRs *will* be necessary for understanding traces. Signed-off-by: George Dunlap <george.dunlap@cloud.com> --- xen/arch/x86/hvm/emulate.c | 1 - 1 file changed, 1 deletion(-) diff --git a/xen/arch/x86/hvm/emulate.c b/xen/arch/x86/hvm/emulate.c index XXXXXXX..XXXXXXX 100644 --- a/xen/arch/x86/hvm/emulate.c +++ b/xen/arch/x86/hvm/emulate.c @@ -XXX,XX +XXX,XX @@ static int cf_check hvmemul_read_cr( case 3: case 4: *val = current->arch.hvm.guest_cr[reg]; - TRACE(TRC_HVM_CR_READ64, reg, *val, *val >> 32); return X86EMUL_OKAY; default: break; -- 2.25.1
There's a general issue with both PIO and MMIO reads (as detailed in the comment); do a work-around for now. Signed-off-by: George Dunlap <george.dunlap@cloud.com> --- tools/xentrace/xenalyze.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/tools/xentrace/xenalyze.c b/tools/xentrace/xenalyze.c index XXXXXXX..XXXXXXX 100644 --- a/tools/xentrace/xenalyze.c +++ b/tools/xentrace/xenalyze.c @@ -XXX,XX +XXX,XX @@ void hvm_generic_postprocess(struct hvm_data *h) case VMEXIT_EXCEPTION_AC: case VMEXIT_EXCEPTION_UD: return; + case VMEXIT_IOIO: + /* + * FIXME: Both IO and MMIO reads which have gone out + * to the emulator and back typically have the + * [mm]io_assist trace happen on resume, just before + * the subsequent VMENTRY. + * + * However, when a VM has blocked, we call + * hvm_vmexit_close() when it becomes RUNNABLE again, + * rather than when it actually runs again; meaning + * that when hvm_vmexit_close() is called, no HVM + * handler has been seen. + * + * What we really need is some sort of intermediate + * state for [mm]io reads; but for now just ignore + * VMEXIT_IOIO exits without a handler. + */ + return; default: break; } -- 2.25.1
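One possible shape for the intermediate state the FIXME asks for, on the xenalyze side; the enum and its names are invented for illustration, not part of this patch:

    /* Hypothetical tri-state for an outstanding [mm]io read */
    enum io_assist_state {
        IO_ASSIST_NONE,      /* no read outstanding */
        IO_ASSIST_PENDING,   /* vmexit seen; [mm]io_assist not yet seen */
        IO_ASSIST_DONE,      /* assist record seen; handler info complete */
    };

hvm_vmexit_close() could then treat IO_ASSIST_PENDING as "handler seen" instead of flagging a missing handler.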
There are already "HVM handler" trace records for writing to XCRs in the context of an HVM guest. This trace is currently taken in hvmemul_write_xcr. However, both the VMX and SVM vmexit handlers call hvm_handle_xsetbv as a result of an XSETBV vmexit, and hvm_handle_xsetbv calls x86emul_write_xcr directly, bypassing the trace and resulting in no "HVM handler" trace record for that VMEXIT. For maximal DRY-ness, we would want hvm_handle_xsetbv to call hvmemul_write_xcr; but since the intent seems to be for hvmemul_* to be accessible only via hvm_emulate(), just duplicate the trace. Signed-off-by: George Dunlap <george.dunlap@cloud.com> --- CC: Andrew Cooper <andrew.cooper@cloud.com> CC: Jan Beulich <jbeulich@suse.com> CC: Roger Pau Monne <roger.pau@cloud.com> --- xen/arch/x86/hvm/hvm.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c index XXXXXXX..XXXXXXX 100644 --- a/xen/arch/x86/hvm/hvm.c +++ b/xen/arch/x86/hvm/hvm.c @@ -XXX,XX +XXX,XX @@ int hvm_handle_xsetbv(u32 index, u64 new_bv) if ( index == 0 ) hvm_monitor_crX(XCR0, new_bv, current->arch.xcr0); + TRACE(TRC_HVM_XCR_WRITE64, index, new_bv, new_bv >> 32); + rc = x86emul_write_xcr(index, new_bv, NULL); if ( rc != X86EMUL_OKAY ) hvm_inject_hw_exception(X86_EXC_GP, 0); -- 2.25.1
Concretely, this means adding the VMEXIT_XSETBV and VMEXIT_RDPRU exit codes and their name strings, and adding the xcr_read / xcr_write HVM handler events and their name strings. Signed-off-by: George Dunlap <george.dunlap@cloud.com> --- tools/xentrace/xenalyze.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tools/xentrace/xenalyze.c b/tools/xentrace/xenalyze.c index XXXXXXX..XXXXXXX 100644 --- a/tools/xentrace/xenalyze.c +++ b/tools/xentrace/xenalyze.c @@ -XXX,XX +XXX,XX @@ enum VMEXIT_EXITCODE VMEXIT_MONITOR = 138, VMEXIT_MWAIT = 139, VMEXIT_MWAIT_CONDITIONAL= 140, + VMEXIT_XSETBV = 141, /* 0x8d */ + VMEXIT_RDPRU = 142, /* 0x8e */ VMEXIT_NPF = 1024, /* nested paging fault */ VMEXIT_INVALID = -1 }; @@ -XXX,XX +XXX,XX @@ const char * hvm_svm_exit_reason_name[HVM_SVM_EXIT_REASON_MAX] = { "VMEXIT_MWAIT", /* 140 */ "VMEXIT_MWAIT_CONDITIONAL", + "VMEXIT_XSETBV", + "VMEXIT_RDPRU", [VMEXIT_NPF] = "VMEXIT_NPF", /* nested paging fault */ }; @@ -XXX,XX +XXX,XX @@ enum { HVM_EVENT_TRAP, HVM_EVENT_TRAP_DEBUG, HVM_EVENT_VLAPIC, + HVM_EVENT_XCR_READ, + HVM_EVENT_XCR_WRITE, HVM_EVENT_HANDLER_MAX }; const char * hvm_event_handler_name[HVM_EVENT_HANDLER_MAX] = { @@ -XXX,XX +XXX,XX @@ const char * hvm_event_handler_name[HVM_EVENT_HANDLER_MAX] = { "realmode_emulate", "trap", "trap_debug", - "vlapic" + "vlapic", + "xcr_read", + "xcr_write" }; enum { -- 2.25.1
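On the maintenance point: these name arrays are kept in sync with their enums purely by position. The file already uses a designated initializer for VMEXIT_NPF; extending that style to the handler names would make the pairing explicit and robust against reordering. A sketch (same strings, just restated with designated initializers):

    const char * hvm_event_handler_name[HVM_EVENT_HANDLER_MAX] = {
        /* ... existing entries ... */
        [HVM_EVENT_VLAPIC]    = "vlapic",
        [HVM_EVENT_XCR_READ]  = "xcr_read",
        [HVM_EVENT_XCR_WRITE] = "xcr_write",
    };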
Note that this trace is SVM-specific. Most HVM handler traces are shared between VMX and SVM because the underlying instruction set is largely equivalent; but in this case, the instructions are different enough that there's no sensible way to share HVM handler traces between them. Recording the target VMCB address should allow future analysis of which L2 vcpu within an L1 is running. Signed-off-by: George Dunlap <george.dunlap@cloud.com> --- tools/xentrace/xenalyze.c | 20 +++++++++++++++++++- xen/arch/x86/hvm/svm/svm.c | 2 ++ xen/include/public/trace.h | 1 + 3 files changed, 22 insertions(+), 1 deletion(-) diff --git a/tools/xentrace/xenalyze.c b/tools/xentrace/xenalyze.c index XXXXXXX..XXXXXXX 100644 --- a/tools/xentrace/xenalyze.c +++ b/tools/xentrace/xenalyze.c @@ -XXX,XX +XXX,XX @@ enum { HVM_EVENT_VLAPIC, HVM_EVENT_XCR_READ, HVM_EVENT_XCR_WRITE, + HVM_EVENT_VMRUN, HVM_EVENT_HANDLER_MAX }; const char * hvm_event_handler_name[HVM_EVENT_HANDLER_MAX] = { @@ -XXX,XX +XXX,XX @@ const char * hvm_event_handler_name[HVM_EVENT_HANDLER_MAX] = { "trap_debug", "vlapic", "xcr_read", - "xcr_write" + "xcr_write", + "vmrun" }; enum { @@ -XXX,XX +XXX,XX @@ void hvm_rdtsc_process(struct record_info *ri, struct hvm_data *h) h->last_rdtsc = r->tsc; } + +void hvm_vmrun_process(struct record_info *ri, struct hvm_data *h) +{ + struct { + uint64_t vmcbaddr; + } *r = (typeof(r))h->d; + + if ( opt.dump_all ) + printf(" %s vmrun %llx\n", + ri->dump_header, + (unsigned long long)r->vmcbaddr); +} + void hvm_generic_summary(struct hvm_data *h, void *data) { long evt = (long)data; @@ -XXX,XX +XXX,XX @@ needs_vmexit: case TRC_HVM_RDTSC: hvm_rdtsc_process(ri, h); break; + case TRC_HVM_VMRUN: + hvm_vmrun_process(ri, h); + break; case TRC_HVM_DR_READ: case TRC_HVM_DR_WRITE: case TRC_HVM_CPUID: diff --git a/xen/arch/x86/hvm/svm/svm.c b/xen/arch/x86/hvm/svm/svm.c index XXXXXXX..XXXXXXX 100644 --- a/xen/arch/x86/hvm/svm/svm.c +++ b/xen/arch/x86/hvm/svm/svm.c @@ -XXX,XX +XXX,XX @@ svm_vmexit_do_vmrun(struct cpu_user_regs *regs, return; } + TRACE(TRC_HVM_VMRUN, vmcbaddr, vmcbaddr >> 32); + vcpu_nestedhvm(v).nv_vmentry_pending = 1; return; } diff --git a/xen/include/public/trace.h b/xen/include/public/trace.h index XXXXXXX..XXXXXXX 100644 --- a/xen/include/public/trace.h +++ b/xen/include/public/trace.h @@ -XXX,XX +XXX,XX @@ #define TRC_HVM_VLAPIC (TRC_HVM_HANDLER + 0x25) #define TRC_HVM_XCR_READ64 (TRC_HVM_HANDLER + TRC_64_FLAG + 0x26) #define TRC_HVM_XCR_WRITE64 (TRC_HVM_HANDLER + TRC_64_FLAG + 0x27) +#define TRC_HVM_VMRUN (TRC_HVM_HANDLER + 0x28) #define TRC_HVM_IOPORT_WRITE (TRC_HVM_HANDLER + 0x216) #define TRC_HVM_IOMEM_WRITE (TRC_HVM_HANDLER + 0x217) -- 2.25.1
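As an aside on the record layout: TRACE() here logs the VMCB address as two 32-bit extra words (low half first), while hvm_vmrun_process() reads a single uint64_t out of the payload; on a little-endian host these are the same bytes. A standalone illustration of that assumption (plain C, not Xen code):

    #include <assert.h>
    #include <stdint.h>
    #include <string.h>

    int main(void)
    {
        uint64_t vmcbaddr = 0x123456789abcdef0ULL;

        /* Producer side: two 32-bit trace words, low half first */
        uint32_t extra[2] = { (uint32_t)vmcbaddr, (uint32_t)(vmcbaddr >> 32) };

        /* Consumer side: xenalyze reads the payload back as one uint64_t */
        uint64_t decoded;
        memcpy(&decoded, extra, sizeof(decoded));

        assert(decoded == vmcbaddr); /* holds on little-endian hosts */
        return 0;
    }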
Add FIXME comments documenting known issues with the way nestedsvm_check_intercepts() disposes of vmexits that L1 has not asked to intercept. Signed-off-by: George Dunlap <george.dunlap@cloud.com> --- xen/arch/x86/hvm/svm/nestedsvm.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/xen/arch/x86/hvm/svm/nestedsvm.c b/xen/arch/x86/hvm/svm/nestedsvm.c index XXXXXXX..XXXXXXX 100644 --- a/xen/arch/x86/hvm/svm/nestedsvm.c +++ b/xen/arch/x86/hvm/svm/nestedsvm.c @@ -XXX,XX +XXX,XX @@ nestedsvm_check_intercepts(struct vcpu *v, struct cpu_user_regs *regs, return NESTEDHVM_VMEXIT_INJECT; case VMEXIT_VMMCALL: /* Always let the guest handle VMMCALL/VMCALL */ + /* + * FIXME: This is wrong; if L1 hasn't set the VMMCALL + * intercept and L2 executes a VMMCALL, it should result + * in a #UD inside L2, not a VMMCALL vmexit injected into L1. + */ return NESTEDHVM_VMEXIT_INJECT; default: + /* + * FIXME: It's not true that any VMEXIT not intercepted by L1 + * can safely be handled by L0; VMMCALL above is one + * such example, but there may be more. We either need to + * combine this switch statement into the one in + * nsvm_vmcb_guest_intercepts_exitcode(), or explicitly list + * known-safe exits here. + */ break; } -- 2.25.1
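For concreteness, a sketch of the "explicitly list known-safe exits" option from the second FIXME. The exit codes shown are purely illustrative; deciding which exits really are safe for L0 to handle on L2's behalf is exactly the audit the FIXME calls for:

    switch ( exitcode )
    {
        /* Exits audited as safe for L0 to handle on L2's behalf */
    case VMEXIT_INTR:
    case VMEXIT_NMI:
        break;

    default:
        /* Unaudited exit: fail loudly rather than guessing */
        gprintk(XENLOG_ERR, "Unaudited nested vmexit %#"PRIx64"\n", exitcode);
        domain_crash(v->domain);
        return NESTEDHVM_VMEXIT_FATALERROR;
    }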