Switch to the newer cpu_policy nomenclature. Do some easy cleanup of
includes.
No practical change.
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
CC: Jan Beulich <JBeulich@suse.com>
CC: Roger Pau Monné <roger.pau@citrix.com>
CC: Wei Liu <wl@xen.org>
v2:
* New
---
xen/arch/x86/cpu-policy.c | 752 ++++++++++++++++++++++++
xen/arch/x86/cpuid.c | 817 +-------------------------
xen/arch/x86/hvm/hvm.c | 1 -
xen/arch/x86/include/asm/cpu-policy.h | 6 +
xen/arch/x86/include/asm/cpuid.h | 11 +-
xen/arch/x86/pv/domain.c | 1 +
xen/arch/x86/setup.c | 2 -
7 files changed, 764 insertions(+), 826 deletions(-)
diff --git a/xen/arch/x86/cpu-policy.c b/xen/arch/x86/cpu-policy.c
index f6a2317ed7bd..83186e940ca7 100644
--- a/xen/arch/x86/cpu-policy.c
+++ b/xen/arch/x86/cpu-policy.c
@@ -1,13 +1,19 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
#include <xen/cache.h>
#include <xen/kernel.h>
+#include <xen/param.h>
#include <xen/sched.h>
#include <xen/lib/x86/cpu-policy.h>
+#include <asm/amd.h>
#include <asm/cpu-policy.h>
+#include <asm/hvm/nestedhvm.h>
+#include <asm/hvm/svm/svm.h>
#include <asm/msr-index.h>
+#include <asm/paging.h>
#include <asm/setup.h>
+#include <asm/xstate.h>
struct cpu_policy __ro_after_init raw_cpu_policy;
struct cpu_policy __ro_after_init host_cpu_policy;
@@ -20,10 +26,332 @@ struct cpu_policy __ro_after_init hvm_max_cpu_policy;
struct cpu_policy __ro_after_init hvm_def_cpu_policy;
#endif
+const uint32_t known_features[] = INIT_KNOWN_FEATURES;
+
+static const uint32_t __initconst pv_max_featuremask[] = INIT_PV_MAX_FEATURES;
+static const uint32_t hvm_shadow_max_featuremask[] = INIT_HVM_SHADOW_MAX_FEATURES;
+static const uint32_t __initconst hvm_hap_max_featuremask[] =
+ INIT_HVM_HAP_MAX_FEATURES;
+static const uint32_t __initconst pv_def_featuremask[] = INIT_PV_DEF_FEATURES;
+static const uint32_t __initconst hvm_shadow_def_featuremask[] =
+ INIT_HVM_SHADOW_DEF_FEATURES;
+static const uint32_t __initconst hvm_hap_def_featuremask[] =
+ INIT_HVM_HAP_DEF_FEATURES;
+static const uint32_t deep_features[] = INIT_DEEP_FEATURES;
+
+static const struct feature_name {
+ const char *name;
+ unsigned int bit;
+} feature_names[] __initconstrel = INIT_FEATURE_NAMES;
+
+/*
+ * Parse a list of cpuid feature names -> bool, calling the callback for any
+ * matches found.
+ *
+ * always_inline, because this is init code only and we really don't want a
+ * function pointer call in the middle of the loop.
+ */
+static int __init always_inline parse_cpuid(
+ const char *s, void (*callback)(unsigned int feat, bool val))
+{
+ const char *ss;
+ int val, rc = 0;
+
+ do {
+ const struct feature_name *lhs, *rhs, *mid = NULL /* GCC... */;
+ const char *feat;
+
+ ss = strchr(s, ',');
+ if ( !ss )
+ ss = strchr(s, '\0');
+
+ /* Skip the 'no-' prefix for name comparisons. */
+ feat = s;
+ if ( strncmp(s, "no-", 3) == 0 )
+ feat += 3;
+
+ /* (Re)initialise lhs and rhs for binary search. */
+ lhs = feature_names;
+ rhs = feature_names + ARRAY_SIZE(feature_names);
+
+ while ( lhs < rhs )
+ {
+ int res;
+
+ mid = lhs + (rhs - lhs) / 2;
+ res = cmdline_strcmp(feat, mid->name);
+
+ if ( res < 0 )
+ {
+ rhs = mid;
+ continue;
+ }
+ if ( res > 0 )
+ {
+ lhs = mid + 1;
+ continue;
+ }
+
+ if ( (val = parse_boolean(mid->name, s, ss)) >= 0 )
+ {
+ callback(mid->bit, val);
+ mid = NULL;
+ }
+
+ break;
+ }
+
+ /*
+ * Mid being NULL means that the name and boolean were successfully
+ * identified. Everything else is an error.
+ */
+ if ( mid )
+ rc = -EINVAL;
+
+ s = ss + 1;
+ } while ( *ss );
+
+ return rc;
+}
+
+static void __init cf_check _parse_xen_cpuid(unsigned int feat, bool val)
+{
+ if ( !val )
+ setup_clear_cpu_cap(feat);
+ else if ( feat == X86_FEATURE_RDRAND &&
+ (cpuid_ecx(1) & cpufeat_mask(X86_FEATURE_RDRAND)) )
+ setup_force_cpu_cap(X86_FEATURE_RDRAND);
+}
+
+static int __init cf_check parse_xen_cpuid(const char *s)
+{
+ return parse_cpuid(s, _parse_xen_cpuid);
+}
+custom_param("cpuid", parse_xen_cpuid);
+
+static bool __initdata dom0_cpuid_cmdline;
+static uint32_t __initdata dom0_enable_feat[FSCAPINTS];
+static uint32_t __initdata dom0_disable_feat[FSCAPINTS];
+
+static void __init cf_check _parse_dom0_cpuid(unsigned int feat, bool val)
+{
+ __set_bit (feat, val ? dom0_enable_feat : dom0_disable_feat);
+ __clear_bit(feat, val ? dom0_disable_feat : dom0_enable_feat );
+}
+
+static int __init cf_check parse_dom0_cpuid(const char *s)
+{
+ dom0_cpuid_cmdline = true;
+
+ return parse_cpuid(s, _parse_dom0_cpuid);
+}
+custom_param("dom0-cpuid", parse_dom0_cpuid);
+
+#define EMPTY_LEAF ((struct cpuid_leaf){})
+static void zero_leaves(struct cpuid_leaf *l,
+ unsigned int first, unsigned int last)
+{
+ memset(&l[first], 0, sizeof(*l) * (last - first + 1));
+}
+
+static void sanitise_featureset(uint32_t *fs)
+{
+ /* for_each_set_bit() uses unsigned longs. Extend with zeroes. */
+ uint32_t disabled_features[
+ ROUNDUP(FSCAPINTS, sizeof(unsigned long)/sizeof(uint32_t))] = {};
+ unsigned int i;
+
+ for ( i = 0; i < FSCAPINTS; ++i )
+ {
+ /* Clamp to known mask. */
+ fs[i] &= known_features[i];
+
+ /*
+ * Identify which features with deep dependencies have been
+ * disabled.
+ */
+ disabled_features[i] = ~fs[i] & deep_features[i];
+ }
+
+ for_each_set_bit(i, (void *)disabled_features,
+ sizeof(disabled_features) * 8)
+ {
+ const uint32_t *dfs = x86_cpuid_lookup_deep_deps(i);
+ unsigned int j;
+
+ ASSERT(dfs); /* deep_features[] should guarantee this. */
+
+ for ( j = 0; j < FSCAPINTS; ++j )
+ {
+ fs[j] &= ~dfs[j];
+ disabled_features[j] &= ~dfs[j];
+ }
+ }
+}
+
+static void recalculate_xstate(struct cpu_policy *p)
+{
+ uint64_t xstates = XSTATE_FP_SSE;
+ uint32_t xstate_size = XSTATE_AREA_MIN_SIZE;
+ unsigned int i, Da1 = p->xstate.Da1;
+
+ /*
+ * The Da1 leaf is the only piece of information preserved in the common
+ * case. Everything else is derived from other feature state.
+ */
+ memset(&p->xstate, 0, sizeof(p->xstate));
+
+ if ( !p->basic.xsave )
+ return;
+
+ if ( p->basic.avx )
+ {
+ xstates |= X86_XCR0_YMM;
+ xstate_size = max(xstate_size,
+ xstate_offsets[X86_XCR0_YMM_POS] +
+ xstate_sizes[X86_XCR0_YMM_POS]);
+ }
+
+ if ( p->feat.mpx )
+ {
+ xstates |= X86_XCR0_BNDREGS | X86_XCR0_BNDCSR;
+ xstate_size = max(xstate_size,
+ xstate_offsets[X86_XCR0_BNDCSR_POS] +
+ xstate_sizes[X86_XCR0_BNDCSR_POS]);
+ }
+
+ if ( p->feat.avx512f )
+ {
+ xstates |= X86_XCR0_OPMASK | X86_XCR0_ZMM | X86_XCR0_HI_ZMM;
+ xstate_size = max(xstate_size,
+ xstate_offsets[X86_XCR0_HI_ZMM_POS] +
+ xstate_sizes[X86_XCR0_HI_ZMM_POS]);
+ }
+
+ if ( p->feat.pku )
+ {
+ xstates |= X86_XCR0_PKRU;
+ xstate_size = max(xstate_size,
+ xstate_offsets[X86_XCR0_PKRU_POS] +
+ xstate_sizes[X86_XCR0_PKRU_POS]);
+ }
+
+ p->xstate.max_size = xstate_size;
+ p->xstate.xcr0_low = xstates & ~XSTATE_XSAVES_ONLY;
+ p->xstate.xcr0_high = (xstates & ~XSTATE_XSAVES_ONLY) >> 32;
+
+ p->xstate.Da1 = Da1;
+ if ( p->xstate.xsaves )
+ {
+ p->xstate.xss_low = xstates & XSTATE_XSAVES_ONLY;
+ p->xstate.xss_high = (xstates & XSTATE_XSAVES_ONLY) >> 32;
+ }
+ else
+ xstates &= ~XSTATE_XSAVES_ONLY;
+
+ for ( i = 2; i < min(63ul, ARRAY_SIZE(p->xstate.comp)); ++i )
+ {
+ uint64_t curr_xstate = 1ul << i;
+
+ if ( !(xstates & curr_xstate) )
+ continue;
+
+ p->xstate.comp[i].size = xstate_sizes[i];
+ p->xstate.comp[i].offset = xstate_offsets[i];
+ p->xstate.comp[i].xss = curr_xstate & XSTATE_XSAVES_ONLY;
+ p->xstate.comp[i].align = curr_xstate & xstate_align;
+ }
+}
+
+/*
+ * Misc adjustments to the policy. Mostly clobbering reserved fields and
+ * duplicating shared fields. Intentionally hidden fields are annotated.
+ */
+static void recalculate_misc(struct cpu_policy *p)
+{
+ p->basic.raw_fms &= 0x0fff0fff; /* Clobber Processor Type on Intel. */
+ p->basic.apic_id = 0; /* Dynamic. */
+
+ p->basic.raw[0x5] = EMPTY_LEAF; /* MONITOR not exposed to guests. */
+ p->basic.raw[0x6] = EMPTY_LEAF; /* Therm/Power not exposed to guests. */
+
+ p->basic.raw[0x8] = EMPTY_LEAF;
+
+ /* TODO: Rework topology logic. */
+ memset(p->topo.raw, 0, sizeof(p->topo.raw));
+
+ p->basic.raw[0xc] = EMPTY_LEAF;
+
+ p->extd.e1d &= ~CPUID_COMMON_1D_FEATURES;
+
+ /* Most of Power/RAS hidden from guests. */
+ p->extd.raw[0x7].a = p->extd.raw[0x7].b = p->extd.raw[0x7].c = 0;
+
+ p->extd.raw[0x8].d = 0;
+
+ switch ( p->x86_vendor )
+ {
+ case X86_VENDOR_INTEL:
+ p->basic.l2_nr_queries = 1; /* Fixed to 1 query. */
+ p->basic.raw[0x3] = EMPTY_LEAF; /* PSN - always hidden. */
+ p->basic.raw[0x9] = EMPTY_LEAF; /* DCA - always hidden. */
+
+ p->extd.vendor_ebx = 0;
+ p->extd.vendor_ecx = 0;
+ p->extd.vendor_edx = 0;
+
+ p->extd.raw[0x1].a = p->extd.raw[0x1].b = 0;
+
+ p->extd.raw[0x5] = EMPTY_LEAF;
+ p->extd.raw[0x6].a = p->extd.raw[0x6].b = p->extd.raw[0x6].d = 0;
+
+ p->extd.raw[0x8].a &= 0x0000ffff;
+ p->extd.raw[0x8].c = 0;
+ break;
+
+ case X86_VENDOR_AMD:
+ case X86_VENDOR_HYGON:
+ zero_leaves(p->basic.raw, 0x2, 0x3);
+ memset(p->cache.raw, 0, sizeof(p->cache.raw));
+ zero_leaves(p->basic.raw, 0x9, 0xa);
+
+ p->extd.vendor_ebx = p->basic.vendor_ebx;
+ p->extd.vendor_ecx = p->basic.vendor_ecx;
+ p->extd.vendor_edx = p->basic.vendor_edx;
+
+ p->extd.raw_fms = p->basic.raw_fms;
+ p->extd.raw[0x1].b &= 0xff00ffff;
+ p->extd.e1d |= p->basic._1d & CPUID_COMMON_1D_FEATURES;
+
+ p->extd.raw[0x8].a &= 0x0000ffff; /* GuestMaxPhysAddr hidden. */
+ p->extd.raw[0x8].c &= 0x0003f0ff;
+
+ p->extd.raw[0x9] = EMPTY_LEAF;
+
+ zero_leaves(p->extd.raw, 0xb, 0x18);
+
+ /* 0x19 - TLB details. Pass through. */
+ /* 0x1a - Perf hints. Pass through. */
+
+ p->extd.raw[0x1b] = EMPTY_LEAF; /* IBS - not supported. */
+ p->extd.raw[0x1c] = EMPTY_LEAF; /* LWP - not supported. */
+ p->extd.raw[0x1d] = EMPTY_LEAF; /* TopoExt Cache */
+ p->extd.raw[0x1e] = EMPTY_LEAF; /* TopoExt APIC ID/Core/Node */
+ p->extd.raw[0x1f] = EMPTY_LEAF; /* SEV */
+ p->extd.raw[0x20] = EMPTY_LEAF; /* Platform QoS */
+ break;
+ }
+}
+
static void __init calculate_raw_policy(void)
{
struct cpu_policy *p = &raw_cpu_policy;
+ x86_cpuid_policy_fill_native(p);
+
+ /* Nothing good will come from Xen and libx86 disagreeing on vendor. */
+ ASSERT(p->x86_vendor == boot_cpu_data.x86_vendor);
+
/* 0x000000ce MSR_INTEL_PLATFORM_INFO */
/* Was already added by probe_cpuid_faulting() */
@@ -34,9 +362,50 @@ static void __init calculate_raw_policy(void)
static void __init calculate_host_policy(void)
{
struct cpu_policy *p = &host_cpu_policy;
+ unsigned int max_extd_leaf;
*p = raw_cpu_policy;
+ p->basic.max_leaf =
+ min_t(uint32_t, p->basic.max_leaf, ARRAY_SIZE(p->basic.raw) - 1);
+ p->feat.max_subleaf =
+ min_t(uint32_t, p->feat.max_subleaf, ARRAY_SIZE(p->feat.raw) - 1);
+
+ max_extd_leaf = p->extd.max_leaf;
+
+ /*
+ * For AMD/Hygon hardware before Zen3, we unilaterally modify LFENCE to be
+ * dispatch serialising for Spectre mitigations. Extend max_extd_leaf
+ * beyond what hardware supports, to include the feature leaf containing
+ * this information.
+ */
+ if ( cpu_has_lfence_dispatch )
+ max_extd_leaf = max(max_extd_leaf, 0x80000021);
+
+ p->extd.max_leaf = 0x80000000 | min_t(uint32_t, max_extd_leaf & 0xffff,
+ ARRAY_SIZE(p->extd.raw) - 1);
+
+ x86_cpu_featureset_to_policy(boot_cpu_data.x86_capability, p);
+ recalculate_xstate(p);
+ recalculate_misc(p);
+
+ /* When vPMU is disabled, drop it from the host policy. */
+ if ( vpmu_mode == XENPMU_MODE_OFF )
+ p->basic.raw[0xa] = EMPTY_LEAF;
+
+ if ( p->extd.svm )
+ {
+ /* Clamp to implemented features which require hardware support. */
+ p->extd.raw[0xa].d &= ((1u << SVM_FEATURE_NPT) |
+ (1u << SVM_FEATURE_LBRV) |
+ (1u << SVM_FEATURE_NRIPS) |
+ (1u << SVM_FEATURE_PAUSEFILTER) |
+ (1u << SVM_FEATURE_DECODEASSISTS));
+ /* Enable features which are always emulated. */
+ p->extd.raw[0xa].d |= ((1u << SVM_FEATURE_VMCBCLEAN) |
+ (1u << SVM_FEATURE_TSCRATEMSR));
+ }
+
/* 0x000000ce MSR_INTEL_PLATFORM_INFO */
/* probe_cpuid_faulting() sanity checks presence of MISC_FEATURES_ENABLES */
p->platform_info.cpuid_faulting = cpu_has_cpuid_faulting;
@@ -51,11 +420,88 @@ static void __init calculate_host_policy(void)
ARCH_CAPS_PBRSB_NO);
}
+static void __init guest_common_default_feature_adjustments(uint32_t *fs)
+{
+ /*
+ * IvyBridge client parts suffer from leakage of RDRAND data due to SRBDS
+ * (XSA-320 / CVE-2020-0543), and won't be receiving microcode to
+ * compensate.
+ *
+ * Mitigate by hiding RDRAND from guests by default, unless explicitly
+ * overridden on the Xen command line (cpuid=rdrand). Irrespective of the
+ * default setting, guests can use RDRAND if explicitly enabled
+ * (cpuid="host,rdrand=1") in the VM's config file, and VMs which were
+ * previously using RDRAND can migrate in.
+ */
+ if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
+ boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x3a &&
+ cpu_has_rdrand && !is_forced_cpu_cap(X86_FEATURE_RDRAND) )
+ __clear_bit(X86_FEATURE_RDRAND, fs);
+
+ /*
+ * On certain hardware, speculative or errata workarounds can result in
+ * TSX being placed in "force-abort" mode, where it doesn't actually
+ * function as expected, but is technically compatible with the ISA.
+ *
+ * Do not advertise RTM to guests by default if it won't actually work.
+ */
+ if ( rtm_disabled )
+ __clear_bit(X86_FEATURE_RTM, fs);
+}
+
+static void __init guest_common_feature_adjustments(uint32_t *fs)
+{
+ /* Unconditionally claim to be able to set the hypervisor bit. */
+ __set_bit(X86_FEATURE_HYPERVISOR, fs);
+
+ /*
+ * If IBRS is offered to the guest, unconditionally offer STIBP. It is a
+ * nop on non-HT hardware, and has this behaviour to make heterogeneous
+ * setups easier to manage.
+ */
+ if ( test_bit(X86_FEATURE_IBRSB, fs) )
+ __set_bit(X86_FEATURE_STIBP, fs);
+ if ( test_bit(X86_FEATURE_IBRS, fs) )
+ __set_bit(X86_FEATURE_AMD_STIBP, fs);
+
+ /*
+ * On hardware which supports IBRS/IBPB, we can offer IBPB independently
+ * of IBRS by using the AMD feature bit. An administrator may wish for
+ * performance reasons to offer IBPB without IBRS.
+ */
+ if ( host_cpu_policy.feat.ibrsb )
+ __set_bit(X86_FEATURE_IBPB, fs);
+}
+
static void __init calculate_pv_max_policy(void)
{
struct cpu_policy *p = &pv_max_cpu_policy;
+ uint32_t fs[FSCAPINTS];
+ unsigned int i;
*p = host_cpu_policy;
+ x86_cpu_policy_to_featureset(p, fs);
+
+ for ( i = 0; i < ARRAY_SIZE(fs); ++i )
+ fs[i] &= pv_max_featuremask[i];
+
+ /*
+ * If Xen isn't virtualising MSR_SPEC_CTRL for PV guests (functional
+ * availability, or admin choice), hide the feature.
+ */
+ if ( !boot_cpu_has(X86_FEATURE_SC_MSR_PV) )
+ {
+ __clear_bit(X86_FEATURE_IBRSB, fs);
+ __clear_bit(X86_FEATURE_IBRS, fs);
+ }
+
+ guest_common_feature_adjustments(fs);
+
+ sanitise_featureset(fs);
+ x86_cpu_featureset_to_policy(fs, p);
+ recalculate_xstate(p);
+
+ p->extd.raw[0xa] = EMPTY_LEAF; /* No SVM for PV guests. */
p->arch_caps.raw = 0; /* Not supported yet. */
}
@@ -63,15 +509,112 @@ static void __init calculate_pv_max_policy(void)
static void __init calculate_pv_def_policy(void)
{
struct cpu_policy *p = &pv_def_cpu_policy;
+ uint32_t fs[FSCAPINTS];
+ unsigned int i;
*p = pv_max_cpu_policy;
+ x86_cpu_policy_to_featureset(p, fs);
+
+ for ( i = 0; i < ARRAY_SIZE(fs); ++i )
+ fs[i] &= pv_def_featuremask[i];
+
+ guest_common_feature_adjustments(fs);
+ guest_common_default_feature_adjustments(fs);
+
+ sanitise_featureset(fs);
+ x86_cpu_featureset_to_policy(fs, p);
+ recalculate_xstate(p);
}
static void __init calculate_hvm_max_policy(void)
{
struct cpu_policy *p = &hvm_max_cpu_policy;
+ uint32_t fs[FSCAPINTS];
+ unsigned int i;
+ const uint32_t *mask;
*p = host_cpu_policy;
+ x86_cpu_policy_to_featureset(p, fs);
+
+ mask = hvm_hap_supported() ?
+ hvm_hap_max_featuremask : hvm_shadow_max_featuremask;
+
+ for ( i = 0; i < ARRAY_SIZE(fs); ++i )
+ fs[i] &= mask[i];
+
+ /*
+ * Xen can provide an (x2)APIC emulation to HVM guests even if the host's
+ * (x2)APIC isn't enabled.
+ */
+ __set_bit(X86_FEATURE_APIC, fs);
+ __set_bit(X86_FEATURE_X2APIC, fs);
+
+ /*
+ * We don't support EFER.LMSLE at all. AMD has dropped the feature from
+ * hardware and allocated a CPUID bit to indicate its absence.
+ */
+ __set_bit(X86_FEATURE_NO_LMSL, fs);
+
+ /*
+ * On AMD, PV guests are entirely unable to use SYSENTER as Xen runs in
+ * long mode (and init_amd() has cleared it out of host capabilities), but
+ * HVM guests are able if running in protected mode.
+ */
+ if ( (boot_cpu_data.x86_vendor & (X86_VENDOR_AMD | X86_VENDOR_HYGON)) &&
+ raw_cpu_policy.basic.sep )
+ __set_bit(X86_FEATURE_SEP, fs);
+
+ /*
+ * VIRT_SSBD is exposed in the default policy as a result of
+ * amd_virt_spec_ctrl being set; it also needs exposing in the max policy.
+ */
+ if ( amd_virt_spec_ctrl )
+ __set_bit(X86_FEATURE_VIRT_SSBD, fs);
+
+ /*
+ * If Xen isn't virtualising MSR_SPEC_CTRL for HVM guests (functional
+ * availability, or admin choice), hide the feature.
+ */
+ if ( !boot_cpu_has(X86_FEATURE_SC_MSR_HVM) )
+ {
+ __clear_bit(X86_FEATURE_IBRSB, fs);
+ __clear_bit(X86_FEATURE_IBRS, fs);
+ }
+ else if ( boot_cpu_has(X86_FEATURE_AMD_SSBD) )
+ /*
+ * If SPEC_CTRL.SSBD is available VIRT_SPEC_CTRL.SSBD can be exposed
+ * and implemented using the former. Expose in the max policy only as
+ * the preference is for guests to use SPEC_CTRL.SSBD if available.
+ */
+ __set_bit(X86_FEATURE_VIRT_SSBD, fs);
+
+ /*
+ * With VT-x, some features are only supported by Xen if dedicated
+ * hardware support is also available.
+ */
+ if ( cpu_has_vmx )
+ {
+ if ( !cpu_has_vmx_mpx )
+ __clear_bit(X86_FEATURE_MPX, fs);
+
+ if ( !cpu_has_vmx_xsaves )
+ __clear_bit(X86_FEATURE_XSAVES, fs);
+ }
+
+ /*
+ * Xen doesn't use PKS, so the guest support for it has opted to not use
+ * the VMCS load/save controls for efficiency reasons. This depends on
+ * the exact vmentry/exit behaviour, so don't expose PKS in other
+ * situations until someone has cross-checked the behaviour for safety.
+ */
+ if ( !cpu_has_vmx )
+ __clear_bit(X86_FEATURE_PKS, fs);
+
+ guest_common_feature_adjustments(fs);
+
+ sanitise_featureset(fs);
+ x86_cpu_featureset_to_policy(fs, p);
+ recalculate_xstate(p);
/* It's always possible to emulate CPUID faulting for HVM guests */
p->platform_info.cpuid_faulting = true;
@@ -82,8 +625,32 @@ static void __init calculate_hvm_max_policy(void)
static void __init calculate_hvm_def_policy(void)
{
struct cpu_policy *p = &hvm_def_cpu_policy;
+ uint32_t fs[FSCAPINTS];
+ unsigned int i;
+ const uint32_t *mask;
*p = hvm_max_cpu_policy;
+ x86_cpu_policy_to_featureset(p, fs);
+
+ mask = hvm_hap_supported() ?
+ hvm_hap_def_featuremask : hvm_shadow_def_featuremask;
+
+ for ( i = 0; i < ARRAY_SIZE(fs); ++i )
+ fs[i] &= mask[i];
+
+ guest_common_feature_adjustments(fs);
+ guest_common_default_feature_adjustments(fs);
+
+ /*
+ * Only expose VIRT_SSBD if AMD_SSBD is not available, and thus
+ * amd_virt_spec_ctrl is set.
+ */
+ if ( amd_virt_spec_ctrl )
+ __set_bit(X86_FEATURE_VIRT_SSBD, fs);
+
+ sanitise_featureset(fs);
+ x86_cpu_featureset_to_policy(fs, p);
+ recalculate_xstate(p);
}
void __init init_guest_cpu_policies(void)
@@ -149,3 +716,188 @@ int init_domain_cpu_policy(struct domain *d)
return 0;
}
+
+void recalculate_cpuid_policy(struct domain *d)
+{
+ struct cpu_policy *p = d->arch.cpuid;
+ const struct cpu_policy *max = is_pv_domain(d)
+ ? (IS_ENABLED(CONFIG_PV) ? &pv_max_cpu_policy : NULL)
+ : (IS_ENABLED(CONFIG_HVM) ? &hvm_max_cpu_policy : NULL);
+ uint32_t fs[FSCAPINTS], max_fs[FSCAPINTS];
+ unsigned int i;
+
+ if ( !max )
+ {
+ ASSERT_UNREACHABLE();
+ return;
+ }
+
+ p->x86_vendor = x86_cpuid_lookup_vendor(
+ p->basic.vendor_ebx, p->basic.vendor_ecx, p->basic.vendor_edx);
+
+ p->basic.max_leaf = min(p->basic.max_leaf, max->basic.max_leaf);
+ p->feat.max_subleaf = min(p->feat.max_subleaf, max->feat.max_subleaf);
+ p->extd.max_leaf = 0x80000000 | min(p->extd.max_leaf & 0xffff,
+ ((p->x86_vendor & (X86_VENDOR_AMD |
+ X86_VENDOR_HYGON))
+ ? CPUID_GUEST_NR_EXTD_AMD
+ : CPUID_GUEST_NR_EXTD_INTEL) - 1);
+
+ x86_cpu_policy_to_featureset(p, fs);
+ x86_cpu_policy_to_featureset(max, max_fs);
+
+ if ( is_hvm_domain(d) )
+ {
+ /*
+ * HVM domains using Shadow paging have further restrictions on their
+ * available paging features.
+ */
+ if ( !hap_enabled(d) )
+ {
+ for ( i = 0; i < ARRAY_SIZE(max_fs); i++ )
+ max_fs[i] &= hvm_shadow_max_featuremask[i];
+ }
+
+ /* Hide nested-virt if it hasn't been explicitly configured. */
+ if ( !nestedhvm_enabled(d) )
+ {
+ __clear_bit(X86_FEATURE_VMX, max_fs);
+ __clear_bit(X86_FEATURE_SVM, max_fs);
+ }
+ }
+
+ /*
+ * Allow the toolstack to set HTT, X2APIC and CMP_LEGACY. These bits
+ * affect how to interpret topology information in other cpuid leaves.
+ */
+ __set_bit(X86_FEATURE_HTT, max_fs);
+ __set_bit(X86_FEATURE_X2APIC, max_fs);
+ __set_bit(X86_FEATURE_CMP_LEGACY, max_fs);
+
+ /*
+ * 32bit PV domains can't use any Long Mode features, and cannot use
+ * SYSCALL on non-AMD hardware.
+ */
+ if ( is_pv_32bit_domain(d) )
+ {
+ __clear_bit(X86_FEATURE_LM, max_fs);
+ if ( !(boot_cpu_data.x86_vendor & (X86_VENDOR_AMD | X86_VENDOR_HYGON)) )
+ __clear_bit(X86_FEATURE_SYSCALL, max_fs);
+ }
+
+ /* Clamp the toolstack's choices to reality. */
+ for ( i = 0; i < ARRAY_SIZE(fs); i++ )
+ fs[i] &= max_fs[i];
+
+ if ( p->basic.max_leaf < XSTATE_CPUID )
+ __clear_bit(X86_FEATURE_XSAVE, fs);
+
+ sanitise_featureset(fs);
+
+ /* Fold host's FDP_EXCP_ONLY and NO_FPU_SEL into guest's view. */
+ fs[FEATURESET_7b0] &= ~(cpufeat_mask(X86_FEATURE_FDP_EXCP_ONLY) |
+ cpufeat_mask(X86_FEATURE_NO_FPU_SEL));
+ fs[FEATURESET_7b0] |= (host_cpu_policy.feat._7b0 &
+ (cpufeat_mask(X86_FEATURE_FDP_EXCP_ONLY) |
+ cpufeat_mask(X86_FEATURE_NO_FPU_SEL)));
+
+ x86_cpu_featureset_to_policy(fs, p);
+
+ /* Pass host cacheline size through to guests. */
+ p->basic.clflush_size = max->basic.clflush_size;
+
+ p->extd.maxphysaddr = min(p->extd.maxphysaddr, max->extd.maxphysaddr);
+ p->extd.maxphysaddr = min_t(uint8_t, p->extd.maxphysaddr,
+ paging_max_paddr_bits(d));
+ p->extd.maxphysaddr = max_t(uint8_t, p->extd.maxphysaddr,
+ (p->basic.pae || p->basic.pse36) ? 36 : 32);
+
+ p->extd.maxlinaddr = p->extd.lm ? 48 : 32;
+
+ recalculate_xstate(p);
+ recalculate_misc(p);
+
+ for ( i = 0; i < ARRAY_SIZE(p->cache.raw); ++i )
+ {
+ if ( p->cache.subleaf[i].type >= 1 &&
+ p->cache.subleaf[i].type <= 3 )
+ {
+ /* Subleaf has a valid cache type. Zero reserved fields. */
+ p->cache.raw[i].a &= 0xffffc3ffu;
+ p->cache.raw[i].d &= 0x00000007u;
+ }
+ else
+ {
+ /* Subleaf is not valid. Zero the rest of the union. */
+ zero_leaves(p->cache.raw, i, ARRAY_SIZE(p->cache.raw) - 1);
+ break;
+ }
+ }
+
+ if ( vpmu_mode == XENPMU_MODE_OFF ||
+ ((vpmu_mode & XENPMU_MODE_ALL) && !is_hardware_domain(d)) )
+ p->basic.raw[0xa] = EMPTY_LEAF;
+
+ if ( !p->extd.svm )
+ p->extd.raw[0xa] = EMPTY_LEAF;
+
+ if ( !p->extd.page1gb )
+ p->extd.raw[0x19] = EMPTY_LEAF;
+}
+
+void __init init_dom0_cpuid_policy(struct domain *d)
+{
+ struct cpu_policy *p = d->arch.cpuid;
+
+ /* dom0 can't migrate. Give it ITSC if available. */
+ if ( cpu_has_itsc )
+ p->extd.itsc = true;
+
+ /*
+ * Expose the "hardware speculation behaviour" bits of ARCH_CAPS to dom0,
+ * so dom0 can turn off workarounds as appropriate. Temporary, until the
+ * domain policy logic gains a better understanding of MSRs.
+ */
+ if ( cpu_has_arch_caps )
+ p->feat.arch_caps = true;
+
+ /* Apply dom0-cpuid= command line settings, if provided. */
+ if ( dom0_cpuid_cmdline )
+ {
+ uint32_t fs[FSCAPINTS];
+ unsigned int i;
+
+ x86_cpu_policy_to_featureset(p, fs);
+
+ for ( i = 0; i < ARRAY_SIZE(fs); ++i )
+ {
+ fs[i] |= dom0_enable_feat [i];
+ fs[i] &= ~dom0_disable_feat[i];
+ }
+
+ x86_cpu_featureset_to_policy(fs, p);
+
+ recalculate_cpuid_policy(d);
+ }
+}
+
+static void __init __maybe_unused build_assertions(void)
+{
+ BUILD_BUG_ON(ARRAY_SIZE(known_features) != FSCAPINTS);
+ BUILD_BUG_ON(ARRAY_SIZE(pv_max_featuremask) != FSCAPINTS);
+ BUILD_BUG_ON(ARRAY_SIZE(hvm_shadow_max_featuremask) != FSCAPINTS);
+ BUILD_BUG_ON(ARRAY_SIZE(hvm_hap_max_featuremask) != FSCAPINTS);
+ BUILD_BUG_ON(ARRAY_SIZE(deep_features) != FSCAPINTS);
+
+ /* Find some more clever allocation scheme if this trips. */
+ BUILD_BUG_ON(sizeof(struct cpu_policy) > PAGE_SIZE);
+
+ BUILD_BUG_ON(sizeof(raw_cpu_policy.basic) !=
+ sizeof(raw_cpu_policy.basic.raw));
+ BUILD_BUG_ON(sizeof(raw_cpu_policy.feat) !=
+ sizeof(raw_cpu_policy.feat.raw));
+ BUILD_BUG_ON(sizeof(raw_cpu_policy.xstate) !=
+ sizeof(raw_cpu_policy.xstate.raw));
+ BUILD_BUG_ON(sizeof(raw_cpu_policy.extd) !=
+ sizeof(raw_cpu_policy.extd.raw));
+}
diff --git a/xen/arch/x86/cpuid.c b/xen/arch/x86/cpuid.c
index 5eb5f1893516..3f20c342fde8 100644
--- a/xen/arch/x86/cpuid.c
+++ b/xen/arch/x86/cpuid.c
@@ -1,638 +1,14 @@
-#include <xen/init.h>
-#include <xen/lib.h>
-#include <xen/param.h>
#include <xen/sched.h>
-#include <xen/nospec.h>
-#include <asm/amd.h>
+#include <xen/types.h>
+
+#include <public/hvm/params.h>
+
#include <asm/cpu-policy.h>
#include <asm/cpuid.h>
-#include <asm/hvm/hvm.h>
-#include <asm/hvm/nestedhvm.h>
-#include <asm/hvm/svm/svm.h>
#include <asm/hvm/viridian.h>
-#include <asm/hvm/vmx/vmcs.h>
-#include <asm/paging.h>
-#include <asm/processor.h>
#include <asm/xstate.h>
-const uint32_t known_features[] = INIT_KNOWN_FEATURES;
-
-static const uint32_t __initconst pv_max_featuremask[] = INIT_PV_MAX_FEATURES;
-static const uint32_t hvm_shadow_max_featuremask[] = INIT_HVM_SHADOW_MAX_FEATURES;
-static const uint32_t __initconst hvm_hap_max_featuremask[] =
- INIT_HVM_HAP_MAX_FEATURES;
-static const uint32_t __initconst pv_def_featuremask[] = INIT_PV_DEF_FEATURES;
-static const uint32_t __initconst hvm_shadow_def_featuremask[] =
- INIT_HVM_SHADOW_DEF_FEATURES;
-static const uint32_t __initconst hvm_hap_def_featuremask[] =
- INIT_HVM_HAP_DEF_FEATURES;
-static const uint32_t deep_features[] = INIT_DEEP_FEATURES;
-
-static const struct feature_name {
- const char *name;
- unsigned int bit;
-} feature_names[] __initconstrel = INIT_FEATURE_NAMES;
-
-/*
- * Parse a list of cpuid feature names -> bool, calling the callback for any
- * matches found.
- *
- * always_inline, because this is init code only and we really don't want a
- * function pointer call in the middle of the loop.
- */
-static int __init always_inline parse_cpuid(
- const char *s, void (*callback)(unsigned int feat, bool val))
-{
- const char *ss;
- int val, rc = 0;
-
- do {
- const struct feature_name *lhs, *rhs, *mid = NULL /* GCC... */;
- const char *feat;
-
- ss = strchr(s, ',');
- if ( !ss )
- ss = strchr(s, '\0');
-
- /* Skip the 'no-' prefix for name comparisons. */
- feat = s;
- if ( strncmp(s, "no-", 3) == 0 )
- feat += 3;
-
- /* (Re)initalise lhs and rhs for binary search. */
- lhs = feature_names;
- rhs = feature_names + ARRAY_SIZE(feature_names);
-
- while ( lhs < rhs )
- {
- int res;
-
- mid = lhs + (rhs - lhs) / 2;
- res = cmdline_strcmp(feat, mid->name);
-
- if ( res < 0 )
- {
- rhs = mid;
- continue;
- }
- if ( res > 0 )
- {
- lhs = mid + 1;
- continue;
- }
-
- if ( (val = parse_boolean(mid->name, s, ss)) >= 0 )
- {
- callback(mid->bit, val);
- mid = NULL;
- }
-
- break;
- }
-
- /*
- * Mid being NULL means that the name and boolean were successfully
- * identified. Everything else is an error.
- */
- if ( mid )
- rc = -EINVAL;
-
- s = ss + 1;
- } while ( *ss );
-
- return rc;
-}
-
-static void __init cf_check _parse_xen_cpuid(unsigned int feat, bool val)
-{
- if ( !val )
- setup_clear_cpu_cap(feat);
- else if ( feat == X86_FEATURE_RDRAND &&
- (cpuid_ecx(1) & cpufeat_mask(X86_FEATURE_RDRAND)) )
- setup_force_cpu_cap(X86_FEATURE_RDRAND);
-}
-
-static int __init cf_check parse_xen_cpuid(const char *s)
-{
- return parse_cpuid(s, _parse_xen_cpuid);
-}
-custom_param("cpuid", parse_xen_cpuid);
-
-static bool __initdata dom0_cpuid_cmdline;
-static uint32_t __initdata dom0_enable_feat[FSCAPINTS];
-static uint32_t __initdata dom0_disable_feat[FSCAPINTS];
-
-static void __init cf_check _parse_dom0_cpuid(unsigned int feat, bool val)
-{
- __set_bit (feat, val ? dom0_enable_feat : dom0_disable_feat);
- __clear_bit(feat, val ? dom0_disable_feat : dom0_enable_feat );
-}
-
-static int __init cf_check parse_dom0_cpuid(const char *s)
-{
- dom0_cpuid_cmdline = true;
-
- return parse_cpuid(s, _parse_dom0_cpuid);
-}
-custom_param("dom0-cpuid", parse_dom0_cpuid);
-
#define EMPTY_LEAF ((struct cpuid_leaf){})
-static void zero_leaves(struct cpuid_leaf *l,
- unsigned int first, unsigned int last)
-{
- memset(&l[first], 0, sizeof(*l) * (last - first + 1));
-}
-
-static void sanitise_featureset(uint32_t *fs)
-{
- /* for_each_set_bit() uses unsigned longs. Extend with zeroes. */
- uint32_t disabled_features[
- ROUNDUP(FSCAPINTS, sizeof(unsigned long)/sizeof(uint32_t))] = {};
- unsigned int i;
-
- for ( i = 0; i < FSCAPINTS; ++i )
- {
- /* Clamp to known mask. */
- fs[i] &= known_features[i];
-
- /*
- * Identify which features with deep dependencies have been
- * disabled.
- */
- disabled_features[i] = ~fs[i] & deep_features[i];
- }
-
- for_each_set_bit(i, (void *)disabled_features,
- sizeof(disabled_features) * 8)
- {
- const uint32_t *dfs = x86_cpuid_lookup_deep_deps(i);
- unsigned int j;
-
- ASSERT(dfs); /* deep_features[] should guarentee this. */
-
- for ( j = 0; j < FSCAPINTS; ++j )
- {
- fs[j] &= ~dfs[j];
- disabled_features[j] &= ~dfs[j];
- }
- }
-}
-
-static void recalculate_xstate(struct cpuid_policy *p)
-{
- uint64_t xstates = XSTATE_FP_SSE;
- uint32_t xstate_size = XSTATE_AREA_MIN_SIZE;
- unsigned int i, Da1 = p->xstate.Da1;
-
- /*
- * The Da1 leaf is the only piece of information preserved in the common
- * case. Everything else is derived from other feature state.
- */
- memset(&p->xstate, 0, sizeof(p->xstate));
-
- if ( !p->basic.xsave )
- return;
-
- if ( p->basic.avx )
- {
- xstates |= X86_XCR0_YMM;
- xstate_size = max(xstate_size,
- xstate_offsets[X86_XCR0_YMM_POS] +
- xstate_sizes[X86_XCR0_YMM_POS]);
- }
-
- if ( p->feat.mpx )
- {
- xstates |= X86_XCR0_BNDREGS | X86_XCR0_BNDCSR;
- xstate_size = max(xstate_size,
- xstate_offsets[X86_XCR0_BNDCSR_POS] +
- xstate_sizes[X86_XCR0_BNDCSR_POS]);
- }
-
- if ( p->feat.avx512f )
- {
- xstates |= X86_XCR0_OPMASK | X86_XCR0_ZMM | X86_XCR0_HI_ZMM;
- xstate_size = max(xstate_size,
- xstate_offsets[X86_XCR0_HI_ZMM_POS] +
- xstate_sizes[X86_XCR0_HI_ZMM_POS]);
- }
-
- if ( p->feat.pku )
- {
- xstates |= X86_XCR0_PKRU;
- xstate_size = max(xstate_size,
- xstate_offsets[X86_XCR0_PKRU_POS] +
- xstate_sizes[X86_XCR0_PKRU_POS]);
- }
-
- p->xstate.max_size = xstate_size;
- p->xstate.xcr0_low = xstates & ~XSTATE_XSAVES_ONLY;
- p->xstate.xcr0_high = (xstates & ~XSTATE_XSAVES_ONLY) >> 32;
-
- p->xstate.Da1 = Da1;
- if ( p->xstate.xsaves )
- {
- p->xstate.xss_low = xstates & XSTATE_XSAVES_ONLY;
- p->xstate.xss_high = (xstates & XSTATE_XSAVES_ONLY) >> 32;
- }
- else
- xstates &= ~XSTATE_XSAVES_ONLY;
-
- for ( i = 2; i < min(63ul, ARRAY_SIZE(p->xstate.comp)); ++i )
- {
- uint64_t curr_xstate = 1ul << i;
-
- if ( !(xstates & curr_xstate) )
- continue;
-
- p->xstate.comp[i].size = xstate_sizes[i];
- p->xstate.comp[i].offset = xstate_offsets[i];
- p->xstate.comp[i].xss = curr_xstate & XSTATE_XSAVES_ONLY;
- p->xstate.comp[i].align = curr_xstate & xstate_align;
- }
-}
-
-/*
- * Misc adjustments to the policy. Mostly clobbering reserved fields and
- * duplicating shared fields. Intentionally hidden fields are annotated.
- */
-static void recalculate_misc(struct cpuid_policy *p)
-{
- p->basic.raw_fms &= 0x0fff0fff; /* Clobber Processor Type on Intel. */
- p->basic.apic_id = 0; /* Dynamic. */
-
- p->basic.raw[0x5] = EMPTY_LEAF; /* MONITOR not exposed to guests. */
- p->basic.raw[0x6] = EMPTY_LEAF; /* Therm/Power not exposed to guests. */
-
- p->basic.raw[0x8] = EMPTY_LEAF;
-
- /* TODO: Rework topology logic. */
- memset(p->topo.raw, 0, sizeof(p->topo.raw));
-
- p->basic.raw[0xc] = EMPTY_LEAF;
-
- p->extd.e1d &= ~CPUID_COMMON_1D_FEATURES;
-
- /* Most of Power/RAS hidden from guests. */
- p->extd.raw[0x7].a = p->extd.raw[0x7].b = p->extd.raw[0x7].c = 0;
-
- p->extd.raw[0x8].d = 0;
-
- switch ( p->x86_vendor )
- {
- case X86_VENDOR_INTEL:
- p->basic.l2_nr_queries = 1; /* Fixed to 1 query. */
- p->basic.raw[0x3] = EMPTY_LEAF; /* PSN - always hidden. */
- p->basic.raw[0x9] = EMPTY_LEAF; /* DCA - always hidden. */
-
- p->extd.vendor_ebx = 0;
- p->extd.vendor_ecx = 0;
- p->extd.vendor_edx = 0;
-
- p->extd.raw[0x1].a = p->extd.raw[0x1].b = 0;
-
- p->extd.raw[0x5] = EMPTY_LEAF;
- p->extd.raw[0x6].a = p->extd.raw[0x6].b = p->extd.raw[0x6].d = 0;
-
- p->extd.raw[0x8].a &= 0x0000ffff;
- p->extd.raw[0x8].c = 0;
- break;
-
- case X86_VENDOR_AMD:
- case X86_VENDOR_HYGON:
- zero_leaves(p->basic.raw, 0x2, 0x3);
- memset(p->cache.raw, 0, sizeof(p->cache.raw));
- zero_leaves(p->basic.raw, 0x9, 0xa);
-
- p->extd.vendor_ebx = p->basic.vendor_ebx;
- p->extd.vendor_ecx = p->basic.vendor_ecx;
- p->extd.vendor_edx = p->basic.vendor_edx;
-
- p->extd.raw_fms = p->basic.raw_fms;
- p->extd.raw[0x1].b &= 0xff00ffff;
- p->extd.e1d |= p->basic._1d & CPUID_COMMON_1D_FEATURES;
-
- p->extd.raw[0x8].a &= 0x0000ffff; /* GuestMaxPhysAddr hidden. */
- p->extd.raw[0x8].c &= 0x0003f0ff;
-
- p->extd.raw[0x9] = EMPTY_LEAF;
-
- zero_leaves(p->extd.raw, 0xb, 0x18);
-
- /* 0x19 - TLB details. Pass through. */
- /* 0x1a - Perf hints. Pass through. */
-
- p->extd.raw[0x1b] = EMPTY_LEAF; /* IBS - not supported. */
- p->extd.raw[0x1c] = EMPTY_LEAF; /* LWP - not supported. */
- p->extd.raw[0x1d] = EMPTY_LEAF; /* TopoExt Cache */
- p->extd.raw[0x1e] = EMPTY_LEAF; /* TopoExt APIC ID/Core/Node */
- p->extd.raw[0x1f] = EMPTY_LEAF; /* SEV */
- p->extd.raw[0x20] = EMPTY_LEAF; /* Platform QoS */
- break;
- }
-}
-
-static void __init calculate_raw_policy(void)
-{
- struct cpuid_policy *p = &raw_cpu_policy;
-
- x86_cpuid_policy_fill_native(p);
-
- /* Nothing good will come from Xen and libx86 disagreeing on vendor. */
- ASSERT(p->x86_vendor == boot_cpu_data.x86_vendor);
-}
-
-static void __init calculate_host_policy(void)
-{
- struct cpuid_policy *p = &host_cpu_policy;
- unsigned int max_extd_leaf;
-
- *p = raw_cpu_policy;
-
- p->basic.max_leaf =
- min_t(uint32_t, p->basic.max_leaf, ARRAY_SIZE(p->basic.raw) - 1);
- p->feat.max_subleaf =
- min_t(uint32_t, p->feat.max_subleaf, ARRAY_SIZE(p->feat.raw) - 1);
-
- max_extd_leaf = p->extd.max_leaf;
-
- /*
- * For AMD/Hygon hardware before Zen3, we unilaterally modify LFENCE to be
- * dispatch serialising for Spectre mitigations. Extend max_extd_leaf
- * beyond what hardware supports, to include the feature leaf containing
- * this information.
- */
- if ( cpu_has_lfence_dispatch )
- max_extd_leaf = max(max_extd_leaf, 0x80000021);
-
- p->extd.max_leaf = 0x80000000 | min_t(uint32_t, max_extd_leaf & 0xffff,
- ARRAY_SIZE(p->extd.raw) - 1);
-
- x86_cpu_featureset_to_policy(boot_cpu_data.x86_capability, p);
- recalculate_xstate(p);
- recalculate_misc(p);
-
- /* When vPMU is disabled, drop it from the host policy. */
- if ( vpmu_mode == XENPMU_MODE_OFF )
- p->basic.raw[0xa] = EMPTY_LEAF;
-
- if ( p->extd.svm )
- {
- /* Clamp to implemented features which require hardware support. */
- p->extd.raw[0xa].d &= ((1u << SVM_FEATURE_NPT) |
- (1u << SVM_FEATURE_LBRV) |
- (1u << SVM_FEATURE_NRIPS) |
- (1u << SVM_FEATURE_PAUSEFILTER) |
- (1u << SVM_FEATURE_DECODEASSISTS));
- /* Enable features which are always emulated. */
- p->extd.raw[0xa].d |= ((1u << SVM_FEATURE_VMCBCLEAN) |
- (1u << SVM_FEATURE_TSCRATEMSR));
- }
-}
-
-static void __init guest_common_default_feature_adjustments(uint32_t *fs)
-{
- /*
- * IvyBridge client parts suffer from leakage of RDRAND data due to SRBDS
- * (XSA-320 / CVE-2020-0543), and won't be receiving microcode to
- * compensate.
- *
- * Mitigate by hiding RDRAND from guests by default, unless explicitly
- * overridden on the Xen command line (cpuid=rdrand). Irrespective of the
- * default setting, guests can use RDRAND if explicitly enabled
- * (cpuid="host,rdrand=1") in the VM's config file, and VMs which were
- * previously using RDRAND can migrate in.
- */
- if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
- boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x3a &&
- cpu_has_rdrand && !is_forced_cpu_cap(X86_FEATURE_RDRAND) )
- __clear_bit(X86_FEATURE_RDRAND, fs);
-
- /*
- * On certain hardware, speculative or errata workarounds can result in
- * TSX being placed in "force-abort" mode, where it doesn't actually
- * function as expected, but is technically compatible with the ISA.
- *
- * Do not advertise RTM to guests by default if it won't actually work.
- */
- if ( rtm_disabled )
- __clear_bit(X86_FEATURE_RTM, fs);
-}
-
-static void __init guest_common_feature_adjustments(uint32_t *fs)
-{
- /* Unconditionally claim to be able to set the hypervisor bit. */
- __set_bit(X86_FEATURE_HYPERVISOR, fs);
-
- /*
- * If IBRS is offered to the guest, unconditionally offer STIBP. It is a
- * nop on non-HT hardware, and has this behaviour to make heterogeneous
- * setups easier to manage.
- */
- if ( test_bit(X86_FEATURE_IBRSB, fs) )
- __set_bit(X86_FEATURE_STIBP, fs);
- if ( test_bit(X86_FEATURE_IBRS, fs) )
- __set_bit(X86_FEATURE_AMD_STIBP, fs);
-
- /*
- * On hardware which supports IBRS/IBPB, we can offer IBPB independently
- * of IBRS by using the AMD feature bit. An administrator may wish for
- * performance reasons to offer IBPB without IBRS.
- */
- if ( host_cpu_policy.feat.ibrsb )
- __set_bit(X86_FEATURE_IBPB, fs);
-}
-
-static void __init calculate_pv_max_policy(void)
-{
- struct cpuid_policy *p = &pv_max_cpu_policy;
- uint32_t pv_featureset[FSCAPINTS];
- unsigned int i;
-
- *p = host_cpu_policy;
- x86_cpu_policy_to_featureset(p, pv_featureset);
-
- for ( i = 0; i < ARRAY_SIZE(pv_featureset); ++i )
- pv_featureset[i] &= pv_max_featuremask[i];
-
- /*
- * If Xen isn't virtualising MSR_SPEC_CTRL for PV guests (functional
- * availability, or admin choice), hide the feature.
- */
- if ( !boot_cpu_has(X86_FEATURE_SC_MSR_PV) )
- {
- __clear_bit(X86_FEATURE_IBRSB, pv_featureset);
- __clear_bit(X86_FEATURE_IBRS, pv_featureset);
- }
-
- guest_common_feature_adjustments(pv_featureset);
-
- sanitise_featureset(pv_featureset);
- x86_cpu_featureset_to_policy(pv_featureset, p);
- recalculate_xstate(p);
-
- p->extd.raw[0xa] = EMPTY_LEAF; /* No SVM for PV guests. */
-}
-
-static void __init calculate_pv_def_policy(void)
-{
- struct cpuid_policy *p = &pv_def_cpu_policy;
- uint32_t pv_featureset[FSCAPINTS];
- unsigned int i;
-
- *p = pv_max_cpu_policy;
- x86_cpu_policy_to_featureset(p, pv_featureset);
-
- for ( i = 0; i < ARRAY_SIZE(pv_featureset); ++i )
- pv_featureset[i] &= pv_def_featuremask[i];
-
- guest_common_feature_adjustments(pv_featureset);
- guest_common_default_feature_adjustments(pv_featureset);
-
- sanitise_featureset(pv_featureset);
- x86_cpu_featureset_to_policy(pv_featureset, p);
- recalculate_xstate(p);
-}
-
-static void __init calculate_hvm_max_policy(void)
-{
- struct cpuid_policy *p = &hvm_max_cpu_policy;
- uint32_t hvm_featureset[FSCAPINTS];
- unsigned int i;
- const uint32_t *hvm_featuremask;
-
- *p = host_cpu_policy;
- x86_cpu_policy_to_featureset(p, hvm_featureset);
-
- hvm_featuremask = hvm_hap_supported() ?
- hvm_hap_max_featuremask : hvm_shadow_max_featuremask;
-
- for ( i = 0; i < ARRAY_SIZE(hvm_featureset); ++i )
- hvm_featureset[i] &= hvm_featuremask[i];
-
- /*
- * Xen can provide an (x2)APIC emulation to HVM guests even if the host's
- * (x2)APIC isn't enabled.
- */
- __set_bit(X86_FEATURE_APIC, hvm_featureset);
- __set_bit(X86_FEATURE_X2APIC, hvm_featureset);
-
- /*
- * We don't support EFER.LMSLE at all. AMD has dropped the feature from
- * hardware and allocated a CPUID bit to indicate its absence.
- */
- __set_bit(X86_FEATURE_NO_LMSL, hvm_featureset);
-
- /*
- * On AMD, PV guests are entirely unable to use SYSENTER as Xen runs in
- * long mode (and init_amd() has cleared it out of host capabilities), but
- * HVM guests are able if running in protected mode.
- */
- if ( (boot_cpu_data.x86_vendor & (X86_VENDOR_AMD | X86_VENDOR_HYGON)) &&
- raw_cpu_policy.basic.sep )
- __set_bit(X86_FEATURE_SEP, hvm_featureset);
-
- /*
- * VIRT_SSBD is exposed in the default policy as a result of
- * amd_virt_spec_ctrl being set, it also needs exposing in the max policy.
- */
- if ( amd_virt_spec_ctrl )
- __set_bit(X86_FEATURE_VIRT_SSBD, hvm_featureset);
-
- /*
- * If Xen isn't virtualising MSR_SPEC_CTRL for HVM guests (functional
- * availability, or admin choice), hide the feature.
- */
- if ( !boot_cpu_has(X86_FEATURE_SC_MSR_HVM) )
- {
- __clear_bit(X86_FEATURE_IBRSB, hvm_featureset);
- __clear_bit(X86_FEATURE_IBRS, hvm_featureset);
- }
- else if ( boot_cpu_has(X86_FEATURE_AMD_SSBD) )
- /*
- * If SPEC_CTRL.SSBD is available VIRT_SPEC_CTRL.SSBD can be exposed
- * and implemented using the former. Expose in the max policy only as
- * the preference is for guests to use SPEC_CTRL.SSBD if available.
- */
- __set_bit(X86_FEATURE_VIRT_SSBD, hvm_featureset);
-
- /*
- * With VT-x, some features are only supported by Xen if dedicated
- * hardware support is also available.
- */
- if ( cpu_has_vmx )
- {
- if ( !cpu_has_vmx_mpx )
- __clear_bit(X86_FEATURE_MPX, hvm_featureset);
-
- if ( !cpu_has_vmx_xsaves )
- __clear_bit(X86_FEATURE_XSAVES, hvm_featureset);
- }
-
- /*
- * Xen doesn't use PKS, so the guest support for it has opted to not use
- * the VMCS load/save controls for efficiency reasons. This depends on
- * the exact vmentry/exit behaviour, so don't expose PKS in other
- * situations until someone has cross-checked the behaviour for safety.
- */
- if ( !cpu_has_vmx )
- __clear_bit(X86_FEATURE_PKS, hvm_featureset);
-
- guest_common_feature_adjustments(hvm_featureset);
-
- sanitise_featureset(hvm_featureset);
- x86_cpu_featureset_to_policy(hvm_featureset, p);
- recalculate_xstate(p);
-}
-
-static void __init calculate_hvm_def_policy(void)
-{
- struct cpuid_policy *p = &hvm_def_cpu_policy;
- uint32_t hvm_featureset[FSCAPINTS];
- unsigned int i;
- const uint32_t *hvm_featuremask;
-
- *p = hvm_max_cpu_policy;
- x86_cpu_policy_to_featureset(p, hvm_featureset);
-
- hvm_featuremask = hvm_hap_supported() ?
- hvm_hap_def_featuremask : hvm_shadow_def_featuremask;
-
- for ( i = 0; i < ARRAY_SIZE(hvm_featureset); ++i )
- hvm_featureset[i] &= hvm_featuremask[i];
-
- guest_common_feature_adjustments(hvm_featureset);
- guest_common_default_feature_adjustments(hvm_featureset);
-
- /*
- * Only expose VIRT_SSBD if AMD_SSBD is not available, and thus
- * amd_virt_spec_ctrl is set.
- */
- if ( amd_virt_spec_ctrl )
- __set_bit(X86_FEATURE_VIRT_SSBD, hvm_featureset);
-
- sanitise_featureset(hvm_featureset);
- x86_cpu_featureset_to_policy(hvm_featureset, p);
- recalculate_xstate(p);
-}
-
-void __init init_guest_cpuid(void)
-{
- calculate_raw_policy();
- calculate_host_policy();
-
- if ( IS_ENABLED(CONFIG_PV) )
- {
- calculate_pv_max_policy();
- calculate_pv_def_policy();
- }
-
- if ( hvm_enabled )
- {
- calculate_hvm_max_policy();
- calculate_hvm_def_policy();
- }
-}
bool recheck_cpu_features(unsigned int cpu)
{
@@ -656,170 +32,6 @@ bool recheck_cpu_features(unsigned int cpu)
return okay;
}
-void recalculate_cpuid_policy(struct domain *d)
-{
- struct cpuid_policy *p = d->arch.cpuid;
- const struct cpuid_policy *max = is_pv_domain(d)
- ? (IS_ENABLED(CONFIG_PV) ? &pv_max_cpu_policy : NULL)
- : (IS_ENABLED(CONFIG_HVM) ? &hvm_max_cpu_policy : NULL);
- uint32_t fs[FSCAPINTS], max_fs[FSCAPINTS];
- unsigned int i;
-
- if ( !max )
- {
- ASSERT_UNREACHABLE();
- return;
- }
-
- p->x86_vendor = x86_cpuid_lookup_vendor(
- p->basic.vendor_ebx, p->basic.vendor_ecx, p->basic.vendor_edx);
-
- p->basic.max_leaf = min(p->basic.max_leaf, max->basic.max_leaf);
- p->feat.max_subleaf = min(p->feat.max_subleaf, max->feat.max_subleaf);
- p->extd.max_leaf = 0x80000000 | min(p->extd.max_leaf & 0xffff,
- ((p->x86_vendor & (X86_VENDOR_AMD |
- X86_VENDOR_HYGON))
- ? CPUID_GUEST_NR_EXTD_AMD
- : CPUID_GUEST_NR_EXTD_INTEL) - 1);
-
- x86_cpu_policy_to_featureset(p, fs);
- x86_cpu_policy_to_featureset(max, max_fs);
-
- if ( is_hvm_domain(d) )
- {
- /*
- * HVM domains using Shadow paging have further restrictions on their
- * available paging features.
- */
- if ( !hap_enabled(d) )
- {
- for ( i = 0; i < ARRAY_SIZE(max_fs); i++ )
- max_fs[i] &= hvm_shadow_max_featuremask[i];
- }
-
- /* Hide nested-virt if it hasn't been explicitly configured. */
- if ( !nestedhvm_enabled(d) )
- {
- __clear_bit(X86_FEATURE_VMX, max_fs);
- __clear_bit(X86_FEATURE_SVM, max_fs);
- }
- }
-
- /*
- * Allow the toolstack to set HTT, X2APIC and CMP_LEGACY. These bits
- * affect how to interpret topology information in other cpuid leaves.
- */
- __set_bit(X86_FEATURE_HTT, max_fs);
- __set_bit(X86_FEATURE_X2APIC, max_fs);
- __set_bit(X86_FEATURE_CMP_LEGACY, max_fs);
-
- /*
- * 32bit PV domains can't use any Long Mode features, and cannot use
- * SYSCALL on non-AMD hardware.
- */
- if ( is_pv_32bit_domain(d) )
- {
- __clear_bit(X86_FEATURE_LM, max_fs);
- if ( !(boot_cpu_data.x86_vendor & (X86_VENDOR_AMD | X86_VENDOR_HYGON)) )
- __clear_bit(X86_FEATURE_SYSCALL, max_fs);
- }
-
- /* Clamp the toolstacks choices to reality. */
- for ( i = 0; i < ARRAY_SIZE(fs); i++ )
- fs[i] &= max_fs[i];
-
- if ( p->basic.max_leaf < XSTATE_CPUID )
- __clear_bit(X86_FEATURE_XSAVE, fs);
-
- sanitise_featureset(fs);
-
- /* Fold host's FDP_EXCP_ONLY and NO_FPU_SEL into guest's view. */
- fs[FEATURESET_7b0] &= ~(cpufeat_mask(X86_FEATURE_FDP_EXCP_ONLY) |
- cpufeat_mask(X86_FEATURE_NO_FPU_SEL));
- fs[FEATURESET_7b0] |= (host_cpu_policy.feat._7b0 &
- (cpufeat_mask(X86_FEATURE_FDP_EXCP_ONLY) |
- cpufeat_mask(X86_FEATURE_NO_FPU_SEL)));
-
- x86_cpu_featureset_to_policy(fs, p);
-
- /* Pass host cacheline size through to guests. */
- p->basic.clflush_size = max->basic.clflush_size;
-
- p->extd.maxphysaddr = min(p->extd.maxphysaddr, max->extd.maxphysaddr);
- p->extd.maxphysaddr = min_t(uint8_t, p->extd.maxphysaddr,
- paging_max_paddr_bits(d));
- p->extd.maxphysaddr = max_t(uint8_t, p->extd.maxphysaddr,
- (p->basic.pae || p->basic.pse36) ? 36 : 32);
-
- p->extd.maxlinaddr = p->extd.lm ? 48 : 32;
-
- recalculate_xstate(p);
- recalculate_misc(p);
-
- for ( i = 0; i < ARRAY_SIZE(p->cache.raw); ++i )
- {
- if ( p->cache.subleaf[i].type >= 1 &&
- p->cache.subleaf[i].type <= 3 )
- {
- /* Subleaf has a valid cache type. Zero reserved fields. */
- p->cache.raw[i].a &= 0xffffc3ffu;
- p->cache.raw[i].d &= 0x00000007u;
- }
- else
- {
- /* Subleaf is not valid. Zero the rest of the union. */
- zero_leaves(p->cache.raw, i, ARRAY_SIZE(p->cache.raw) - 1);
- break;
- }
- }
-
- if ( vpmu_mode == XENPMU_MODE_OFF ||
- ((vpmu_mode & XENPMU_MODE_ALL) && !is_hardware_domain(d)) )
- p->basic.raw[0xa] = EMPTY_LEAF;
-
- if ( !p->extd.svm )
- p->extd.raw[0xa] = EMPTY_LEAF;
-
- if ( !p->extd.page1gb )
- p->extd.raw[0x19] = EMPTY_LEAF;
-}
-
-void __init init_dom0_cpuid_policy(struct domain *d)
-{
- struct cpuid_policy *p = d->arch.cpuid;
-
- /* dom0 can't migrate. Give it ITSC if available. */
- if ( cpu_has_itsc )
- p->extd.itsc = true;
-
- /*
- * Expose the "hardware speculation behaviour" bits of ARCH_CAPS to dom0,
- * so dom0 can turn off workarounds as appropriate. Temporary, until the
- * domain policy logic gains a better understanding of MSRs.
- */
- if ( cpu_has_arch_caps )
- p->feat.arch_caps = true;
-
- /* Apply dom0-cpuid= command line settings, if provided. */
- if ( dom0_cpuid_cmdline )
- {
- uint32_t fs[FSCAPINTS];
- unsigned int i;
-
- x86_cpu_policy_to_featureset(p, fs);
-
- for ( i = 0; i < ARRAY_SIZE(fs); ++i )
- {
- fs[i] |= dom0_enable_feat [i];
- fs[i] &= ~dom0_disable_feat[i];
- }
-
- x86_cpu_featureset_to_policy(fs, p);
-
- recalculate_cpuid_policy(d);
- }
-}
-
void guest_cpuid(const struct vcpu *v, uint32_t leaf,
uint32_t subleaf, struct cpuid_leaf *res)
{
@@ -1190,27 +402,6 @@ void guest_cpuid(const struct vcpu *v, uint32_t leaf,
}
}
-static void __init __maybe_unused build_assertions(void)
-{
- BUILD_BUG_ON(ARRAY_SIZE(known_features) != FSCAPINTS);
- BUILD_BUG_ON(ARRAY_SIZE(pv_max_featuremask) != FSCAPINTS);
- BUILD_BUG_ON(ARRAY_SIZE(hvm_shadow_max_featuremask) != FSCAPINTS);
- BUILD_BUG_ON(ARRAY_SIZE(hvm_hap_max_featuremask) != FSCAPINTS);
- BUILD_BUG_ON(ARRAY_SIZE(deep_features) != FSCAPINTS);
-
- /* Find some more clever allocation scheme if this trips. */
- BUILD_BUG_ON(sizeof(struct cpuid_policy) > PAGE_SIZE);
-
- BUILD_BUG_ON(sizeof(raw_cpu_policy.basic) !=
- sizeof(raw_cpu_policy.basic.raw));
- BUILD_BUG_ON(sizeof(raw_cpu_policy.feat) !=
- sizeof(raw_cpu_policy.feat.raw));
- BUILD_BUG_ON(sizeof(raw_cpu_policy.xstate) !=
- sizeof(raw_cpu_policy.xstate.raw));
- BUILD_BUG_ON(sizeof(raw_cpu_policy.extd) !=
- sizeof(raw_cpu_policy.extd.raw));
-}
-
/*
* Local variables:
* mode: C
diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
index d326fa1c0136..675c523d9909 100644
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -77,7 +77,6 @@
#include <public/memory.h>
#include <public/vm_event.h>
#include <public/arch-x86/cpuid.h>
-#include <asm/cpuid.h>
#include <compat/hvm/hvm_op.h>
diff --git a/xen/arch/x86/include/asm/cpu-policy.h b/xen/arch/x86/include/asm/cpu-policy.h
index 13e2a1f86d13..b361537a602b 100644
--- a/xen/arch/x86/include/asm/cpu-policy.h
+++ b/xen/arch/x86/include/asm/cpu-policy.h
@@ -18,4 +18,10 @@ void init_guest_cpu_policies(void);
/* Allocate and initialise a CPU policy suitable for the domain. */
int init_domain_cpu_policy(struct domain *d);
+/* Apply dom0-specific tweaks to the CPUID policy. */
+void init_dom0_cpuid_policy(struct domain *d);
+
+/* Clamp the CPUID policy to reality. */
+void recalculate_cpuid_policy(struct domain *d);
+
#endif /* X86_CPU_POLICY_H */
diff --git a/xen/arch/x86/include/asm/cpuid.h b/xen/arch/x86/include/asm/cpuid.h
index 7f81b998ce01..b32ba0bbfe5c 100644
--- a/xen/arch/x86/include/asm/cpuid.h
+++ b/xen/arch/x86/include/asm/cpuid.h
@@ -8,14 +8,10 @@
#include <xen/kernel.h>
#include <xen/percpu.h>
-#include <xen/lib/x86/cpu-policy.h>
-
#include <public/sysctl.h>
extern const uint32_t known_features[FSCAPINTS];
-void init_guest_cpuid(void);
-
/*
* Expected levelling capabilities (given cpuid vendor/family information),
* and levelling capabilities actually available (given MSR probing).
@@ -49,13 +45,8 @@ extern struct cpuidmasks cpuidmask_defaults;
/* Check that all previously present features are still available. */
bool recheck_cpu_features(unsigned int cpu);
-/* Apply dom0-specific tweaks to the CPUID policy. */
-void init_dom0_cpuid_policy(struct domain *d);
-
-/* Clamp the CPUID policy to reality. */
-void recalculate_cpuid_policy(struct domain *d);
-
struct vcpu;
+struct cpuid_leaf;
void guest_cpuid(const struct vcpu *v, uint32_t leaf,
uint32_t subleaf, struct cpuid_leaf *res);
diff --git a/xen/arch/x86/pv/domain.c b/xen/arch/x86/pv/domain.c
index f94f28c8e271..95492715d8ad 100644
--- a/xen/arch/x86/pv/domain.c
+++ b/xen/arch/x86/pv/domain.c
@@ -10,6 +10,7 @@
#include <xen/param.h>
#include <xen/sched.h>
+#include <asm/cpu-policy.h>
#include <asm/cpufeature.h>
#include <asm/invpcid.h>
#include <asm/spec_ctrl.h>
diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c
index 51a19b9019eb..08ade715a3ce 100644
--- a/xen/arch/x86/setup.c
+++ b/xen/arch/x86/setup.c
@@ -51,7 +51,6 @@
#include <asm/alternative.h>
#include <asm/mc146818rtc.h>
#include <asm/cpu-policy.h>
-#include <asm/cpuid.h>
#include <asm/spec_ctrl.h>
#include <asm/guest.h>
#include <asm/microcode.h>
@@ -1991,7 +1990,6 @@ void __init noreturn __start_xen(unsigned long mbi_p)
if ( !tboot_protect_mem_regions() )
panic("Could not protect TXT memory regions\n");
- init_guest_cpuid();
init_guest_cpu_policies();
if ( xen_cpuidle )
--
2.30.2
On 04.04.2023 11:52, Andrew Cooper wrote:
> Switch to the newer cpu_policy nomenclature. Do some easy cleanup of
> includes.
>
> No practical change.
>
> Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
> [...]
> +static bool __initdata dom0_cpuid_cmdline;
> +static uint32_t __initdata dom0_enable_feat[FSCAPINTS];
> +static uint32_t __initdata dom0_disable_feat[FSCAPINTS];
> +
> +static void __init cf_check _parse_dom0_cpuid(unsigned int feat, bool val)
> +{
> + __set_bit (feat, val ? dom0_enable_feat : dom0_disable_feat);
> + __clear_bit(feat, val ? dom0_disable_feat : dom0_enable_feat );
> +}
> +
> +static int __init cf_check parse_dom0_cpuid(const char *s)
> +{
> + dom0_cpuid_cmdline = true;
> +
> + return parse_cpuid(s, _parse_dom0_cpuid);
> +}
> +custom_param("dom0-cpuid", parse_dom0_cpuid);
Unless the plan is to completely remove cpuid.c, this command line
handling would imo better fit there. I understand that to keep
dom0_{en,dis}able_feat[] static, the _parse_dom0_cpuid() helper
would then need to be exposed (under a different name), but I think
that's quite okay, all the more so as it's an __init function.
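A minimal sketch of what that split might look like, purely for
illustration (the setter's name and where it gets declared are invented
here, not taken from the patch):

/* cpu-policy.c: data stays static; a single __init setter is exposed. */
static bool __initdata dom0_cpuid_cmdline;
static uint32_t __initdata dom0_enable_feat[FSCAPINTS];
static uint32_t __initdata dom0_disable_feat[FSCAPINTS];

/* Hypothetical name; would need declaring in e.g. asm/cpu-policy.h. */
void __init cf_check dom0_cpuid_set_feat(unsigned int feat, bool val)
{
    /*
     * Marking the cmdline as used here, rather than in the parser, means
     * it only triggers once a recognised feature parses -- a minor
     * behavioural difference from the patch as posted.
     */
    dom0_cpuid_cmdline = true;

    __set_bit (feat, val ? dom0_enable_feat : dom0_disable_feat);
    __clear_bit(feat, val ? dom0_disable_feat : dom0_enable_feat );
}

/* cpuid.c: parse_cpuid() and the registration stay where they were. */
static int __init cf_check parse_dom0_cpuid(const char *s)
{
    return parse_cpuid(s, dom0_cpuid_set_feat);
}
custom_param("dom0-cpuid", parse_dom0_cpuid);

That would keep all the mutable state private to cpu-policy.c, while the
parsing machinery stays in cpuid.c alongside its other user, the cpuid=
handling.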
> +#define EMPTY_LEAF ((struct cpuid_leaf){})
> +static void zero_leaves(struct cpuid_leaf *l,
> + unsigned int first, unsigned int last)
> +{
> + memset(&l[first], 0, sizeof(*l) * (last - first + 1));
> +}
> +
> +static void sanitise_featureset(uint32_t *fs)
> +{
> + /* for_each_set_bit() uses unsigned longs. Extend with zeroes. */
> + uint32_t disabled_features[
> + ROUNDUP(FSCAPINTS, sizeof(unsigned long)/sizeof(uint32_t))] = {};
> + unsigned int i;
> +
> + for ( i = 0; i < FSCAPINTS; ++i )
> + {
> + /* Clamp to known mask. */
> + fs[i] &= known_features[i];
> +
> + /*
> + * Identify which features with deep dependencies have been
> + * disabled.
> + */
> + disabled_features[i] = ~fs[i] & deep_features[i];
> + }
> +
> + for_each_set_bit(i, (void *)disabled_features,
> + sizeof(disabled_features) * 8)
> + {
> + const uint32_t *dfs = x86_cpuid_lookup_deep_deps(i);
> + unsigned int j;
> +
> + ASSERT(dfs); /* deep_features[] should guarantee this. */
> +
> + for ( j = 0; j < FSCAPINTS; ++j )
> + {
> + fs[j] &= ~dfs[j];
> + disabled_features[j] &= ~dfs[j];
> + }
> + }
> +}
> +
> +static void recalculate_xstate(struct cpu_policy *p)
> +{
> + uint64_t xstates = XSTATE_FP_SSE;
> + uint32_t xstate_size = XSTATE_AREA_MIN_SIZE;
> + unsigned int i, Da1 = p->xstate.Da1;
> +
> + /*
> + * The Da1 leaf is the only piece of information preserved in the common
> + * case. Everything else is derived from other feature state.
> + */
> + memset(&p->xstate, 0, sizeof(p->xstate));
> +
> + if ( !p->basic.xsave )
> + return;
> +
> + if ( p->basic.avx )
> + {
> + xstates |= X86_XCR0_YMM;
> + xstate_size = max(xstate_size,
> + xstate_offsets[X86_XCR0_YMM_POS] +
> + xstate_sizes[X86_XCR0_YMM_POS]);
> + }
> +
> + if ( p->feat.mpx )
> + {
> + xstates |= X86_XCR0_BNDREGS | X86_XCR0_BNDCSR;
> + xstate_size = max(xstate_size,
> + xstate_offsets[X86_XCR0_BNDCSR_POS] +
> + xstate_sizes[X86_XCR0_BNDCSR_POS]);
> + }
> +
> + if ( p->feat.avx512f )
> + {
> + xstates |= X86_XCR0_OPMASK | X86_XCR0_ZMM | X86_XCR0_HI_ZMM;
> + xstate_size = max(xstate_size,
> + xstate_offsets[X86_XCR0_HI_ZMM_POS] +
> + xstate_sizes[X86_XCR0_HI_ZMM_POS]);
> + }
> +
> + if ( p->feat.pku )
> + {
> + xstates |= X86_XCR0_PKRU;
> + xstate_size = max(xstate_size,
> + xstate_offsets[X86_XCR0_PKRU_POS] +
> + xstate_sizes[X86_XCR0_PKRU_POS]);
> + }
> +
> + p->xstate.max_size = xstate_size;
> + p->xstate.xcr0_low = xstates & ~XSTATE_XSAVES_ONLY;
> + p->xstate.xcr0_high = (xstates & ~XSTATE_XSAVES_ONLY) >> 32;
> +
> + p->xstate.Da1 = Da1;
> + if ( p->xstate.xsaves )
> + {
> + p->xstate.xss_low = xstates & XSTATE_XSAVES_ONLY;
> + p->xstate.xss_high = (xstates & XSTATE_XSAVES_ONLY) >> 32;
> + }
> + else
> + xstates &= ~XSTATE_XSAVES_ONLY;
> +
> + for ( i = 2; i < min(63ul, ARRAY_SIZE(p->xstate.comp)); ++i )
> + {
> + uint64_t curr_xstate = 1ul << i;
> +
> + if ( !(xstates & curr_xstate) )
> + continue;
> +
> + p->xstate.comp[i].size = xstate_sizes[i];
> + p->xstate.comp[i].offset = xstate_offsets[i];
> + p->xstate.comp[i].xss = curr_xstate & XSTATE_XSAVES_ONLY;
> + p->xstate.comp[i].align = curr_xstate & xstate_align;
> + }
> +}
> +
> +/*
> + * Misc adjustments to the policy. Mostly clobbering reserved fields and
> + * duplicating shared fields. Intentionally hidden fields are annotated.
> + */
> +static void recalculate_misc(struct cpu_policy *p)
> +{
> + p->basic.raw_fms &= 0x0fff0fff; /* Clobber Processor Type on Intel. */
> + p->basic.apic_id = 0; /* Dynamic. */
> +
> + p->basic.raw[0x5] = EMPTY_LEAF; /* MONITOR not exposed to guests. */
> + p->basic.raw[0x6] = EMPTY_LEAF; /* Therm/Power not exposed to guests. */
> +
> + p->basic.raw[0x8] = EMPTY_LEAF;
> +
> + /* TODO: Rework topology logic. */
> + memset(p->topo.raw, 0, sizeof(p->topo.raw));
> +
> + p->basic.raw[0xc] = EMPTY_LEAF;
> +
> + p->extd.e1d &= ~CPUID_COMMON_1D_FEATURES;
> +
> + /* Most of Power/RAS hidden from guests. */
> + p->extd.raw[0x7].a = p->extd.raw[0x7].b = p->extd.raw[0x7].c = 0;
> +
> + p->extd.raw[0x8].d = 0;
> +
> + switch ( p->x86_vendor )
> + {
> + case X86_VENDOR_INTEL:
> + p->basic.l2_nr_queries = 1; /* Fixed to 1 query. */
> + p->basic.raw[0x3] = EMPTY_LEAF; /* PSN - always hidden. */
> + p->basic.raw[0x9] = EMPTY_LEAF; /* DCA - always hidden. */
> +
> + p->extd.vendor_ebx = 0;
> + p->extd.vendor_ecx = 0;
> + p->extd.vendor_edx = 0;
> +
> + p->extd.raw[0x1].a = p->extd.raw[0x1].b = 0;
> +
> + p->extd.raw[0x5] = EMPTY_LEAF;
> + p->extd.raw[0x6].a = p->extd.raw[0x6].b = p->extd.raw[0x6].d = 0;
> +
> + p->extd.raw[0x8].a &= 0x0000ffff;
> + p->extd.raw[0x8].c = 0;
> + break;
> +
> + case X86_VENDOR_AMD:
> + case X86_VENDOR_HYGON:
> + zero_leaves(p->basic.raw, 0x2, 0x3);
> + memset(p->cache.raw, 0, sizeof(p->cache.raw));
> + zero_leaves(p->basic.raw, 0x9, 0xa);
> +
> + p->extd.vendor_ebx = p->basic.vendor_ebx;
> + p->extd.vendor_ecx = p->basic.vendor_ecx;
> + p->extd.vendor_edx = p->basic.vendor_edx;
> +
> + p->extd.raw_fms = p->basic.raw_fms;
> + p->extd.raw[0x1].b &= 0xff00ffff;
> + p->extd.e1d |= p->basic._1d & CPUID_COMMON_1D_FEATURES;
> +
> + p->extd.raw[0x8].a &= 0x0000ffff; /* GuestMaxPhysAddr hidden. */
> + p->extd.raw[0x8].c &= 0x0003f0ff;
> +
> + p->extd.raw[0x9] = EMPTY_LEAF;
> +
> + zero_leaves(p->extd.raw, 0xb, 0x18);
> +
> + /* 0x19 - TLB details. Pass through. */
> + /* 0x1a - Perf hints. Pass through. */
> +
> + p->extd.raw[0x1b] = EMPTY_LEAF; /* IBS - not supported. */
> + p->extd.raw[0x1c] = EMPTY_LEAF; /* LWP - not supported. */
> + p->extd.raw[0x1d] = EMPTY_LEAF; /* TopoExt Cache */
> + p->extd.raw[0x1e] = EMPTY_LEAF; /* TopoExt APIC ID/Core/Node */
> + p->extd.raw[0x1f] = EMPTY_LEAF; /* SEV */
> + p->extd.raw[0x20] = EMPTY_LEAF; /* Platform QoS */
> + break;
> + }
> +}
> +
> static void __init calculate_raw_policy(void)
> {
> struct cpu_policy *p = &raw_cpu_policy;
>
> + x86_cpuid_policy_fill_native(p);
> +
> + /* Nothing good will come from Xen and libx86 disagreeing on vendor. */
> + ASSERT(p->x86_vendor == boot_cpu_data.x86_vendor);
> +
> /* 0x000000ce MSR_INTEL_PLATFORM_INFO */
> /* Was already added by probe_cpuid_faulting() */
>
> @@ -34,9 +362,50 @@ static void __init calculate_raw_policy(void)
> static void __init calculate_host_policy(void)
> {
> struct cpu_policy *p = &host_cpu_policy;
> + unsigned int max_extd_leaf;
>
> *p = raw_cpu_policy;
>
> + p->basic.max_leaf =
> + min_t(uint32_t, p->basic.max_leaf, ARRAY_SIZE(p->basic.raw) - 1);
> + p->feat.max_subleaf =
> + min_t(uint32_t, p->feat.max_subleaf, ARRAY_SIZE(p->feat.raw) - 1);
> +
> + max_extd_leaf = p->extd.max_leaf;
> +
> + /*
> + * For AMD/Hygon hardware before Zen3, we unilaterally modify LFENCE to be
> + * dispatch serialising for Spectre mitigations. Extend max_extd_leaf
> + * beyond what hardware supports, to include the feature leaf containing
> + * this information.
> + */
> + if ( cpu_has_lfence_dispatch )
> + max_extd_leaf = max(max_extd_leaf, 0x80000021);
> +
> + p->extd.max_leaf = 0x80000000 | min_t(uint32_t, max_extd_leaf & 0xffff,
> + ARRAY_SIZE(p->extd.raw) - 1);
> +
> + x86_cpu_featureset_to_policy(boot_cpu_data.x86_capability, p);
> + recalculate_xstate(p);
> + recalculate_misc(p);
> +
> + /* When vPMU is disabled, drop it from the host policy. */
> + if ( vpmu_mode == XENPMU_MODE_OFF )
> + p->basic.raw[0xa] = EMPTY_LEAF;
> +
> + if ( p->extd.svm )
> + {
> + /* Clamp to implemented features which require hardware support. */
> + p->extd.raw[0xa].d &= ((1u << SVM_FEATURE_NPT) |
> + (1u << SVM_FEATURE_LBRV) |
> + (1u << SVM_FEATURE_NRIPS) |
> + (1u << SVM_FEATURE_PAUSEFILTER) |
> + (1u << SVM_FEATURE_DECODEASSISTS));
> + /* Enable features which are always emulated. */
> + p->extd.raw[0xa].d |= ((1u << SVM_FEATURE_VMCBCLEAN) |
> + (1u << SVM_FEATURE_TSCRATEMSR));
> + }
> +
> /* 0x000000ce MSR_INTEL_PLATFORM_INFO */
> /* probe_cpuid_faulting() sanity checks presence of MISC_FEATURES_ENABLES */
> p->platform_info.cpuid_faulting = cpu_has_cpuid_faulting;
> @@ -51,11 +420,88 @@ static void __init calculate_host_policy(void)
> ARCH_CAPS_PBRSB_NO);
> }
>
> +static void __init guest_common_default_feature_adjustments(uint32_t *fs)
> +{
> + /*
> + * IvyBridge client parts suffer from leakage of RDRAND data due to SRBDS
> + * (XSA-320 / CVE-2020-0543), and won't be receiving microcode to
> + * compensate.
> + *
> + * Mitigate by hiding RDRAND from guests by default, unless explicitly
> + * overridden on the Xen command line (cpuid=rdrand). Irrespective of the
> + * default setting, guests can use RDRAND if explicitly enabled
> + * (cpuid="host,rdrand=1") in the VM's config file, and VMs which were
> + * previously using RDRAND can migrate in.
> + */
> + if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
> + boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x3a &&
> + cpu_has_rdrand && !is_forced_cpu_cap(X86_FEATURE_RDRAND) )
> + __clear_bit(X86_FEATURE_RDRAND, fs);
> +
> + /*
> + * On certain hardware, speculative or errata workarounds can result in
> + * TSX being placed in "force-abort" mode, where it doesn't actually
> + * function as expected, but is technically compatible with the ISA.
> + *
> + * Do not advertise RTM to guests by default if it won't actually work.
> + */
> + if ( rtm_disabled )
> + __clear_bit(X86_FEATURE_RTM, fs);
> +}
> +
> +static void __init guest_common_feature_adjustments(uint32_t *fs)
> +{
> + /* Unconditionally claim to be able to set the hypervisor bit. */
> + __set_bit(X86_FEATURE_HYPERVISOR, fs);
> +
> + /*
> + * If IBRS is offered to the guest, unconditionally offer STIBP. It is a
> + * nop on non-HT hardware, and has this behaviour to make heterogeneous
> + * setups easier to manage.
> + */
> + if ( test_bit(X86_FEATURE_IBRSB, fs) )
> + __set_bit(X86_FEATURE_STIBP, fs);
> + if ( test_bit(X86_FEATURE_IBRS, fs) )
> + __set_bit(X86_FEATURE_AMD_STIBP, fs);
> +
> + /*
> + * On hardware which supports IBRS/IBPB, we can offer IBPB independently
> + * of IBRS by using the AMD feature bit. An administrator may wish for
> + * performance reasons to offer IBPB without IBRS.
> + */
> + if ( host_cpu_policy.feat.ibrsb )
> + __set_bit(X86_FEATURE_IBPB, fs);
> +}
> +
> static void __init calculate_pv_max_policy(void)
> {
> struct cpu_policy *p = &pv_max_cpu_policy;
> + uint32_t fs[FSCAPINTS];
> + unsigned int i;
>
> *p = host_cpu_policy;
> + x86_cpu_policy_to_featureset(p, fs);
> +
> + for ( i = 0; i < ARRAY_SIZE(fs); ++i )
> + fs[i] &= pv_max_featuremask[i];
> +
> + /*
> + * If Xen isn't virtualising MSR_SPEC_CTRL for PV guests (functional
> + * availability, or admin choice), hide the feature.
> + */
> + if ( !boot_cpu_has(X86_FEATURE_SC_MSR_PV) )
> + {
> + __clear_bit(X86_FEATURE_IBRSB, fs);
> + __clear_bit(X86_FEATURE_IBRS, fs);
> + }
> +
> + guest_common_feature_adjustments(fs);
> +
> + sanitise_featureset(fs);
> + x86_cpu_featureset_to_policy(fs, p);
> + recalculate_xstate(p);
> +
> + p->extd.raw[0xa] = EMPTY_LEAF; /* No SVM for PV guests. */
>
> p->arch_caps.raw = 0; /* Not supported yet. */
> }
> @@ -63,15 +509,112 @@ static void __init calculate_pv_max_policy(void)
> static void __init calculate_pv_def_policy(void)
> {
> struct cpu_policy *p = &pv_def_cpu_policy;
> + uint32_t fs[FSCAPINTS];
> + unsigned int i;
>
> *p = pv_max_cpu_policy;
> + x86_cpu_policy_to_featureset(p, fs);
> +
> + for ( i = 0; i < ARRAY_SIZE(fs); ++i )
> + fs[i] &= pv_def_featuremask[i];
> +
> + guest_common_feature_adjustments(fs);
> + guest_common_default_feature_adjustments(fs);
> +
> + sanitise_featureset(fs);
> + x86_cpu_featureset_to_policy(fs, p);
> + recalculate_xstate(p);
> }
>
> static void __init calculate_hvm_max_policy(void)
> {
> struct cpu_policy *p = &hvm_max_cpu_policy;
> + uint32_t fs[FSCAPINTS];
> + unsigned int i;
> + const uint32_t *mask;
>
> *p = host_cpu_policy;
> + x86_cpu_policy_to_featureset(p, fs);
> +
> + mask = hvm_hap_supported() ?
> + hvm_hap_max_featuremask : hvm_shadow_max_featuremask;
> +
> + for ( i = 0; i < ARRAY_SIZE(fs); ++i )
> + fs[i] &= mask[i];
> +
> + /*
> + * Xen can provide an (x2)APIC emulation to HVM guests even if the host's
> + * (x2)APIC isn't enabled.
> + */
> + __set_bit(X86_FEATURE_APIC, fs);
> + __set_bit(X86_FEATURE_X2APIC, fs);
> +
> + /*
> + * We don't support EFER.LMSLE at all. AMD has dropped the feature from
> + * hardware and allocated a CPUID bit to indicate its absence.
> + */
> + __set_bit(X86_FEATURE_NO_LMSL, fs);
> +
> + /*
> + * On AMD, PV guests are entirely unable to use SYSENTER as Xen runs in
> + * long mode (and init_amd() has cleared it out of host capabilities), but
> + * HVM guests are able if running in protected mode.
> + */
> + if ( (boot_cpu_data.x86_vendor & (X86_VENDOR_AMD | X86_VENDOR_HYGON)) &&
> + raw_cpu_policy.basic.sep )
> + __set_bit(X86_FEATURE_SEP, fs);
> +
> + /*
> + * VIRT_SSBD is exposed in the default policy as a result of
> + * amd_virt_spec_ctrl being set, it also needs exposing in the max policy.
> + */
> + if ( amd_virt_spec_ctrl )
> + __set_bit(X86_FEATURE_VIRT_SSBD, fs);
> +
> + /*
> + * If Xen isn't virtualising MSR_SPEC_CTRL for HVM guests (functional
> + * availability, or admin choice), hide the feature.
> + */
> + if ( !boot_cpu_has(X86_FEATURE_SC_MSR_HVM) )
> + {
> + __clear_bit(X86_FEATURE_IBRSB, fs);
> + __clear_bit(X86_FEATURE_IBRS, fs);
> + }
> + else if ( boot_cpu_has(X86_FEATURE_AMD_SSBD) )
> + /*
> + * If SPEC_CTRL.SSBD is available VIRT_SPEC_CTRL.SSBD can be exposed
> + * and implemented using the former. Expose in the max policy only as
> + * the preference is for guests to use SPEC_CTRL.SSBD if available.
> + */
> + __set_bit(X86_FEATURE_VIRT_SSBD, fs);
> +
> + /*
> + * With VT-x, some features are only supported by Xen if dedicated
> + * hardware support is also available.
> + */
> + if ( cpu_has_vmx )
> + {
> + if ( !cpu_has_vmx_mpx )
> + __clear_bit(X86_FEATURE_MPX, fs);
> +
> + if ( !cpu_has_vmx_xsaves )
> + __clear_bit(X86_FEATURE_XSAVES, fs);
> + }
> +
> + /*
> + * Xen doesn't use PKS, so the guest support for it has opted to not use
> + * the VMCS load/save controls for efficiency reasons. This depends on
> + * the exact vmentry/exit behaviour, so don't expose PKS in other
> + * situations until someone has cross-checked the behaviour for safety.
> + */
> + if ( !cpu_has_vmx )
> + __clear_bit(X86_FEATURE_PKS, fs);
> +
> + guest_common_feature_adjustments(fs);
> +
> + sanitise_featureset(fs);
> + x86_cpu_featureset_to_policy(fs, p);
> + recalculate_xstate(p);
>
> /* It's always possible to emulate CPUID faulting for HVM guests */
> p->platform_info.cpuid_faulting = true;
> @@ -82,8 +625,32 @@ static void __init calculate_hvm_max_policy(void)
> static void __init calculate_hvm_def_policy(void)
> {
> struct cpu_policy *p = &hvm_def_cpu_policy;
> + uint32_t fs[FSCAPINTS];
> + unsigned int i;
> + const uint32_t *mask;
>
> *p = hvm_max_cpu_policy;
> + x86_cpu_policy_to_featureset(p, fs);
> +
> + mask = hvm_hap_supported() ?
> + hvm_hap_def_featuremask : hvm_shadow_def_featuremask;
> +
> + for ( i = 0; i < ARRAY_SIZE(fs); ++i )
> + fs[i] &= mask[i];
> +
> + guest_common_feature_adjustments(fs);
> + guest_common_default_feature_adjustments(fs);
> +
> + /*
> + * Only expose VIRT_SSBD if AMD_SSBD is not available, and thus
> + * amd_virt_spec_ctrl is set.
> + */
> + if ( amd_virt_spec_ctrl )
> + __set_bit(X86_FEATURE_VIRT_SSBD, fs);
> +
> + sanitise_featureset(fs);
> + x86_cpu_featureset_to_policy(fs, p);
> + recalculate_xstate(p);
> }
>
> void __init init_guest_cpu_policies(void)
> @@ -149,3 +716,188 @@ int init_domain_cpu_policy(struct domain *d)
>
> return 0;
> }
> +
> +void recalculate_cpuid_policy(struct domain *d)
> +{
> + struct cpu_policy *p = d->arch.cpuid;
> + const struct cpu_policy *max = is_pv_domain(d)
> + ? (IS_ENABLED(CONFIG_PV) ? &pv_max_cpu_policy : NULL)
> + : (IS_ENABLED(CONFIG_HVM) ? &hvm_max_cpu_policy : NULL);
While this is how the original code was, wouldn't this want to use
hvm_enabled, just like init_guest_cpu_policies() does (patch 10)?
> + uint32_t fs[FSCAPINTS], max_fs[FSCAPINTS];
> + unsigned int i;
> +
> + if ( !max )
> + {
> + ASSERT_UNREACHABLE();
> + return;
> + }
> +
> + p->x86_vendor = x86_cpuid_lookup_vendor(
> + p->basic.vendor_ebx, p->basic.vendor_ecx, p->basic.vendor_edx);
> +
> + p->basic.max_leaf = min(p->basic.max_leaf, max->basic.max_leaf);
> + p->feat.max_subleaf = min(p->feat.max_subleaf, max->feat.max_subleaf);
> + p->extd.max_leaf = 0x80000000 | min(p->extd.max_leaf & 0xffff,
> + ((p->x86_vendor & (X86_VENDOR_AMD |
> + X86_VENDOR_HYGON))
> + ? CPUID_GUEST_NR_EXTD_AMD
> + : CPUID_GUEST_NR_EXTD_INTEL) - 1);
> +
> + x86_cpu_policy_to_featureset(p, fs);
> + x86_cpu_policy_to_featureset(max, max_fs);
> +
> + if ( is_hvm_domain(d) )
> + {
> + /*
> + * HVM domains using Shadow paging have further restrictions on their
> + * available paging features.
> + */
> + if ( !hap_enabled(d) )
> + {
> + for ( i = 0; i < ARRAY_SIZE(max_fs); i++ )
> + max_fs[i] &= hvm_shadow_max_featuremask[i];
> + }
> +
> + /* Hide nested-virt if it hasn't been explicitly configured. */
> + if ( !nestedhvm_enabled(d) )
> + {
> + __clear_bit(X86_FEATURE_VMX, max_fs);
> + __clear_bit(X86_FEATURE_SVM, max_fs);
> + }
> + }
> +
> + /*
> + * Allow the toolstack to set HTT, X2APIC and CMP_LEGACY. These bits
> + * affect how to interpret topology information in other cpuid leaves.
> + */
> + __set_bit(X86_FEATURE_HTT, max_fs);
> + __set_bit(X86_FEATURE_X2APIC, max_fs);
> + __set_bit(X86_FEATURE_CMP_LEGACY, max_fs);
> +
> + /*
> + * 32bit PV domains can't use any Long Mode features, and cannot use
> + * SYSCALL on non-AMD hardware.
> + */
> + if ( is_pv_32bit_domain(d) )
> + {
> + __clear_bit(X86_FEATURE_LM, max_fs);
> + if ( !(boot_cpu_data.x86_vendor & (X86_VENDOR_AMD | X86_VENDOR_HYGON)) )
> + __clear_bit(X86_FEATURE_SYSCALL, max_fs);
> + }
> +
> + /* Clamp the toolstack's choices to reality. */
> + for ( i = 0; i < ARRAY_SIZE(fs); i++ )
> + fs[i] &= max_fs[i];
> +
> + if ( p->basic.max_leaf < XSTATE_CPUID )
> + __clear_bit(X86_FEATURE_XSAVE, fs);
> +
> + sanitise_featureset(fs);
> +
> + /* Fold host's FDP_EXCP_ONLY and NO_FPU_SEL into guest's view. */
> + fs[FEATURESET_7b0] &= ~(cpufeat_mask(X86_FEATURE_FDP_EXCP_ONLY) |
> + cpufeat_mask(X86_FEATURE_NO_FPU_SEL));
> + fs[FEATURESET_7b0] |= (host_cpu_policy.feat._7b0 &
> + (cpufeat_mask(X86_FEATURE_FDP_EXCP_ONLY) |
> + cpufeat_mask(X86_FEATURE_NO_FPU_SEL)));
> +
> + x86_cpu_featureset_to_policy(fs, p);
> +
> + /* Pass host cacheline size through to guests. */
> + p->basic.clflush_size = max->basic.clflush_size;
> +
> + p->extd.maxphysaddr = min(p->extd.maxphysaddr, max->extd.maxphysaddr);
> + p->extd.maxphysaddr = min_t(uint8_t, p->extd.maxphysaddr,
> + paging_max_paddr_bits(d));
> + p->extd.maxphysaddr = max_t(uint8_t, p->extd.maxphysaddr,
> + (p->basic.pae || p->basic.pse36) ? 36 : 32);
> +
> + p->extd.maxlinaddr = p->extd.lm ? 48 : 32;
> +
> + recalculate_xstate(p);
> + recalculate_misc(p);
> +
> + for ( i = 0; i < ARRAY_SIZE(p->cache.raw); ++i )
> + {
> + if ( p->cache.subleaf[i].type >= 1 &&
> + p->cache.subleaf[i].type <= 3 )
> + {
> + /* Subleaf has a valid cache type. Zero reserved fields. */
> + p->cache.raw[i].a &= 0xffffc3ffu;
> + p->cache.raw[i].d &= 0x00000007u;
> + }
> + else
> + {
> + /* Subleaf is not valid. Zero the rest of the union. */
> + zero_leaves(p->cache.raw, i, ARRAY_SIZE(p->cache.raw) - 1);
> + break;
> + }
> + }
> +
> + if ( vpmu_mode == XENPMU_MODE_OFF ||
> + ((vpmu_mode & XENPMU_MODE_ALL) && !is_hardware_domain(d)) )
> + p->basic.raw[0xa] = EMPTY_LEAF;
> +
> + if ( !p->extd.svm )
> + p->extd.raw[0xa] = EMPTY_LEAF;
> +
> + if ( !p->extd.page1gb )
> + p->extd.raw[0x19] = EMPTY_LEAF;
> +}
> +
> +void __init init_dom0_cpuid_policy(struct domain *d)
> +{
> + struct cpu_policy *p = d->arch.cpuid;
> +
> + /* dom0 can't migrate. Give it ITSC if available. */
> + if ( cpu_has_itsc )
> + p->extd.itsc = true;
> +
> + /*
> + * Expose the "hardware speculation behaviour" bits of ARCH_CAPS to dom0,
> + * so dom0 can turn off workarounds as appropriate. Temporary, until the
> + * domain policy logic gains a better understanding of MSRs.
> + */
> + if ( cpu_has_arch_caps )
> + p->feat.arch_caps = true;
> +
> + /* Apply dom0-cpuid= command line settings, if provided. */
> + if ( dom0_cpuid_cmdline )
> + {
> + uint32_t fs[FSCAPINTS];
> + unsigned int i;
> +
> + x86_cpu_policy_to_featureset(p, fs);
> +
> + for ( i = 0; i < ARRAY_SIZE(fs); ++i )
> + {
> + fs[i] |= dom0_enable_feat [i];
> + fs[i] &= ~dom0_disable_feat[i];
> + }
> +
> + x86_cpu_featureset_to_policy(fs, p);
> +
> + recalculate_cpuid_policy(d);
> + }
> +}
> +
> +static void __init __maybe_unused build_assertions(void)
> +{
> + BUILD_BUG_ON(ARRAY_SIZE(known_features) != FSCAPINTS);
> + BUILD_BUG_ON(ARRAY_SIZE(pv_max_featuremask) != FSCAPINTS);
> + BUILD_BUG_ON(ARRAY_SIZE(hvm_shadow_max_featuremask) != FSCAPINTS);
> + BUILD_BUG_ON(ARRAY_SIZE(hvm_hap_max_featuremask) != FSCAPINTS);
> + BUILD_BUG_ON(ARRAY_SIZE(deep_features) != FSCAPINTS);
> +
> + /* Find some more clever allocation scheme if this trips. */
> + BUILD_BUG_ON(sizeof(struct cpu_policy) > PAGE_SIZE);
> +
> + BUILD_BUG_ON(sizeof(raw_cpu_policy.basic) !=
> + sizeof(raw_cpu_policy.basic.raw));
> + BUILD_BUG_ON(sizeof(raw_cpu_policy.feat) !=
> + sizeof(raw_cpu_policy.feat.raw));
> + BUILD_BUG_ON(sizeof(raw_cpu_policy.xstate) !=
> + sizeof(raw_cpu_policy.xstate.raw));
> + BUILD_BUG_ON(sizeof(raw_cpu_policy.extd) !=
> + sizeof(raw_cpu_policy.extd.raw));
> +}
> diff --git a/xen/arch/x86/cpuid.c b/xen/arch/x86/cpuid.c
> index 5eb5f1893516..3f20c342fde8 100644
> --- a/xen/arch/x86/cpuid.c
> +++ b/xen/arch/x86/cpuid.c
> @@ -1,638 +1,14 @@
> -#include <xen/init.h>
> -#include <xen/lib.h>
> -#include <xen/param.h>
> #include <xen/sched.h>
> -#include <xen/nospec.h>
> -#include <asm/amd.h>
> +#include <xen/types.h>
> +
> +#include <public/hvm/params.h>
> +
> #include <asm/cpu-policy.h>
> #include <asm/cpuid.h>
> -#include <asm/hvm/hvm.h>
> -#include <asm/hvm/nestedhvm.h>
> -#include <asm/hvm/svm/svm.h>
> #include <asm/hvm/viridian.h>
> -#include <asm/hvm/vmx/vmcs.h>
> -#include <asm/paging.h>
> -#include <asm/processor.h>
> #include <asm/xstate.h>
>
> -const uint32_t known_features[] = INIT_KNOWN_FEATURES;
> -
> -static const uint32_t __initconst pv_max_featuremask[] = INIT_PV_MAX_FEATURES;
> -static const uint32_t hvm_shadow_max_featuremask[] = INIT_HVM_SHADOW_MAX_FEATURES;
> -static const uint32_t __initconst hvm_hap_max_featuremask[] =
> - INIT_HVM_HAP_MAX_FEATURES;
> -static const uint32_t __initconst pv_def_featuremask[] = INIT_PV_DEF_FEATURES;
> -static const uint32_t __initconst hvm_shadow_def_featuremask[] =
> - INIT_HVM_SHADOW_DEF_FEATURES;
> -static const uint32_t __initconst hvm_hap_def_featuremask[] =
> - INIT_HVM_HAP_DEF_FEATURES;
> -static const uint32_t deep_features[] = INIT_DEEP_FEATURES;
> -
> -static const struct feature_name {
> - const char *name;
> - unsigned int bit;
> -} feature_names[] __initconstrel = INIT_FEATURE_NAMES;
> -
> -/*
> - * Parse a list of cpuid feature names -> bool, calling the callback for any
> - * matches found.
> - *
> - * always_inline, because this is init code only and we really don't want a
> - * function pointer call in the middle of the loop.
> - */
> -static int __init always_inline parse_cpuid(
> - const char *s, void (*callback)(unsigned int feat, bool val))
> -{
> - const char *ss;
> - int val, rc = 0;
> -
> - do {
> - const struct feature_name *lhs, *rhs, *mid = NULL /* GCC... */;
> - const char *feat;
> -
> - ss = strchr(s, ',');
> - if ( !ss )
> - ss = strchr(s, '\0');
> -
> - /* Skip the 'no-' prefix for name comparisons. */
> - feat = s;
> - if ( strncmp(s, "no-", 3) == 0 )
> - feat += 3;
> -
> - /* (Re)initialise lhs and rhs for binary search. */
> - lhs = feature_names;
> - rhs = feature_names + ARRAY_SIZE(feature_names);
> -
> - while ( lhs < rhs )
> - {
> - int res;
> -
> - mid = lhs + (rhs - lhs) / 2;
> - res = cmdline_strcmp(feat, mid->name);
> -
> - if ( res < 0 )
> - {
> - rhs = mid;
> - continue;
> - }
> - if ( res > 0 )
> - {
> - lhs = mid + 1;
> - continue;
> - }
> -
> - if ( (val = parse_boolean(mid->name, s, ss)) >= 0 )
> - {
> - callback(mid->bit, val);
> - mid = NULL;
> - }
> -
> - break;
> - }
> -
> - /*
> - * Mid being NULL means that the name and boolean were successfully
> - * identified. Everything else is an error.
> - */
> - if ( mid )
> - rc = -EINVAL;
> -
> - s = ss + 1;
> - } while ( *ss );
> -
> - return rc;
> -}
> -
> -static void __init cf_check _parse_xen_cpuid(unsigned int feat, bool val)
> -{
> - if ( !val )
> - setup_clear_cpu_cap(feat);
> - else if ( feat == X86_FEATURE_RDRAND &&
> - (cpuid_ecx(1) & cpufeat_mask(X86_FEATURE_RDRAND)) )
> - setup_force_cpu_cap(X86_FEATURE_RDRAND);
> -}
> -
> -static int __init cf_check parse_xen_cpuid(const char *s)
> -{
> - return parse_cpuid(s, _parse_xen_cpuid);
> -}
> -custom_param("cpuid", parse_xen_cpuid);
> -
> -static bool __initdata dom0_cpuid_cmdline;
> -static uint32_t __initdata dom0_enable_feat[FSCAPINTS];
> -static uint32_t __initdata dom0_disable_feat[FSCAPINTS];
> -
> -static void __init cf_check _parse_dom0_cpuid(unsigned int feat, bool val)
> -{
> - __set_bit (feat, val ? dom0_enable_feat : dom0_disable_feat);
> - __clear_bit(feat, val ? dom0_disable_feat : dom0_enable_feat );
> -}
> -
> -static int __init cf_check parse_dom0_cpuid(const char *s)
> -{
> - dom0_cpuid_cmdline = true;
> -
> - return parse_cpuid(s, _parse_dom0_cpuid);
> -}
> -custom_param("dom0-cpuid", parse_dom0_cpuid);
> -
> #define EMPTY_LEAF ((struct cpuid_leaf){})
> -static void zero_leaves(struct cpuid_leaf *l,
> - unsigned int first, unsigned int last)
> -{
> - memset(&l[first], 0, sizeof(*l) * (last - first + 1));
> -}
> -
> -static void sanitise_featureset(uint32_t *fs)
> -{
> - /* for_each_set_bit() uses unsigned longs. Extend with zeroes. */
> - uint32_t disabled_features[
> - ROUNDUP(FSCAPINTS, sizeof(unsigned long)/sizeof(uint32_t))] = {};
> - unsigned int i;
> -
> - for ( i = 0; i < FSCAPINTS; ++i )
> - {
> - /* Clamp to known mask. */
> - fs[i] &= known_features[i];
> -
> - /*
> - * Identify which features with deep dependencies have been
> - * disabled.
> - */
> - disabled_features[i] = ~fs[i] & deep_features[i];
> - }
> -
> - for_each_set_bit(i, (void *)disabled_features,
> - sizeof(disabled_features) * 8)
> - {
> - const uint32_t *dfs = x86_cpuid_lookup_deep_deps(i);
> - unsigned int j;
> -
> - ASSERT(dfs); /* deep_features[] should guarantee this. */
> -
> - for ( j = 0; j < FSCAPINTS; ++j )
> - {
> - fs[j] &= ~dfs[j];
> - disabled_features[j] &= ~dfs[j];
> - }
> - }
> -}
> -
> -static void recalculate_xstate(struct cpuid_policy *p)
> -{
> - uint64_t xstates = XSTATE_FP_SSE;
> - uint32_t xstate_size = XSTATE_AREA_MIN_SIZE;
> - unsigned int i, Da1 = p->xstate.Da1;
> -
> - /*
> - * The Da1 leaf is the only piece of information preserved in the common
> - * case. Everything else is derived from other feature state.
> - */
> - memset(&p->xstate, 0, sizeof(p->xstate));
> -
> - if ( !p->basic.xsave )
> - return;
> -
> - if ( p->basic.avx )
> - {
> - xstates |= X86_XCR0_YMM;
> - xstate_size = max(xstate_size,
> - xstate_offsets[X86_XCR0_YMM_POS] +
> - xstate_sizes[X86_XCR0_YMM_POS]);
> - }
> -
> - if ( p->feat.mpx )
> - {
> - xstates |= X86_XCR0_BNDREGS | X86_XCR0_BNDCSR;
> - xstate_size = max(xstate_size,
> - xstate_offsets[X86_XCR0_BNDCSR_POS] +
> - xstate_sizes[X86_XCR0_BNDCSR_POS]);
> - }
> -
> - if ( p->feat.avx512f )
> - {
> - xstates |= X86_XCR0_OPMASK | X86_XCR0_ZMM | X86_XCR0_HI_ZMM;
> - xstate_size = max(xstate_size,
> - xstate_offsets[X86_XCR0_HI_ZMM_POS] +
> - xstate_sizes[X86_XCR0_HI_ZMM_POS]);
> - }
> -
> - if ( p->feat.pku )
> - {
> - xstates |= X86_XCR0_PKRU;
> - xstate_size = max(xstate_size,
> - xstate_offsets[X86_XCR0_PKRU_POS] +
> - xstate_sizes[X86_XCR0_PKRU_POS]);
> - }
> -
> - p->xstate.max_size = xstate_size;
> - p->xstate.xcr0_low = xstates & ~XSTATE_XSAVES_ONLY;
> - p->xstate.xcr0_high = (xstates & ~XSTATE_XSAVES_ONLY) >> 32;
> -
> - p->xstate.Da1 = Da1;
> - if ( p->xstate.xsaves )
> - {
> - p->xstate.xss_low = xstates & XSTATE_XSAVES_ONLY;
> - p->xstate.xss_high = (xstates & XSTATE_XSAVES_ONLY) >> 32;
> - }
> - else
> - xstates &= ~XSTATE_XSAVES_ONLY;
> -
> - for ( i = 2; i < min(63ul, ARRAY_SIZE(p->xstate.comp)); ++i )
> - {
> - uint64_t curr_xstate = 1ul << i;
> -
> - if ( !(xstates & curr_xstate) )
> - continue;
> -
> - p->xstate.comp[i].size = xstate_sizes[i];
> - p->xstate.comp[i].offset = xstate_offsets[i];
> - p->xstate.comp[i].xss = curr_xstate & XSTATE_XSAVES_ONLY;
> - p->xstate.comp[i].align = curr_xstate & xstate_align;
> - }
> -}
> -
> -/*
> - * Misc adjustments to the policy. Mostly clobbering reserved fields and
> - * duplicating shared fields. Intentionally hidden fields are annotated.
> - */
> -static void recalculate_misc(struct cpuid_policy *p)
> -{
> - p->basic.raw_fms &= 0x0fff0fff; /* Clobber Processor Type on Intel. */
> - p->basic.apic_id = 0; /* Dynamic. */
> -
> - p->basic.raw[0x5] = EMPTY_LEAF; /* MONITOR not exposed to guests. */
> - p->basic.raw[0x6] = EMPTY_LEAF; /* Therm/Power not exposed to guests. */
> -
> - p->basic.raw[0x8] = EMPTY_LEAF;
> -
> - /* TODO: Rework topology logic. */
> - memset(p->topo.raw, 0, sizeof(p->topo.raw));
> -
> - p->basic.raw[0xc] = EMPTY_LEAF;
> -
> - p->extd.e1d &= ~CPUID_COMMON_1D_FEATURES;
> -
> - /* Most of Power/RAS hidden from guests. */
> - p->extd.raw[0x7].a = p->extd.raw[0x7].b = p->extd.raw[0x7].c = 0;
> -
> - p->extd.raw[0x8].d = 0;
> -
> - switch ( p->x86_vendor )
> - {
> - case X86_VENDOR_INTEL:
> - p->basic.l2_nr_queries = 1; /* Fixed to 1 query. */
> - p->basic.raw[0x3] = EMPTY_LEAF; /* PSN - always hidden. */
> - p->basic.raw[0x9] = EMPTY_LEAF; /* DCA - always hidden. */
> -
> - p->extd.vendor_ebx = 0;
> - p->extd.vendor_ecx = 0;
> - p->extd.vendor_edx = 0;
> -
> - p->extd.raw[0x1].a = p->extd.raw[0x1].b = 0;
> -
> - p->extd.raw[0x5] = EMPTY_LEAF;
> - p->extd.raw[0x6].a = p->extd.raw[0x6].b = p->extd.raw[0x6].d = 0;
> -
> - p->extd.raw[0x8].a &= 0x0000ffff;
> - p->extd.raw[0x8].c = 0;
> - break;
> -
> - case X86_VENDOR_AMD:
> - case X86_VENDOR_HYGON:
> - zero_leaves(p->basic.raw, 0x2, 0x3);
> - memset(p->cache.raw, 0, sizeof(p->cache.raw));
> - zero_leaves(p->basic.raw, 0x9, 0xa);
> -
> - p->extd.vendor_ebx = p->basic.vendor_ebx;
> - p->extd.vendor_ecx = p->basic.vendor_ecx;
> - p->extd.vendor_edx = p->basic.vendor_edx;
> -
> - p->extd.raw_fms = p->basic.raw_fms;
> - p->extd.raw[0x1].b &= 0xff00ffff;
> - p->extd.e1d |= p->basic._1d & CPUID_COMMON_1D_FEATURES;
> -
> - p->extd.raw[0x8].a &= 0x0000ffff; /* GuestMaxPhysAddr hidden. */
> - p->extd.raw[0x8].c &= 0x0003f0ff;
> -
> - p->extd.raw[0x9] = EMPTY_LEAF;
> -
> - zero_leaves(p->extd.raw, 0xb, 0x18);
> -
> - /* 0x19 - TLB details. Pass through. */
> - /* 0x1a - Perf hints. Pass through. */
> -
> - p->extd.raw[0x1b] = EMPTY_LEAF; /* IBS - not supported. */
> - p->extd.raw[0x1c] = EMPTY_LEAF; /* LWP - not supported. */
> - p->extd.raw[0x1d] = EMPTY_LEAF; /* TopoExt Cache */
> - p->extd.raw[0x1e] = EMPTY_LEAF; /* TopoExt APIC ID/Core/Node */
> - p->extd.raw[0x1f] = EMPTY_LEAF; /* SEV */
> - p->extd.raw[0x20] = EMPTY_LEAF; /* Platform QoS */
> - break;
> - }
> -}
> -
> -static void __init calculate_raw_policy(void)
> -{
> - struct cpuid_policy *p = &raw_cpu_policy;
> -
> - x86_cpuid_policy_fill_native(p);
> -
> - /* Nothing good will come from Xen and libx86 disagreeing on vendor. */
> - ASSERT(p->x86_vendor == boot_cpu_data.x86_vendor);
> -}
> -
> -static void __init calculate_host_policy(void)
> -{
> - struct cpuid_policy *p = &host_cpu_policy;
> - unsigned int max_extd_leaf;
> -
> - *p = raw_cpu_policy;
> -
> - p->basic.max_leaf =
> - min_t(uint32_t, p->basic.max_leaf, ARRAY_SIZE(p->basic.raw) - 1);
> - p->feat.max_subleaf =
> - min_t(uint32_t, p->feat.max_subleaf, ARRAY_SIZE(p->feat.raw) - 1);
> -
> - max_extd_leaf = p->extd.max_leaf;
> -
> - /*
> - * For AMD/Hygon hardware before Zen3, we unilaterally modify LFENCE to be
> - * dispatch serialising for Spectre mitigations. Extend max_extd_leaf
> - * beyond what hardware supports, to include the feature leaf containing
> - * this information.
> - */
> - if ( cpu_has_lfence_dispatch )
> - max_extd_leaf = max(max_extd_leaf, 0x80000021);
> -
> - p->extd.max_leaf = 0x80000000 | min_t(uint32_t, max_extd_leaf & 0xffff,
> - ARRAY_SIZE(p->extd.raw) - 1);
> -
> - x86_cpu_featureset_to_policy(boot_cpu_data.x86_capability, p);
> - recalculate_xstate(p);
> - recalculate_misc(p);
> -
> - /* When vPMU is disabled, drop it from the host policy. */
> - if ( vpmu_mode == XENPMU_MODE_OFF )
> - p->basic.raw[0xa] = EMPTY_LEAF;
> -
> - if ( p->extd.svm )
> - {
> - /* Clamp to implemented features which require hardware support. */
> - p->extd.raw[0xa].d &= ((1u << SVM_FEATURE_NPT) |
> - (1u << SVM_FEATURE_LBRV) |
> - (1u << SVM_FEATURE_NRIPS) |
> - (1u << SVM_FEATURE_PAUSEFILTER) |
> - (1u << SVM_FEATURE_DECODEASSISTS));
> - /* Enable features which are always emulated. */
> - p->extd.raw[0xa].d |= ((1u << SVM_FEATURE_VMCBCLEAN) |
> - (1u << SVM_FEATURE_TSCRATEMSR));
> - }
> -}
> -
> -static void __init guest_common_default_feature_adjustments(uint32_t *fs)
> -{
> - /*
> - * IvyBridge client parts suffer from leakage of RDRAND data due to SRBDS
> - * (XSA-320 / CVE-2020-0543), and won't be receiving microcode to
> - * compensate.
> - *
> - * Mitigate by hiding RDRAND from guests by default, unless explicitly
> - * overridden on the Xen command line (cpuid=rdrand). Irrespective of the
> - * default setting, guests can use RDRAND if explicitly enabled
> - * (cpuid="host,rdrand=1") in the VM's config file, and VMs which were
> - * previously using RDRAND can migrate in.
> - */
> - if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
> - boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x3a &&
> - cpu_has_rdrand && !is_forced_cpu_cap(X86_FEATURE_RDRAND) )
> - __clear_bit(X86_FEATURE_RDRAND, fs);
> -
> - /*
> - * On certain hardware, speculative or errata workarounds can result in
> - * TSX being placed in "force-abort" mode, where it doesn't actually
> - * function as expected, but is technically compatible with the ISA.
> - *
> - * Do not advertise RTM to guests by default if it won't actually work.
> - */
> - if ( rtm_disabled )
> - __clear_bit(X86_FEATURE_RTM, fs);
> -}
> -
> -static void __init guest_common_feature_adjustments(uint32_t *fs)
> -{
> - /* Unconditionally claim to be able to set the hypervisor bit. */
> - __set_bit(X86_FEATURE_HYPERVISOR, fs);
> -
> - /*
> - * If IBRS is offered to the guest, unconditionally offer STIBP. It is a
> - * nop on non-HT hardware, and has this behaviour to make heterogeneous
> - * setups easier to manage.
> - */
> - if ( test_bit(X86_FEATURE_IBRSB, fs) )
> - __set_bit(X86_FEATURE_STIBP, fs);
> - if ( test_bit(X86_FEATURE_IBRS, fs) )
> - __set_bit(X86_FEATURE_AMD_STIBP, fs);
> -
> - /*
> - * On hardware which supports IBRS/IBPB, we can offer IBPB independently
> - * of IBRS by using the AMD feature bit. An administrator may wish for
> - * performance reasons to offer IBPB without IBRS.
> - */
> - if ( host_cpu_policy.feat.ibrsb )
> - __set_bit(X86_FEATURE_IBPB, fs);
> -}
> -
> -static void __init calculate_pv_max_policy(void)
> -{
> - struct cpuid_policy *p = &pv_max_cpu_policy;
> - uint32_t pv_featureset[FSCAPINTS];
> - unsigned int i;
> -
> - *p = host_cpu_policy;
> - x86_cpu_policy_to_featureset(p, pv_featureset);
> -
> - for ( i = 0; i < ARRAY_SIZE(pv_featureset); ++i )
> - pv_featureset[i] &= pv_max_featuremask[i];
> -
> - /*
> - * If Xen isn't virtualising MSR_SPEC_CTRL for PV guests (functional
> - * availability, or admin choice), hide the feature.
> - */
> - if ( !boot_cpu_has(X86_FEATURE_SC_MSR_PV) )
> - {
> - __clear_bit(X86_FEATURE_IBRSB, pv_featureset);
> - __clear_bit(X86_FEATURE_IBRS, pv_featureset);
> - }
> -
> - guest_common_feature_adjustments(pv_featureset);
> -
> - sanitise_featureset(pv_featureset);
> - x86_cpu_featureset_to_policy(pv_featureset, p);
> - recalculate_xstate(p);
> -
> - p->extd.raw[0xa] = EMPTY_LEAF; /* No SVM for PV guests. */
> -}
> -
> -static void __init calculate_pv_def_policy(void)
> -{
> - struct cpuid_policy *p = &pv_def_cpu_policy;
> - uint32_t pv_featureset[FSCAPINTS];
> - unsigned int i;
> -
> - *p = pv_max_cpu_policy;
> - x86_cpu_policy_to_featureset(p, pv_featureset);
> -
> - for ( i = 0; i < ARRAY_SIZE(pv_featureset); ++i )
> - pv_featureset[i] &= pv_def_featuremask[i];
> -
> - guest_common_feature_adjustments(pv_featureset);
> - guest_common_default_feature_adjustments(pv_featureset);
> -
> - sanitise_featureset(pv_featureset);
> - x86_cpu_featureset_to_policy(pv_featureset, p);
> - recalculate_xstate(p);
> -}
> -
> -static void __init calculate_hvm_max_policy(void)
> -{
> - struct cpuid_policy *p = &hvm_max_cpu_policy;
> - uint32_t hvm_featureset[FSCAPINTS];
> - unsigned int i;
> - const uint32_t *hvm_featuremask;
> -
> - *p = host_cpu_policy;
> - x86_cpu_policy_to_featureset(p, hvm_featureset);
> -
> - hvm_featuremask = hvm_hap_supported() ?
> - hvm_hap_max_featuremask : hvm_shadow_max_featuremask;
> -
> - for ( i = 0; i < ARRAY_SIZE(hvm_featureset); ++i )
> - hvm_featureset[i] &= hvm_featuremask[i];
> -
> - /*
> - * Xen can provide an (x2)APIC emulation to HVM guests even if the host's
> - * (x2)APIC isn't enabled.
> - */
> - __set_bit(X86_FEATURE_APIC, hvm_featureset);
> - __set_bit(X86_FEATURE_X2APIC, hvm_featureset);
> -
> - /*
> - * We don't support EFER.LMSLE at all. AMD has dropped the feature from
> - * hardware and allocated a CPUID bit to indicate its absence.
> - */
> - __set_bit(X86_FEATURE_NO_LMSL, hvm_featureset);
> -
> - /*
> - * On AMD, PV guests are entirely unable to use SYSENTER as Xen runs in
> - * long mode (and init_amd() has cleared it out of host capabilities), but
> - * HVM guests are able if running in protected mode.
> - */
> - if ( (boot_cpu_data.x86_vendor & (X86_VENDOR_AMD | X86_VENDOR_HYGON)) &&
> - raw_cpu_policy.basic.sep )
> - __set_bit(X86_FEATURE_SEP, hvm_featureset);
> -
> - /*
> - * VIRT_SSBD is exposed in the default policy as a result of
> - * amd_virt_spec_ctrl being set, it also needs exposing in the max policy.
> - */
> - if ( amd_virt_spec_ctrl )
> - __set_bit(X86_FEATURE_VIRT_SSBD, hvm_featureset);
> -
> - /*
> - * If Xen isn't virtualising MSR_SPEC_CTRL for HVM guests (functional
> - * availability, or admin choice), hide the feature.
> - */
> - if ( !boot_cpu_has(X86_FEATURE_SC_MSR_HVM) )
> - {
> - __clear_bit(X86_FEATURE_IBRSB, hvm_featureset);
> - __clear_bit(X86_FEATURE_IBRS, hvm_featureset);
> - }
> - else if ( boot_cpu_has(X86_FEATURE_AMD_SSBD) )
> - /*
> - * If SPEC_CTRL.SSBD is available VIRT_SPEC_CTRL.SSBD can be exposed
> - * and implemented using the former. Expose in the max policy only as
> - * the preference is for guests to use SPEC_CTRL.SSBD if available.
> - */
> - __set_bit(X86_FEATURE_VIRT_SSBD, hvm_featureset);
> -
> - /*
> - * With VT-x, some features are only supported by Xen if dedicated
> - * hardware support is also available.
> - */
> - if ( cpu_has_vmx )
> - {
> - if ( !cpu_has_vmx_mpx )
> - __clear_bit(X86_FEATURE_MPX, hvm_featureset);
> -
> - if ( !cpu_has_vmx_xsaves )
> - __clear_bit(X86_FEATURE_XSAVES, hvm_featureset);
> - }
> -
> - /*
> - * Xen doesn't use PKS, so the guest support for it has opted to not use
> - * the VMCS load/save controls for efficiency reasons. This depends on
> - * the exact vmentry/exit behaviour, so don't expose PKS in other
> - * situations until someone has cross-checked the behaviour for safety.
> - */
> - if ( !cpu_has_vmx )
> - __clear_bit(X86_FEATURE_PKS, hvm_featureset);
> -
> - guest_common_feature_adjustments(hvm_featureset);
> -
> - sanitise_featureset(hvm_featureset);
> - x86_cpu_featureset_to_policy(hvm_featureset, p);
> - recalculate_xstate(p);
> -}
> -
> -static void __init calculate_hvm_def_policy(void)
> -{
> - struct cpuid_policy *p = &hvm_def_cpu_policy;
> - uint32_t hvm_featureset[FSCAPINTS];
> - unsigned int i;
> - const uint32_t *hvm_featuremask;
> -
> - *p = hvm_max_cpu_policy;
> - x86_cpu_policy_to_featureset(p, hvm_featureset);
> -
> - hvm_featuremask = hvm_hap_supported() ?
> - hvm_hap_def_featuremask : hvm_shadow_def_featuremask;
> -
> - for ( i = 0; i < ARRAY_SIZE(hvm_featureset); ++i )
> - hvm_featureset[i] &= hvm_featuremask[i];
> -
> - guest_common_feature_adjustments(hvm_featureset);
> - guest_common_default_feature_adjustments(hvm_featureset);
> -
> - /*
> - * Only expose VIRT_SSBD if AMD_SSBD is not available, and thus
> - * amd_virt_spec_ctrl is set.
> - */
> - if ( amd_virt_spec_ctrl )
> - __set_bit(X86_FEATURE_VIRT_SSBD, hvm_featureset);
> -
> - sanitise_featureset(hvm_featureset);
> - x86_cpu_featureset_to_policy(hvm_featureset, p);
> - recalculate_xstate(p);
> -}
> -
> -void __init init_guest_cpuid(void)
> -{
> - calculate_raw_policy();
> - calculate_host_policy();
> -
> - if ( IS_ENABLED(CONFIG_PV) )
> - {
> - calculate_pv_max_policy();
> - calculate_pv_def_policy();
> - }
> -
> - if ( hvm_enabled )
> - {
> - calculate_hvm_max_policy();
> - calculate_hvm_def_policy();
> - }
> -}
>
> bool recheck_cpu_features(unsigned int cpu)
> {
> @@ -656,170 +32,6 @@ bool recheck_cpu_features(unsigned int cpu)
> return okay;
> }
>
> -void recalculate_cpuid_policy(struct domain *d)
> -{
> - struct cpuid_policy *p = d->arch.cpuid;
> - const struct cpuid_policy *max = is_pv_domain(d)
> - ? (IS_ENABLED(CONFIG_PV) ? &pv_max_cpu_policy : NULL)
> - : (IS_ENABLED(CONFIG_HVM) ? &hvm_max_cpu_policy : NULL);
> - uint32_t fs[FSCAPINTS], max_fs[FSCAPINTS];
> - unsigned int i;
> -
> - if ( !max )
> - {
> - ASSERT_UNREACHABLE();
> - return;
> - }
> -
> - p->x86_vendor = x86_cpuid_lookup_vendor(
> - p->basic.vendor_ebx, p->basic.vendor_ecx, p->basic.vendor_edx);
> -
> - p->basic.max_leaf = min(p->basic.max_leaf, max->basic.max_leaf);
> - p->feat.max_subleaf = min(p->feat.max_subleaf, max->feat.max_subleaf);
> - p->extd.max_leaf = 0x80000000 | min(p->extd.max_leaf & 0xffff,
> - ((p->x86_vendor & (X86_VENDOR_AMD |
> - X86_VENDOR_HYGON))
> - ? CPUID_GUEST_NR_EXTD_AMD
> - : CPUID_GUEST_NR_EXTD_INTEL) - 1);
> -
> - x86_cpu_policy_to_featureset(p, fs);
> - x86_cpu_policy_to_featureset(max, max_fs);
> -
> - if ( is_hvm_domain(d) )
> - {
> - /*
> - * HVM domains using Shadow paging have further restrictions on their
> - * available paging features.
> - */
> - if ( !hap_enabled(d) )
> - {
> - for ( i = 0; i < ARRAY_SIZE(max_fs); i++ )
> - max_fs[i] &= hvm_shadow_max_featuremask[i];
> - }
> -
> - /* Hide nested-virt if it hasn't been explicitly configured. */
> - if ( !nestedhvm_enabled(d) )
> - {
> - __clear_bit(X86_FEATURE_VMX, max_fs);
> - __clear_bit(X86_FEATURE_SVM, max_fs);
> - }
> - }
> -
> - /*
> - * Allow the toolstack to set HTT, X2APIC and CMP_LEGACY. These bits
> - * affect how to interpret topology information in other cpuid leaves.
> - */
> - __set_bit(X86_FEATURE_HTT, max_fs);
> - __set_bit(X86_FEATURE_X2APIC, max_fs);
> - __set_bit(X86_FEATURE_CMP_LEGACY, max_fs);
> -
> - /*
> - * 32bit PV domains can't use any Long Mode features, and cannot use
> - * SYSCALL on non-AMD hardware.
> - */
> - if ( is_pv_32bit_domain(d) )
> - {
> - __clear_bit(X86_FEATURE_LM, max_fs);
> - if ( !(boot_cpu_data.x86_vendor & (X86_VENDOR_AMD | X86_VENDOR_HYGON)) )
> - __clear_bit(X86_FEATURE_SYSCALL, max_fs);
> - }
> -
> - /* Clamp the toolstack's choices to reality. */
> - for ( i = 0; i < ARRAY_SIZE(fs); i++ )
> - fs[i] &= max_fs[i];
> -
> - if ( p->basic.max_leaf < XSTATE_CPUID )
> - __clear_bit(X86_FEATURE_XSAVE, fs);
> -
> - sanitise_featureset(fs);
> -
> - /* Fold host's FDP_EXCP_ONLY and NO_FPU_SEL into guest's view. */
> - fs[FEATURESET_7b0] &= ~(cpufeat_mask(X86_FEATURE_FDP_EXCP_ONLY) |
> - cpufeat_mask(X86_FEATURE_NO_FPU_SEL));
> - fs[FEATURESET_7b0] |= (host_cpu_policy.feat._7b0 &
> - (cpufeat_mask(X86_FEATURE_FDP_EXCP_ONLY) |
> - cpufeat_mask(X86_FEATURE_NO_FPU_SEL)));
> -
> - x86_cpu_featureset_to_policy(fs, p);
> -
> - /* Pass host cacheline size through to guests. */
> - p->basic.clflush_size = max->basic.clflush_size;
> -
> - p->extd.maxphysaddr = min(p->extd.maxphysaddr, max->extd.maxphysaddr);
> - p->extd.maxphysaddr = min_t(uint8_t, p->extd.maxphysaddr,
> - paging_max_paddr_bits(d));
> - p->extd.maxphysaddr = max_t(uint8_t, p->extd.maxphysaddr,
> - (p->basic.pae || p->basic.pse36) ? 36 : 32);
> -
> - p->extd.maxlinaddr = p->extd.lm ? 48 : 32;
> -
> - recalculate_xstate(p);
> - recalculate_misc(p);
> -
> - for ( i = 0; i < ARRAY_SIZE(p->cache.raw); ++i )
> - {
> - if ( p->cache.subleaf[i].type >= 1 &&
> - p->cache.subleaf[i].type <= 3 )
> - {
> - /* Subleaf has a valid cache type. Zero reserved fields. */
> - p->cache.raw[i].a &= 0xffffc3ffu;
> - p->cache.raw[i].d &= 0x00000007u;
> - }
> - else
> - {
> - /* Subleaf is not valid. Zero the rest of the union. */
> - zero_leaves(p->cache.raw, i, ARRAY_SIZE(p->cache.raw) - 1);
> - break;
> - }
> - }
> -
> - if ( vpmu_mode == XENPMU_MODE_OFF ||
> - ((vpmu_mode & XENPMU_MODE_ALL) && !is_hardware_domain(d)) )
> - p->basic.raw[0xa] = EMPTY_LEAF;
> -
> - if ( !p->extd.svm )
> - p->extd.raw[0xa] = EMPTY_LEAF;
> -
> - if ( !p->extd.page1gb )
> - p->extd.raw[0x19] = EMPTY_LEAF;
> -}
> -
> -void __init init_dom0_cpuid_policy(struct domain *d)
> -{
> - struct cpuid_policy *p = d->arch.cpuid;
> -
> - /* dom0 can't migrate. Give it ITSC if available. */
> - if ( cpu_has_itsc )
> - p->extd.itsc = true;
> -
> - /*
> - * Expose the "hardware speculation behaviour" bits of ARCH_CAPS to dom0,
> - * so dom0 can turn off workarounds as appropriate. Temporary, until the
> - * domain policy logic gains a better understanding of MSRs.
> - */
> - if ( cpu_has_arch_caps )
> - p->feat.arch_caps = true;
> -
> - /* Apply dom0-cpuid= command line settings, if provided. */
> - if ( dom0_cpuid_cmdline )
> - {
> - uint32_t fs[FSCAPINTS];
> - unsigned int i;
> -
> - x86_cpu_policy_to_featureset(p, fs);
> -
> - for ( i = 0; i < ARRAY_SIZE(fs); ++i )
> - {
> - fs[i] |= dom0_enable_feat [i];
> - fs[i] &= ~dom0_disable_feat[i];
> - }
> -
> - x86_cpu_featureset_to_policy(fs, p);
> -
> - recalculate_cpuid_policy(d);
> - }
> -}
> -
> void guest_cpuid(const struct vcpu *v, uint32_t leaf,
> uint32_t subleaf, struct cpuid_leaf *res)
> {
> @@ -1190,27 +402,6 @@ void guest_cpuid(const struct vcpu *v, uint32_t leaf,
> }
> }
>
> -static void __init __maybe_unused build_assertions(void)
> -{
> - BUILD_BUG_ON(ARRAY_SIZE(known_features) != FSCAPINTS);
> - BUILD_BUG_ON(ARRAY_SIZE(pv_max_featuremask) != FSCAPINTS);
> - BUILD_BUG_ON(ARRAY_SIZE(hvm_shadow_max_featuremask) != FSCAPINTS);
> - BUILD_BUG_ON(ARRAY_SIZE(hvm_hap_max_featuremask) != FSCAPINTS);
> - BUILD_BUG_ON(ARRAY_SIZE(deep_features) != FSCAPINTS);
> -
> - /* Find some more clever allocation scheme if this trips. */
> - BUILD_BUG_ON(sizeof(struct cpuid_policy) > PAGE_SIZE);
> -
> - BUILD_BUG_ON(sizeof(raw_cpu_policy.basic) !=
> - sizeof(raw_cpu_policy.basic.raw));
> - BUILD_BUG_ON(sizeof(raw_cpu_policy.feat) !=
> - sizeof(raw_cpu_policy.feat.raw));
> - BUILD_BUG_ON(sizeof(raw_cpu_policy.xstate) !=
> - sizeof(raw_cpu_policy.xstate.raw));
> - BUILD_BUG_ON(sizeof(raw_cpu_policy.extd) !=
> - sizeof(raw_cpu_policy.extd.raw));
> -}
> -
> /*
> * Local variables:
> * mode: C
> diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
> index d326fa1c0136..675c523d9909 100644
> --- a/xen/arch/x86/hvm/hvm.c
> +++ b/xen/arch/x86/hvm/hvm.c
> @@ -77,7 +77,6 @@
> #include <public/memory.h>
> #include <public/vm_event.h>
> #include <public/arch-x86/cpuid.h>
> -#include <asm/cpuid.h>
>
> #include <compat/hvm/hvm_op.h>
>
> diff --git a/xen/arch/x86/include/asm/cpu-policy.h b/xen/arch/x86/include/asm/cpu-policy.h
> index 13e2a1f86d13..b361537a602b 100644
> --- a/xen/arch/x86/include/asm/cpu-policy.h
> +++ b/xen/arch/x86/include/asm/cpu-policy.h
> @@ -18,4 +18,10 @@ void init_guest_cpu_policies(void);
> /* Allocate and initialise a CPU policy suitable for the domain. */
> int init_domain_cpu_policy(struct domain *d);
>
> +/* Apply dom0-specific tweaks to the CPUID policy. */
> +void init_dom0_cpuid_policy(struct domain *d);
> +
> +/* Clamp the CPUID policy to reality. */
> +void recalculate_cpuid_policy(struct domain *d);
> +
> #endif /* X86_CPU_POLICY_H */
> diff --git a/xen/arch/x86/include/asm/cpuid.h b/xen/arch/x86/include/asm/cpuid.h
> index 7f81b998ce01..b32ba0bbfe5c 100644
> --- a/xen/arch/x86/include/asm/cpuid.h
> +++ b/xen/arch/x86/include/asm/cpuid.h
> @@ -8,14 +8,10 @@
> #include <xen/kernel.h>
> #include <xen/percpu.h>
>
> -#include <xen/lib/x86/cpu-policy.h>
> -
> #include <public/sysctl.h>
>
> extern const uint32_t known_features[FSCAPINTS];
>
> -void init_guest_cpuid(void);
> -
> /*
> * Expected levelling capabilities (given cpuid vendor/family information),
> * and levelling capabilities actually available (given MSR probing).
> @@ -49,13 +45,8 @@ extern struct cpuidmasks cpuidmask_defaults;
> /* Check that all previously present features are still available. */
> bool recheck_cpu_features(unsigned int cpu);
>
> -/* Apply dom0-specific tweaks to the CPUID policy. */
> -void init_dom0_cpuid_policy(struct domain *d);
> -
> -/* Clamp the CPUID policy to reality. */
> -void recalculate_cpuid_policy(struct domain *d);
> -
> struct vcpu;
> +struct cpuid_leaf;
> void guest_cpuid(const struct vcpu *v, uint32_t leaf,
> uint32_t subleaf, struct cpuid_leaf *res);
>
> diff --git a/xen/arch/x86/pv/domain.c b/xen/arch/x86/pv/domain.c
> index f94f28c8e271..95492715d8ad 100644
> --- a/xen/arch/x86/pv/domain.c
> +++ b/xen/arch/x86/pv/domain.c
> @@ -10,6 +10,7 @@
> #include <xen/param.h>
> #include <xen/sched.h>
>
> +#include <asm/cpu-policy.h>
> #include <asm/cpufeature.h>
> #include <asm/invpcid.h>
> #include <asm/spec_ctrl.h>
> diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c
> index 51a19b9019eb..08ade715a3ce 100644
> --- a/xen/arch/x86/setup.c
> +++ b/xen/arch/x86/setup.c
> @@ -51,7 +51,6 @@
> #include <asm/alternative.h>
> #include <asm/mc146818rtc.h>
> #include <asm/cpu-policy.h>
> -#include <asm/cpuid.h>
> #include <asm/spec_ctrl.h>
> #include <asm/guest.h>
> #include <asm/microcode.h>
> @@ -1991,7 +1990,6 @@ void __init noreturn __start_xen(unsigned long mbi_p)
> if ( !tboot_protect_mem_regions() )
> panic("Could not protect TXT memory regions\n");
>
> - init_guest_cpuid();
> init_guest_cpu_policies();
>
> if ( xen_cpuidle )
On 04/04/2023 4:16 pm, Jan Beulich wrote:
> On 04.04.2023 11:52, Andrew Cooper wrote:
>> Switch to the newer cpu_policy nomenclature. Do some easy cleanup of
>> includes.
>>
>> No practical change.
>>
>> Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
>> ---
>> CC: Jan Beulich <JBeulich@suse.com>
>> CC: Roger Pau Monné <roger.pau@citrix.com>
>> CC: Wei Liu <wl@xen.org>
>>
>> v2:
>> * New
>> ---
>> xen/arch/x86/cpu-policy.c | 752 ++++++++++++++++++++++++
>> xen/arch/x86/cpuid.c | 817 +-------------------------
>> xen/arch/x86/hvm/hvm.c | 1 -
>> xen/arch/x86/include/asm/cpu-policy.h | 6 +
>> xen/arch/x86/include/asm/cpuid.h | 11 +-
>> xen/arch/x86/pv/domain.c | 1 +
>> xen/arch/x86/setup.c | 2 -
>> 7 files changed, 764 insertions(+), 826 deletions(-)
>>
>> diff --git a/xen/arch/x86/cpu-policy.c b/xen/arch/x86/cpu-policy.c
>> index f6a2317ed7bd..83186e940ca7 100644
>> --- a/xen/arch/x86/cpu-policy.c
>> +++ b/xen/arch/x86/cpu-policy.c
>> @@ -1,13 +1,19 @@
>> /* SPDX-License-Identifier: GPL-2.0-or-later */
>> #include <xen/cache.h>
>> #include <xen/kernel.h>
>> +#include <xen/param.h>
>> #include <xen/sched.h>
>>
>> #include <xen/lib/x86/cpu-policy.h>
>>
>> +#include <asm/amd.h>
>> #include <asm/cpu-policy.h>
>> +#include <asm/hvm/nestedhvm.h>
>> +#include <asm/hvm/svm/svm.h>
>> #include <asm/msr-index.h>
>> +#include <asm/paging.h>
>> #include <asm/setup.h>
>> +#include <asm/xstate.h>
>>
>> struct cpu_policy __ro_after_init raw_cpu_policy;
>> struct cpu_policy __ro_after_init host_cpu_policy;
>> @@ -20,10 +26,332 @@ struct cpu_policy __ro_after_init hvm_max_cpu_policy;
>> struct cpu_policy __ro_after_init hvm_def_cpu_policy;
>> #endif
>>
>> +const uint32_t known_features[] = INIT_KNOWN_FEATURES;
>> +
>> +static const uint32_t __initconst pv_max_featuremask[] = INIT_PV_MAX_FEATURES;
>> +static const uint32_t hvm_shadow_max_featuremask[] = INIT_HVM_SHADOW_MAX_FEATURES;
>> +static const uint32_t __initconst hvm_hap_max_featuremask[] =
>> + INIT_HVM_HAP_MAX_FEATURES;
>> +static const uint32_t __initconst pv_def_featuremask[] = INIT_PV_DEF_FEATURES;
>> +static const uint32_t __initconst hvm_shadow_def_featuremask[] =
>> + INIT_HVM_SHADOW_DEF_FEATURES;
>> +static const uint32_t __initconst hvm_hap_def_featuremask[] =
>> + INIT_HVM_HAP_DEF_FEATURES;
>> +static const uint32_t deep_features[] = INIT_DEEP_FEATURES;
>> +
>> +static const struct feature_name {
>> + const char *name;
>> + unsigned int bit;
>> +} feature_names[] __initconstrel = INIT_FEATURE_NAMES;
>> +
>> +/*
>> + * Parse a list of cpuid feature names -> bool, calling the callback for any
>> + * matches found.
>> + *
>> + * always_inline, because this is init code only and we really don't want a
>> + * function pointer call in the middle of the loop.
>> + */
>> +static int __init always_inline parse_cpuid(
>> + const char *s, void (*callback)(unsigned int feat, bool val))
>> +{
>> + const char *ss;
>> + int val, rc = 0;
>> +
>> + do {
>> + const struct feature_name *lhs, *rhs, *mid = NULL /* GCC... */;
>> + const char *feat;
>> +
>> + ss = strchr(s, ',');
>> + if ( !ss )
>> + ss = strchr(s, '\0');
>> +
>> + /* Skip the 'no-' prefix for name comparisons. */
>> + feat = s;
>> + if ( strncmp(s, "no-", 3) == 0 )
>> + feat += 3;
>> +
>> + /* (Re)initialise lhs and rhs for binary search. */
>> + lhs = feature_names;
>> + rhs = feature_names + ARRAY_SIZE(feature_names);
>> +
>> + while ( lhs < rhs )
>> + {
>> + int res;
>> +
>> + mid = lhs + (rhs - lhs) / 2;
>> + res = cmdline_strcmp(feat, mid->name);
>> +
>> + if ( res < 0 )
>> + {
>> + rhs = mid;
>> + continue;
>> + }
>> + if ( res > 0 )
>> + {
>> + lhs = mid + 1;
>> + continue;
>> + }
>> +
>> + if ( (val = parse_boolean(mid->name, s, ss)) >= 0 )
>> + {
>> + callback(mid->bit, val);
>> + mid = NULL;
>> + }
>> +
>> + break;
>> + }
>> +
>> + /*
>> + * Mid being NULL means that the name and boolean were successfully
>> + * identified. Everything else is an error.
>> + */
>> + if ( mid )
>> + rc = -EINVAL;
>> +
>> + s = ss + 1;
>> + } while ( *ss );
>> +
>> + return rc;
>> +}
>> +
>> +static void __init cf_check _parse_xen_cpuid(unsigned int feat, bool val)
>> +{
>> + if ( !val )
>> + setup_clear_cpu_cap(feat);
>> + else if ( feat == X86_FEATURE_RDRAND &&
>> + (cpuid_ecx(1) & cpufeat_mask(X86_FEATURE_RDRAND)) )
>> + setup_force_cpu_cap(X86_FEATURE_RDRAND);
>> +}
>> +
>> +static int __init cf_check parse_xen_cpuid(const char *s)
>> +{
>> + return parse_cpuid(s, _parse_xen_cpuid);
>> +}
>> +custom_param("cpuid", parse_xen_cpuid);
>> +
>> +static bool __initdata dom0_cpuid_cmdline;
>> +static uint32_t __initdata dom0_enable_feat[FSCAPINTS];
>> +static uint32_t __initdata dom0_disable_feat[FSCAPINTS];
>> +
>> +static void __init cf_check _parse_dom0_cpuid(unsigned int feat, bool val)
>> +{
>> + __set_bit (feat, val ? dom0_enable_feat : dom0_disable_feat);
>> + __clear_bit(feat, val ? dom0_disable_feat : dom0_enable_feat );
>> +}
>> +
>> +static int __init cf_check parse_dom0_cpuid(const char *s)
>> +{
>> + dom0_cpuid_cmdline = true;
>> +
>> + return parse_cpuid(s, _parse_dom0_cpuid);
>> +}
>> +custom_param("dom0-cpuid", parse_dom0_cpuid);
> Unless the plan is to completely remove cpuid.c, this command line
> handling would imo fit better there. I understand that to keep
> dom0_{en,dis}able_feat[] static, the _parse_dom0_cpuid() helper
> would then need to be exposed (under a different name), but I think
> that's quite okay, all the more so since it's an __init function.
I'm not sure I agree. (I did debate this for a while before moving the
cmdline parsing.)
I do have some cleanup plans which will move code into cpuid.c, and
guest_cpuid() absolutely still lives there, but for these options
specifically, the moment I add MSR_ARCH_CAPS into a featureset, their
bit names will work here too.
So arguably {dom0-}cpuid= won't be great names moving forwards, but this
is absolutely more cpu-policy.c content than cpuid.c content.
We can't get rid of the existing cmdline names, and I think documenting
our way out of the "it's not only CPUID bits any more" problem is better
than adding yet another name.
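E.g. with the parser as it stands, both of

  cpuid=no-avx512f,rdrand=1
  dom0-cpuid=no-itsc

already parse (assuming those feature names are in the generated
INIT_FEATURE_NAMES table), and an ARCH_CAPS bit name would slot into
exactly the same name/boolean grammar once the MSR is in a featureset.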
>> @@ -149,3 +716,188 @@ int init_domain_cpu_policy(struct domain *d)
>>
>> return 0;
>> }
>> +
>> +void recalculate_cpuid_policy(struct domain *d)
>> +{
>> + struct cpu_policy *p = d->arch.cpuid;
>> + const struct cpu_policy *max = is_pv_domain(d)
>> + ? (IS_ENABLED(CONFIG_PV) ? &pv_max_cpu_policy : NULL)
>> + : (IS_ENABLED(CONFIG_HVM) ? &hvm_max_cpu_policy : NULL);
> While this is how the original code was, wouldn't this want to use
> hvm_enabled, just like init_guest_cpu_policies() does (patch 10)?
No. That will fail to link.
This trickery is necessary to drop the compiler-visible reference to
hvm_max_cpu_policy in !CONFIG_HVM builds.
This function is only called after the domain type has already been
established, which precludes calling it in a case where max will
evaluate to NULL, hence the ASSERT_UNREACHABLE() just below.
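To illustrate with a standalone sketch (stand-in names and a hardcoded
!HVM configuration, not the real headers):

  #include <stdbool.h>
  #include <stddef.h>

  struct cpu_policy { int x; };
  extern struct cpu_policy hvm_max_cpu_policy; /* absent in !HVM builds */
  extern bool hvm_enabled;                     /* runtime state */

  #define IS_ENABLED_HVM 0 /* stand-in for IS_ENABLED(CONFIG_HVM) */

  const struct cpu_policy *max_by_config(void)
  {
      /* Constant-folds to NULL; no reference to the symbol is emitted. */
      return IS_ENABLED_HVM ? &hvm_max_cpu_policy : NULL;
  }

  const struct cpu_policy *max_by_runtime(void)
  {
      /* Keeps an undefined reference to hvm_max_cpu_policy => link error. */
      return hvm_enabled ? &hvm_max_cpu_policy : NULL;
  }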
~Andrew
On 04.04.2023 17:45, Andrew Cooper wrote:
> On 04/04/2023 4:16 pm, Jan Beulich wrote:
>> On 04.04.2023 11:52, Andrew Cooper wrote:
>>> [...]
>>> +static void __init cf_check _parse_xen_cpuid(unsigned int feat, bool val)
>>> +{
>>> + if ( !val )
>>> + setup_clear_cpu_cap(feat);
>>> + else if ( feat == X86_FEATURE_RDRAND &&
>>> + (cpuid_ecx(1) & cpufeat_mask(X86_FEATURE_RDRAND)) )
>>> + setup_force_cpu_cap(X86_FEATURE_RDRAND);
>>> +}
>>> +
>>> +static int __init cf_check parse_xen_cpuid(const char *s)
>>> +{
>>> + return parse_cpuid(s, _parse_xen_cpuid);
>>> +}
>>> +custom_param("cpuid", parse_xen_cpuid);
>>> +
>>> +static bool __initdata dom0_cpuid_cmdline;
>>> +static uint32_t __initdata dom0_enable_feat[FSCAPINTS];
>>> +static uint32_t __initdata dom0_disable_feat[FSCAPINTS];
>>> +
>>> +static void __init cf_check _parse_dom0_cpuid(unsigned int feat, bool val)
>>> +{
>>> + __set_bit (feat, val ? dom0_enable_feat : dom0_disable_feat);
>>> + __clear_bit(feat, val ? dom0_disable_feat : dom0_enable_feat );
>>> +}
>>> +
>>> +static int __init cf_check parse_dom0_cpuid(const char *s)
>>> +{
>>> + dom0_cpuid_cmdline = true;
>>> +
>>> + return parse_cpuid(s, _parse_dom0_cpuid);
>>> +}
>>> +custom_param("dom0-cpuid", parse_dom0_cpuid);
>> Unless the plan is to completely remove cpuid.c, this command line
>> handling would imo better fit there. I understand that to keep
>> dom0_{en,dis}able_feat[] static, the _parse_dom0_cpuid() helper
>> would then need to be exposed (under a different name), but I think
>> that's quite okay, all the more so since it's an __init function.
>
> I'm not sure I agree. (I did debate this for a while before moving the
> cmdline parsing.)
>
> I do have some cleanup plans which will move code into cpuid.c, and
> guest_cpuid() absolutely still lives there, but for these options
> specifically, the moment I add MSR_ARCH_CAPS into a featureset, their
> bit names will work here too.
>
> So arguably {dom0-}cpuid= won't be great names moving forwards, but this
> is absolutely more cpu-policy.c content than cpuid.c content.
>
> We can't get rid of the existing cmdline names, and I think documenting
> our way out of the "it's not only CPUID bits any more" problem is better
> than adding yet another name.
Hmm, yes:
Acked-by: Jan Beulich <jbeulich@suse.com>
>>> @@ -149,3 +716,188 @@ int init_domain_cpu_policy(struct domain *d)
>>>
>>> return 0;
>>> }
>>> +
>>> +void recalculate_cpuid_policy(struct domain *d)
>>> +{
>>> + struct cpu_policy *p = d->arch.cpuid;
>>> + const struct cpu_policy *max = is_pv_domain(d)
>>> + ? (IS_ENABLED(CONFIG_PV) ? &pv_max_cpu_policy : NULL)
>>> + : (IS_ENABLED(CONFIG_HVM) ? &hvm_max_cpu_policy : NULL);
>> While this is how the original code was, wouldn't this want to use
>> hvm_enabled, just like init_guest_cpu_policies() does (patch 10)?
>
> No. That will fail to link.
Why? hvm_enabled is a #define (to false) only when !HVM.
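For reference, a sketch of the arrangement being alluded to (an
illustration, not a verbatim quote of the header):

    #ifdef CONFIG_HVM
    extern bool hvm_enabled;    /* real variable, set if VMX/SVM is found */
    #else
    #define hvm_enabled false   /* compile-time constant when !CONFIG_HVM */
    #endif

With that, hvm_enabled ? &hvm_max_cpu_policy : NULL would fold, and
drop the symbol reference, just the same in a !CONFIG_HVM build.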
> This trickery is necessary to drop the compiler-visible reference to
> hvm_max_cpu_policy in !CONFIG_HVM builds.
>
> This function is only called after the domain type has already been
> established, which precludes calling it in a case where max will
> evaluate to NULL, hence the ASSERT_UNREACHABLE() just below.
Right, and this will hold when HVM=y but no VMX/SVM was found.
Jan
On 04/04/2023 5:14 pm, Jan Beulich wrote:
> On 04.04.2023 17:45, Andrew Cooper wrote:
>> On 04/04/2023 4:16 pm, Jan Beulich wrote:
>>> On 04.04.2023 11:52, Andrew Cooper wrote:
> [...]
> Hmm, yes:
> Acked-by: Jan Beulich <jbeulich@suse.com>
Thanks.
>
>>>> [...]
>>>> +void recalculate_cpuid_policy(struct domain *d)
>>>> +{
>>>> + struct cpu_policy *p = d->arch.cpuid;
>>>> + const struct cpu_policy *max = is_pv_domain(d)
>>>> + ? (IS_ENABLED(CONFIG_PV) ? &pv_max_cpu_policy : NULL)
>>>> + : (IS_ENABLED(CONFIG_HVM) ? &hvm_max_cpu_policy : NULL);
>>> While this is how the original code was, wouldn't this want to use
>>> hvm_enabled, just like init_guest_cpu_policies() does (patch 10)?
>> No. That will fail to link.
> Why? hvm_enabled is a #define (to false) only when !HVM.
Hmm, maybe.

But honestly, I want to keep the code as it is, because this is intended
to be pure code movement, and because it's currently symmetric between
the two cases.
~Andrew