From: Amit Shah <amit.shah@amd.com>

AMD CPUs with the ERAPS feature (Turin+) have a larger RSB (aka RAP).
While the host uses the new, larger RSB size without any software
changes, RSB usage for guests is limited to the older size (32 entries)
for backwards compatibility. With this patch, KVM enables guest mode to
also use the full default number of entries by setting the new
ALLOW_LARGER_RAP bit in the VMCB.

Two cases need special handling for backwards compatibility: nested
guests, and guests using shadow paging (i.e. when NPT is disabled).

For nested guests: the ERAPS feature adds host/guest tagging to entries
in the RSB, but does not distinguish between ASIDs. On a nested exit,
the L0 hypervisor instructs the microcode (via another new VMCB bit,
FLUSH_RAP_ON_VMRUN) to flush the RSB on the next VMRUN to prevent RSB
poisoning attacks from an L2 guest against an L1 guest. With that in
place, this feature can be exposed to guests.

For shadow paging guests: do not expose this feature; expose it only
when nested paging is enabled, so that context switches within a guest
trigger TLB flushes on the CPU and thereby flush the guest's RSB
entries. With shadow paging, the CPU's CR3 is not used for guest
processes, and hence those guests cannot benefit from this feature.

Signed-off-by: Amit Shah <amit.shah@amd.com>
---
 arch/x86/include/asm/svm.h |  6 +++++-
 arch/x86/kvm/cpuid.c       | 15 ++++++++++++-
 arch/x86/kvm/svm/svm.c     | 44 ++++++++++++++++++++++++++++++++++++++
 arch/x86/kvm/svm/svm.h     | 15 +++++++++++++
 4 files changed, 78 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 2b59b9951c90..f8584a63c859 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -129,7 +129,8 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
 	u64 tsc_offset;
 	u32 asid;
 	u8 tlb_ctl;
-	u8 reserved_2[3];
+	u8 erap_ctl;
+	u8 reserved_2[2];
 	u32 int_ctl;
 	u32 int_vector;
 	u32 int_state;
@@ -175,6 +176,9 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
 #define TLB_CONTROL_FLUSH_ASID 3
 #define TLB_CONTROL_FLUSH_ASID_LOCAL 7
 
+#define ERAP_CONTROL_ALLOW_LARGER_RAP	0
+#define ERAP_CONTROL_FLUSH_RAP		1
+
 #define V_TPR_MASK 0x0f
 
 #define V_IRQ_SHIFT 8

diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 41786b834b16..2c2a60964a2e 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -797,6 +797,8 @@ void kvm_set_cpu_caps(void)
 		F(WRMSR_XX_BASE_NS)
 	);
 
+	if (tdp_enabled)
+		kvm_cpu_cap_check_and_set(X86_FEATURE_ERAPS);
 	kvm_cpu_cap_check_and_set(X86_FEATURE_SBPB);
 	kvm_cpu_cap_check_and_set(X86_FEATURE_IBPB_BRTYPE);
 	kvm_cpu_cap_check_and_set(X86_FEATURE_SRSO_NO);
@@ -1357,8 +1359,19 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
 		entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
 		break;
 	case 0x80000021:
-		entry->ebx = entry->ecx = entry->edx = 0;
+		unsigned int ebx_mask = 0;
+
+		entry->ecx = entry->edx = 0;
 		cpuid_entry_override(entry, CPUID_8000_0021_EAX);
+
+		/*
+		 * Bits 23:16 in EBX indicate the size of the RSB.
+		 * Expose the value in the hardware to the guest.
+		 */
+		if (kvm_cpu_cap_has(X86_FEATURE_ERAPS))
+			ebx_mask |= GENMASK(23, 16);
+
+		entry->ebx &= ebx_mask;
 		break;
 	/* AMD Extended Performance Monitoring and Debug */
 	case 0x80000022: {

diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 9df3e1e5ae81..ecd290ff38f8 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1360,6 +1360,28 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
 	if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
 		set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
 
+	/*
+	 * If the hardware has a larger RSB, use it in the guest context as
+	 * well.
+	 *
+	 * When running nested guests: the hardware tags host and guest RSB
+	 * entries, but the entries are ASID agnostic. Differentiating L1 and
+	 * L2 guests isn't possible in hardware. To prevent L2->L1 RSB
+	 * poisoning attacks in this case, the L0 hypervisor must set
+	 * FLUSH_RAP_ON_VMRUN in the L1's VMCB on a nested #VMEXIT to ensure
+	 * the next VMRUN flushes the RSB.
+	 *
+	 * For the shadow paging / NPT disabled case: the CPU's CR3 does not
+	 * contain the CR3 of the running guest process, and hence intra-guest
+	 * context switches will not cause a hardware TLB flush, which in turn
+	 * means no guest RSB flush of the kind the ERAPS feature provides.
+	 * Do not expose ERAPS or the larger RSB to the guest in this case,
+	 * so the guest continues to implement software mitigations and also
+	 * only sees 32 RSB entries.
+	 */
+	if (boot_cpu_has(X86_FEATURE_ERAPS) && npt_enabled)
+		vmcb_set_larger_rap(svm->vmcb);
+
 	if (kvm_vcpu_apicv_active(vcpu))
 		avic_init_vmcb(svm, vmcb);

@@ -3393,6 +3415,7 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset);
pr_err("%-20s%d\n", "asid:", control->asid);
pr_err("%-20s%d\n", "tlb_ctl:", control->tlb_ctl);
+ pr_err("%-20s%d\n", "erap_ctl:", control->erap_ctl);
pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl);
pr_err("%-20s%08x\n", "int_vector:", control->int_vector);
pr_err("%-20s%08x\n", "int_state:", control->int_state);
@@ -3559,6 +3582,27 @@ static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
 	trace_kvm_nested_vmexit(vcpu, KVM_ISA_SVM);
 
+	if (boot_cpu_has(X86_FEATURE_ERAPS)
+	    && vmcb_is_larger_rap(svm->vmcb01.ptr)) {
+		/*
+		 * XXX a few further optimizations can be made:
+		 *
+		 * 1. In pre_svm_run() we can reset this bit when a hw
+		 * TLB flush has happened - any context switch on a
+		 * CPU (which causes a TLB flush) auto-flushes the RSB
+		 * - eg when this vCPU is scheduled on a different
+		 * pCPU.
+		 *
+		 * 2. This is also not needed in the case where the
+		 * vCPU is being scheduled on the same pCPU, but there
+		 * was a context switch between the #VMEXIT and VMRUN.
+		 *
+		 * 3. If the guest returns to L2 again after this
+		 * #VMEXIT, there's no need to flush the RSB.
+		 */
+		vmcb_set_flush_rap(svm->vmcb01.ptr);
+	}
+
 	vmexit = nested_svm_exit_special(svm);
 
 	if (vmexit == NESTED_EXIT_CONTINUE)
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 43fa6a16eb19..8a7877f46dc5 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -500,6 +500,21 @@ static inline bool svm_is_intercept(struct vcpu_svm *svm, int bit)
 	return vmcb_is_intercept(&svm->vmcb->control, bit);
 }
 
+static inline void vmcb_set_flush_rap(struct vmcb *vmcb)
+{
+	__set_bit(ERAP_CONTROL_FLUSH_RAP, (unsigned long *)&vmcb->control.erap_ctl);
+}
+
+static inline void vmcb_set_larger_rap(struct vmcb *vmcb)
+{
+	__set_bit(ERAP_CONTROL_ALLOW_LARGER_RAP, (unsigned long *)&vmcb->control.erap_ctl);
+}
+
+static inline bool vmcb_is_larger_rap(struct vmcb *vmcb)
+{
+	return test_bit(ERAP_CONTROL_ALLOW_LARGER_RAP, (unsigned long *)&vmcb->control.erap_ctl);
+}
+
 static inline bool nested_vgif_enabled(struct vcpu_svm *svm)
 {
 	return guest_can_use(&svm->vcpu, X86_FEATURE_VGIF) &&
--
2.47.0

On Thu, Oct 31, 2024 at 04:39:25PM +0100, Amit Shah wrote:
> +	if (boot_cpu_has(X86_FEATURE_ERAPS) && npt_enabled)

s/boot_cpu_has/cpu_feature_enabled/g

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette

On Mon, 2024-11-04 at 06:18 +0100, Borislav Petkov wrote:
> On Thu, Oct 31, 2024 at 04:39:25PM +0100, Amit Shah wrote:
> > +	if (boot_cpu_has(X86_FEATURE_ERAPS) && npt_enabled)
> 
> s/boot_cpu_has/cpu_feature_enabled/g

ACK
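With that substitution, the check in init_vmcb() would read something
like this (a sketch only; the final form is up to the next revision of
the patch):

	/* Gate the larger guest RSB on ERAPS and NPT being enabled. */
	if (cpu_feature_enabled(X86_FEATURE_ERAPS) && npt_enabled)
		vmcb_set_larger_rap(svm->vmcb);
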
Hi Amit,

kernel test robot noticed the following build warnings:

[auto build test WARNING on kvm/queue]
[also build test WARNING on mst-vhost/linux-next tip/master tip/x86/core linus/master v6.12-rc5 next-20241031]
[cannot apply to kvm/linux-next tip/auto-latest]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Amit-Shah/x86-cpu-bugs-add-support-for-AMD-ERAPS-feature/20241031-234332
base:   https://git.kernel.org/pub/scm/virt/kvm/kvm.git queue
patch link:    https://lore.kernel.org/r/20241031153925.36216-3-amit%40kernel.org
patch subject: [PATCH 2/2] x86: kvm: svm: add support for ERAPS and FLUSH_RAP_ON_VMRUN
config: x86_64-kexec (https://download.01.org/0day-ci/archive/20241101/202411011119.l3yRJpht-lkp@intel.com/config)
compiler: clang version 19.1.3 (https://github.com/llvm/llvm-project ab51eccf88f5321e7c60591c5546b254b6afab99)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20241101/202411011119.l3yRJpht-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202411011119.l3yRJpht-lkp@intel.com/

All warnings (new ones prefixed by >>):

   In file included from arch/x86/kvm/cpuid.c:13:
   In file included from include/linux/kvm_host.h:16:
   In file included from include/linux/mm.h:2213:
   include/linux/vmstat.h:504:43: warning: arithmetic between different enumeration types ('enum zone_stat_item' and 'enum numa_stat_item') [-Wenum-enum-conversion]
   include/linux/vmstat.h:511:43: warning: arithmetic between different enumeration types ('enum zone_stat_item' and 'enum numa_stat_item') [-Wenum-enum-conversion]
   include/linux/vmstat.h:518:36: warning: arithmetic between different enumeration types ('enum node_stat_item' and 'enum lru_list') [-Wenum-enum-conversion]
   include/linux/vmstat.h:524:43: warning: arithmetic between different enumeration types ('enum zone_stat_item' and 'enum numa_stat_item') [-Wenum-enum-conversion]
>> arch/x86/kvm/cpuid.c:1362:3: warning: label followed by a declaration is a C23 extension [-Wc23-extensions]
    1362 |                 unsigned int ebx_mask = 0;
         |                 ^
   5 warnings generated.


vim +1362 arch/x86/kvm/cpuid.c

[...]
  1358		case 0x80000020:
  1359			entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
  1360			break;
  1361		case 0x80000021:
> 1362			unsigned int ebx_mask = 0;
  1363	
  1364			entry->ecx = entry->edx = 0;
  1365			cpuid_entry_override(entry, CPUID_8000_0021_EAX);
  1366	
  1367			/*
  1368			 * Bits 23:16 in EBX indicate the size of the RSB.
  1369			 * Expose the value in the hardware to the guest.
  1370			 */
  1371			if (kvm_cpu_cap_has(X86_FEATURE_ERAPS))
  1372				ebx_mask |= GENMASK(23, 16);
  1373	
  1374			entry->ebx &= ebx_mask;
  1375			break;
[...]

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

On Fri, 2024-11-01 at 12:14 +0800, kernel test robot wrote:
> Hi Amit,
> 
> kernel test robot noticed the following build warnings:
> 
> [auto build test WARNING on kvm/queue]
> [also build test WARNING on mst-vhost/linux-next tip/master
> tip/x86/core linus/master v6.12-rc5 next-20241031]
> [cannot apply to kvm/linux-next tip/auto-latest]

[...]

> > > arch/x86/kvm/cpuid.c:1362:3: warning: label followed by a
> > > declaration is a C23 extension [-Wc23-extensions]
>     1362 |                 unsigned int ebx_mask = 0;
>          |                 ^
>    5 warnings generated.

[...]

>   1361		case 0x80000021:
> > 1362			unsigned int ebx_mask = 0;
>   1363	
>   1364			entry->ecx = entry->edx = 0;
>   1365			cpuid_entry_override(entry, CPUID_8000_0021_EAX);
>   1366	
>   1367			/*
>   1368			 * Bits 23:16 in EBX indicate the size of
>   1369			 * the RSB. Expose the value in the hardware
>   1370			 * to the guest.
>   1371			 */
>   1372			if (kvm_cpu_cap_has(X86_FEATURE_ERAPS))
>   1373				ebx_mask |= GENMASK(23, 16);
>   1374	
>   1375			entry->ebx &= ebx_mask;
>   1376			break;

Right - I'll add braces around this case statement.

		Amit
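For reference, a braced version of that case statement, which moves the
declaration out of the label position and so avoids the
-Wc23-extensions warning, could look something like this (a sketch; the
actual v2 hunk may differ):

	case 0x80000021: {
		unsigned int ebx_mask = 0;

		entry->ecx = entry->edx = 0;
		cpuid_entry_override(entry, CPUID_8000_0021_EAX);

		/*
		 * Bits 23:16 in EBX indicate the size of the RSB.
		 * Expose the value in the hardware to the guest.
		 */
		if (kvm_cpu_cap_has(X86_FEATURE_ERAPS))
			ebx_mask |= GENMASK(23, 16);

		entry->ebx &= ebx_mask;
		break;
	}
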
On Thu, Oct 31, 2024 at 04:39:25PM +0100, Amit Shah wrote:
> @@ -3559,6 +3582,27 @@ static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
> 
>  	trace_kvm_nested_vmexit(vcpu, KVM_ISA_SVM);
> 
> +	if (boot_cpu_has(X86_FEATURE_ERAPS)
> +	    && vmcb_is_larger_rap(svm->vmcb01.ptr)) {
	    ^

This should be at the end of the previous line.
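IOW, keep the operator at the end of the first line, something like:

	if (boot_cpu_has(X86_FEATURE_ERAPS) &&
	    vmcb_is_larger_rap(svm->vmcb01.ptr)) {

(and with the earlier review comment applied, boot_cpu_has() would
become cpu_feature_enabled() here too.)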