QEMU uses the kvm_get_msrs() function to save Intel PMU registers from KVM
and kvm_put_msrs() to restore them to KVM. However, there is no support for
AMD PMU registers. Currently, pmu_version and num_pmu_gp_counters are
initialized based on cpuid(0xa), which does not apply to AMD processors.
For AMD CPUs, prior to PerfMonV2, the number of general-purpose counters
is determined by the CPU family and the PERFCORE feature.
To address this issue, we need to add support for AMD PMU registers.
Without this support, the following problems can arise:
1. If the VM is reset (e.g., via QEMU system_reset or VM kdump/kexec) while
running "perf top", the PMU registers are not disabled properly.
2. Despite x86_cpu_reset() resetting many registers to zero, kvm_put_msrs()
does not handle AMD PMU registers, causing some PMU events to remain
enabled in KVM.
3. The KVM kvm_pmc_speculative_in_use() function consistently returns true,
preventing the reclamation of these events. Consequently, the
kvm_pmc->perf_event remains active.
4. After a reboot, the VM kernel may report the following error:
[ 0.092011] Performance Events: Fam17h+ core perfctr, Broken BIOS detected, complain to your hardware vendor.
[ 0.092023] [Firmware Bug]: the BIOS has corrupted hw-PMU resources (MSR c0010200 is 530076)
5. In the worst case, the active kvm_pmc->perf_event may inject unknown
NMIs randomly into the VM kernel:
[...] Uhhuh. NMI received for unknown reason 30 on CPU 0.
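For reference, a minimal way to reproduce problems 1 and 4 (assuming a Linux
guest with the perf tool installed):

    # in the guest
    perf top
    # on the host, in the QEMU monitor, while "perf top" is still running
    (qemu) system_reset
    # after the guest reboots, dmesg may show the "Broken BIOS" warning above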
To resolve these issues, we propose resetting AMD PMU registers during the
VM reset process.
Signed-off-by: Dongli Zhang <dongli.zhang@oracle.com>
---
Changed since v1:
- Modify "MSR_K7_EVNTSEL0 + 3" and "MSR_K7_PERFCTR0 + 3" by using
AMD64_NUM_COUNTERS (suggested by Sandipan Das).
- Use "AMD64_NUM_COUNTERS_CORE * 2 - 1", not "MSR_F15H_PERF_CTL0 + 0xb".
(suggested by Sandipan Das).
- Switch back to "-pmu" instead of using a global "pmu-cap-disabled".
- Don't initialize PMU info if kvm.enable_pmu=N.
Changed since v2:
- Remove 'static' from host_cpuid_vendorX.
- Change has_pmu_version to pmu_version.
- Use object_property_get_int() to get CPU family.
- Use cpuid_find_entry() instead of cpu_x86_cpuid().
- Send error log when host and guest are from different vendors.
- Move "if (!cpu->enable_pmu)" to begin of function. Add comments to
reminder developers.
- Add support to Zhaoxin. Change is_same_vendor() to
is_host_compat_vendor().
- Didn't add Reviewed-by from Sandipan because the change isn't minor.
TODO:
- This patch adds is_host_compat_vendor(), while there are already helpers
like is_host_cpu_intel() in target/i386/kvm/vmsr_energy.c. A rework
may help move those helpers to target/i386/cpu*.
target/i386/cpu.h | 8 ++
target/i386/kvm/kvm.c | 176 +++++++++++++++++++++++++++++++++++++++++-
2 files changed, 180 insertions(+), 4 deletions(-)
diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index 76f24446a5..84e497f5d3 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -490,6 +490,14 @@ typedef enum X86Seg {
#define MSR_CORE_PERF_GLOBAL_CTRL 0x38f
#define MSR_CORE_PERF_GLOBAL_OVF_CTRL 0x390
+#define MSR_K7_EVNTSEL0 0xc0010000
+#define MSR_K7_PERFCTR0 0xc0010004
+#define MSR_F15H_PERF_CTL0 0xc0010200
+#define MSR_F15H_PERF_CTR0 0xc0010201
+
+#define AMD64_NUM_COUNTERS 4
+#define AMD64_NUM_COUNTERS_CORE 6
+
#define MSR_MC0_CTL 0x400
#define MSR_MC0_STATUS 0x401
#define MSR_MC0_ADDR 0x402
diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
index f68d5a0578..3a35fd741d 100644
--- a/target/i386/kvm/kvm.c
+++ b/target/i386/kvm/kvm.c
@@ -2087,7 +2087,7 @@ int kvm_arch_pre_create_vcpu(CPUState *cpu, Error **errp)
return 0;
}
-static void kvm_init_pmu_info(struct kvm_cpuid2 *cpuid)
+static void kvm_init_pmu_info_intel(struct kvm_cpuid2 *cpuid)
{
struct kvm_cpuid_entry2 *c;
@@ -2120,6 +2120,97 @@ static void kvm_init_pmu_info(struct kvm_cpuid2 *cpuid)
}
}
+static void kvm_init_pmu_info_amd(struct kvm_cpuid2 *cpuid, X86CPU *cpu)
+{
+ struct kvm_cpuid_entry2 *c;
+ int64_t family;
+
+ family = object_property_get_int(OBJECT(cpu), "family", NULL);
+ if (family < 0) {
+ return;
+ }
+
+ if (family < 6) {
+ error_report("AMD performance-monitoring is supported from "
+ "K7 and later");
+ return;
+ }
+
+ pmu_version = 1;
+ num_pmu_gp_counters = AMD64_NUM_COUNTERS;
+
+ c = cpuid_find_entry(cpuid, 0x80000001, 0);
+ if (!c) {
+ return;
+ }
+
+ if (!(c->ecx & CPUID_EXT3_PERFCORE)) {
+ return;
+ }
+
+ num_pmu_gp_counters = AMD64_NUM_COUNTERS_CORE;
+}
+
+static bool is_host_compat_vendor(CPUX86State *env)
+{
+ char host_vendor[CPUID_VENDOR_SZ + 1];
+ uint32_t host_cpuid_vendor1;
+ uint32_t host_cpuid_vendor2;
+ uint32_t host_cpuid_vendor3;
+
+ host_cpuid(0x0, 0, NULL, &host_cpuid_vendor1, &host_cpuid_vendor3,
+ &host_cpuid_vendor2);
+
+ x86_cpu_vendor_words2str(host_vendor, host_cpuid_vendor1,
+ host_cpuid_vendor2, host_cpuid_vendor3);
+
+ /*
+ * Intel and Zhaoxin are compatible.
+ */
+ if ((g_str_equal(host_vendor, CPUID_VENDOR_INTEL) ||
+ g_str_equal(host_vendor, CPUID_VENDOR_ZHAOXIN1) ||
+ g_str_equal(host_vendor, CPUID_VENDOR_ZHAOXIN2)) &&
+ (IS_INTEL_CPU(env) || IS_ZHAOXIN_CPU(env))) {
+ return true;
+ }
+
+ return env->cpuid_vendor1 == host_cpuid_vendor1 &&
+ env->cpuid_vendor2 == host_cpuid_vendor2 &&
+ env->cpuid_vendor3 == host_cpuid_vendor3;
+}
+
+static void kvm_init_pmu_info(struct kvm_cpuid2 *cpuid, X86CPU *cpu)
+{
+ CPUX86State *env = &cpu->env;
+
+ /*
+ * If KVM_CAP_PMU_CAPABILITY is not supported, there is no way to
+ * disable the AMD PMU virtualization.
+ *
+ * Assume the user is aware of this when !cpu->enable_pmu: the AMD PMU
+ * registers will not be reset, even though they are still available to
+ * the guest VM.
+ */
+ if (!cpu->enable_pmu) {
+ return;
+ }
+
+ /*
+ * It is not supported to virtualize AMD PMU registers on Intel
+ * processors, nor to virtualize Intel PMU registers on AMD processors.
+ */
+ if (!is_host_compat_vendor(env)) {
+ error_report("host doesn't support requested feature: vPMU");
+ return;
+ }
+
+ if (IS_INTEL_CPU(env) || IS_ZHAOXIN_CPU(env)) {
+ kvm_init_pmu_info_intel(cpuid);
+ } else if (IS_AMD_CPU(env)) {
+ kvm_init_pmu_info_amd(cpuid, cpu);
+ }
+}
+
int kvm_arch_init_vcpu(CPUState *cs)
{
struct {
@@ -2302,7 +2393,7 @@ int kvm_arch_init_vcpu(CPUState *cs)
cpuid_i = kvm_x86_build_cpuid(env, cpuid_data.entries, cpuid_i);
cpuid_data.cpuid.nent = cpuid_i;
- kvm_init_pmu_info(&cpuid_data.cpuid);
+ kvm_init_pmu_info(&cpuid_data.cpuid, cpu);
if (((env->cpuid_version >> 8)&0xF) >= 6
&& (env->features[FEAT_1_EDX] & (CPUID_MCE | CPUID_MCA)) ==
@@ -4066,7 +4157,7 @@ static int kvm_put_msrs(X86CPU *cpu, int level)
kvm_msr_entry_add(cpu, MSR_KVM_POLL_CONTROL, env->poll_control_msr);
}
- if (pmu_version > 0) {
+ if ((IS_INTEL_CPU(env) || IS_ZHAOXIN_CPU(env)) && pmu_version > 0) {
if (pmu_version > 1) {
/* Stop the counter. */
kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR_CTRL, 0);
@@ -4097,6 +4188,38 @@ static int kvm_put_msrs(X86CPU *cpu, int level)
env->msr_global_ctrl);
}
}
+
+ if (IS_AMD_CPU(env) && pmu_version > 0) {
+ uint32_t sel_base = MSR_K7_EVNTSEL0;
+ uint32_t ctr_base = MSR_K7_PERFCTR0;
+ /*
+ * The address of the next selector or counter register is
+ * obtained by incrementing the address of the current selector
+ * or counter register by one.
+ */
+ uint32_t step = 1;
+
+ /*
+ * When PERFCORE is enabled, AMD PMU uses a separate set of
+ * addresses for the selector and counter registers.
+ * Additionally, the address of the next selector or counter
+ * register is determined by incrementing the address of the
+ * current register by two.
+ */
+ if (num_pmu_gp_counters == AMD64_NUM_COUNTERS_CORE) {
+ sel_base = MSR_F15H_PERF_CTL0;
+ ctr_base = MSR_F15H_PERF_CTR0;
+ step = 2;
+ }
+
+ for (i = 0; i < num_pmu_gp_counters; i++) {
+ kvm_msr_entry_add(cpu, ctr_base + i * step,
+ env->msr_gp_counters[i]);
+ kvm_msr_entry_add(cpu, sel_base + i * step,
+ env->msr_gp_evtsel[i]);
+ }
+ }
+
/*
* Hyper-V partition-wide MSRs: to avoid clearing them on cpu hot-add,
* only sync them to KVM on the first cpu
@@ -4544,7 +4667,8 @@ static int kvm_get_msrs(X86CPU *cpu)
if (env->features[FEAT_KVM] & CPUID_KVM_POLL_CONTROL) {
kvm_msr_entry_add(cpu, MSR_KVM_POLL_CONTROL, 1);
}
- if (pmu_version > 0) {
+
+ if ((IS_INTEL_CPU(env) || IS_ZHAOXIN_CPU(env)) && pmu_version > 0) {
if (pmu_version > 1) {
kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR_CTRL, 0);
kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_CTRL, 0);
@@ -4560,6 +4684,35 @@ static int kvm_get_msrs(X86CPU *cpu)
}
}
+ if (IS_AMD_CPU(env) && pmu_version > 0) {
+ uint32_t sel_base = MSR_K7_EVNTSEL0;
+ uint32_t ctr_base = MSR_K7_PERFCTR0;
+ /*
+ * The address of the next selector or counter register is
+ * obtained by incrementing the address of the current selector
+ * or counter register by one.
+ */
+ uint32_t step = 1;
+
+ /*
+ * When PERFCORE is enabled, AMD PMU uses a separate set of
+ * addresses for the selector and counter registers.
+ * Additionally, the address of the next selector or counter
+ * register is determined by incrementing the address of the
+ * current register by two.
+ */
+ if (num_pmu_gp_counters == AMD64_NUM_COUNTERS_CORE) {
+ sel_base = MSR_F15H_PERF_CTL0;
+ ctr_base = MSR_F15H_PERF_CTR0;
+ step = 2;
+ }
+
+ for (i = 0; i < num_pmu_gp_counters; i++) {
+ kvm_msr_entry_add(cpu, ctr_base + i * step, 0);
+ kvm_msr_entry_add(cpu, sel_base + i * step, 0);
+ }
+ }
+
if (env->mcg_cap) {
kvm_msr_entry_add(cpu, MSR_MCG_STATUS, 0);
kvm_msr_entry_add(cpu, MSR_MCG_CTL, 0);
@@ -4871,6 +5024,21 @@ static int kvm_get_msrs(X86CPU *cpu)
case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL0 + MAX_GP_COUNTERS - 1:
env->msr_gp_evtsel[index - MSR_P6_EVNTSEL0] = msrs[i].data;
break;
+ case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL0 + AMD64_NUM_COUNTERS - 1:
+ env->msr_gp_evtsel[index - MSR_K7_EVNTSEL0] = msrs[i].data;
+ break;
+ case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR0 + AMD64_NUM_COUNTERS - 1:
+ env->msr_gp_counters[index - MSR_K7_PERFCTR0] = msrs[i].data;
+ break;
+ case MSR_F15H_PERF_CTL0 ...
+ MSR_F15H_PERF_CTL0 + AMD64_NUM_COUNTERS_CORE * 2 - 1:
+ index = index - MSR_F15H_PERF_CTL0;
+ if (index & 0x1) {
+ env->msr_gp_counters[index / 2] = msrs[i].data;
+ } else {
+ env->msr_gp_evtsel[index / 2] = msrs[i].data;
+ }
+ break;
case HV_X64_MSR_HYPERCALL:
env->msr_hv_hypercall = msrs[i].data;
break;
--
2.39.3
...
> TODO:
> - This patch adds is_host_compat_vendor(), while there are something
> like is_host_cpu_intel() from target/i386/kvm/vmsr_energy.c. A rework
> may help move those helpers to target/i386/cpu*.
vmsr_energy emulates RAPL in user space... but RAPL is not architectural
(no CPUID), so this case doesn't need to consider the "compat" vendor.
> target/i386/cpu.h | 8 ++
> target/i386/kvm/kvm.c | 176 +++++++++++++++++++++++++++++++++++++++++-
> 2 files changed, 180 insertions(+), 4 deletions(-)
...
> +static bool is_host_compat_vendor(CPUX86State *env)
> +{
> + char host_vendor[CPUID_VENDOR_SZ + 1];
> + uint32_t host_cpuid_vendor1;
> + uint32_t host_cpuid_vendor2;
> + uint32_t host_cpuid_vendor3;
>
> + host_cpuid(0x0, 0, NULL, &host_cpuid_vendor1, &host_cpuid_vendor3,
> + &host_cpuid_vendor2);
> +
> + x86_cpu_vendor_words2str(host_vendor, host_cpuid_vendor1,
> + host_cpuid_vendor2, host_cpuid_vendor3);
We can use host_cpu_vendor_fms() (with a little change). If you like
this idea, pls feel free to pick my cleanup patch into your series.
> + /*
> + * Intel and Zhaoxin are compatible.
> + */
> + if ((g_str_equal(host_vendor, CPUID_VENDOR_INTEL) ||
> + g_str_equal(host_vendor, CPUID_VENDOR_ZHAOXIN1) ||
> + g_str_equal(host_vendor, CPUID_VENDOR_ZHAOXIN2)) &&
> + (IS_INTEL_CPU(env) || IS_ZHAOXIN_CPU(env))) {
> + return true;
> + }
> +
> + return env->cpuid_vendor1 == host_cpuid_vendor1 &&
> + env->cpuid_vendor2 == host_cpuid_vendor2 &&
> + env->cpuid_vendor3 == host_cpuid_vendor3;
Checking AMD directly makes the "compat" rule clear:
return g_str_equal(host_vendor, CPUID_VENDOR_AMD) &&
IS_AMD_CPU(env);
> +}
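Roughly, putting the two suggestions together (just a sketch; it also assumes
host_cpu_vendor_fms() is first adjusted to accept NULL for the family, model
and stepping out-parameters):

    static bool is_host_compat_vendor(CPUX86State *env)
    {
        char host_vendor[CPUID_VENDOR_SZ + 1];

        host_cpu_vendor_fms(host_vendor, NULL, NULL, NULL);

        /* Intel and Zhaoxin are compatible. */
        if ((g_str_equal(host_vendor, CPUID_VENDOR_INTEL) ||
             g_str_equal(host_vendor, CPUID_VENDOR_ZHAOXIN1) ||
             g_str_equal(host_vendor, CPUID_VENDOR_ZHAOXIN2)) &&
            (IS_INTEL_CPU(env) || IS_ZHAOXIN_CPU(env))) {
            return true;
        }

        /* Otherwise only an AMD guest on an AMD host is compatible. */
        return g_str_equal(host_vendor, CPUID_VENDOR_AMD) && IS_AMD_CPU(env);
    }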
...
> if (env->mcg_cap) {
> kvm_msr_entry_add(cpu, MSR_MCG_STATUS, 0);
> kvm_msr_entry_add(cpu, MSR_MCG_CTL, 0);
> @@ -4871,6 +5024,21 @@ static int kvm_get_msrs(X86CPU *cpu)
> case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL0 + MAX_GP_COUNTERS - 1:
> env->msr_gp_evtsel[index - MSR_P6_EVNTSEL0] = msrs[i].data;
> break;
> + case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL0 + AMD64_NUM_COUNTERS - 1:
> + env->msr_gp_evtsel[index - MSR_K7_EVNTSEL0] = msrs[i].data;
> + break;
> + case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR0 + AMD64_NUM_COUNTERS - 1:
> + env->msr_gp_counters[index - MSR_K7_PERFCTR0] = msrs[i].data;
> + break;
> + case MSR_F15H_PERF_CTL0 ...
> + MSR_F15H_PERF_CTL0 + AMD64_NUM_COUNTERS_CORE * 2 - 1:
> + index = index - MSR_F15H_PERF_CTL0;
> + if (index & 0x1) {
> + env->msr_gp_counters[index] = msrs[i].data;
> + } else {
> + env->msr_gp_evtsel[index] = msrs[i].data;
This msr_gp_evtsel[] array's size is 18:
#define MAX_GP_COUNTERS (MSR_IA32_PERF_STATUS - MSR_P6_EVNTSEL0)
This formula is based on Intel's MSRs, so it's best to add a note that the
current size also meets AMD's needs. (No need to adjust the size, as
changing it would affect migration.)
> + }
> + break;
> case HV_X64_MSR_HYPERCALL:
> env->msr_hv_hypercall = msrs[i].data;
> break;
Others LGTM!
Thanks,
Zhao
Hi Zhao,
On 4/10/25 12:43 AM, Zhao Liu wrote:
> ...
>
>> TODO:
>> - This patch adds is_host_compat_vendor(), while there are something
>> like is_host_cpu_intel() from target/i386/kvm/vmsr_energy.c. A rework
>> may help move those helpers to target/i386/cpu*.
>
> vmsr_energy emulates RAPL in user space...but RAPL is not architectural
> (no CPUID), so this case doesn't need to consider "compat" vendor.
>
>> target/i386/cpu.h | 8 ++
>> target/i386/kvm/kvm.c | 176 +++++++++++++++++++++++++++++++++++++++++-
>> 2 files changed, 180 insertions(+), 4 deletions(-)
>
> ...
>
>> +static bool is_host_compat_vendor(CPUX86State *env)
>> +{
>> + char host_vendor[CPUID_VENDOR_SZ + 1];
>> + uint32_t host_cpuid_vendor1;
>> + uint32_t host_cpuid_vendor2;
>> + uint32_t host_cpuid_vendor3;
>>
>> + host_cpuid(0x0, 0, NULL, &host_cpuid_vendor1, &host_cpuid_vendor3,
>> + &host_cpuid_vendor2);
>> +
>> + x86_cpu_vendor_words2str(host_vendor, host_cpuid_vendor1,
>> + host_cpuid_vendor2, host_cpuid_vendor3);
>
> We can use host_cpu_vendor_fms() (with a little change). If you like
> this idea, pls feel free to pick my cleanup patch into your series.
Sure. I will try to use host_cpu_vendor_fms().
>
>> + /*
>> + * Intel and Zhaoxin are compatible.
>> + */
>> + if ((g_str_equal(host_vendor, CPUID_VENDOR_INTEL) ||
>> + g_str_equal(host_vendor, CPUID_VENDOR_ZHAOXIN1) ||
>> + g_str_equal(host_vendor, CPUID_VENDOR_ZHAOXIN2)) &&
>> + (IS_INTEL_CPU(env) || IS_ZHAOXIN_CPU(env))) {
>> + return true;
>> + }
>> +
>> + return env->cpuid_vendor1 == host_cpuid_vendor1 &&
>> + env->cpuid_vendor2 == host_cpuid_vendor2 &&
>> + env->cpuid_vendor3 == host_cpuid_vendor3;
>
> Checking AMD directly makes the "compat" rule clear:
>
> return g_str_equal(host_vendor, CPUID_VENDOR_AMD) &&
> IS_AMD_CPU(env);
Sure.
>
>> +}
>
> ...
>
>> if (env->mcg_cap) {
>> kvm_msr_entry_add(cpu, MSR_MCG_STATUS, 0);
>> kvm_msr_entry_add(cpu, MSR_MCG_CTL, 0);
>> @@ -4871,6 +5024,21 @@ static int kvm_get_msrs(X86CPU *cpu)
>> case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL0 + MAX_GP_COUNTERS - 1:
>> env->msr_gp_evtsel[index - MSR_P6_EVNTSEL0] = msrs[i].data;
>> break;
>> + case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL0 + AMD64_NUM_COUNTERS - 1:
>> + env->msr_gp_evtsel[index - MSR_K7_EVNTSEL0] = msrs[i].data;
>> + break;
>> + case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR0 + AMD64_NUM_COUNTERS - 1:
>> + env->msr_gp_counters[index - MSR_K7_PERFCTR0] = msrs[i].data;
>> + break;
>> + case MSR_F15H_PERF_CTL0 ...
>> + MSR_F15H_PERF_CTL0 + AMD64_NUM_COUNTERS_CORE * 2 - 1:
>> + index = index - MSR_F15H_PERF_CTL0;
>> + if (index & 0x1) {
>> + env->msr_gp_counters[index] = msrs[i].data;
>> + } else {
>> + env->msr_gp_evtsel[index] = msrs[i].data;
>
> This msr_gp_evtsel[] array's size is 18:
>
> #define MAX_GP_COUNTERS (MSR_IA32_PERF_STATUS - MSR_P6_EVNTSEL0)
>
> This formula is based on Intel's MSR, it's best to add a note that the
> current size also meets AMD's needs. (No need to adjust the size, as
> it will affect migration).
I will add a comment to target/i386/cpu.h, above the definition of MAX_GP_COUNTERS.
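Something along these lines, for example (the wording is only a sketch):

    /*
     * The size is derived from Intel's MSR layout:
     * MSR_IA32_PERF_STATUS (0x198) - MSR_P6_EVNTSEL0 (0x186) = 18 entries.
     * This is also large enough for AMD, which uses at most
     * AMD64_NUM_COUNTERS_CORE (6) general-purpose counters and event
     * selectors. Don't change the size, as the arrays are part of the
     * migration stream.
     */
    #define MAX_GP_COUNTERS    (MSR_IA32_PERF_STATUS - MSR_P6_EVNTSEL0)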
Thank you very much!
Dongli Zhang