[PATCH v5 28/44] KVM: x86/pmu: Load/save GLOBAL_CTRL via entry/exit fields for mediated PMU

Posted by Sean Christopherson 6 months ago
From: Dapeng Mi <dapeng1.mi@linux.intel.com>

When running a guest with a mediated PMU, context switch PERF_GLOBAL_CTRL
via the dedicated VMCS fields for both host and guest.  For the host,
always zero GLOBAL_CTRL on exit as the guest's state will still be loaded
in hardware (KVM will context switch the bulk of PMU state outside of the
inner run loop).  For the guest, use the dedicated fields to atomically
load and save PERF_GLOBAL_CTRL on all entry/exits.
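
Condensed, the plumbing amounts to the following sketch (distilled from the
diff below, not additional code; a mediated-PMU vCPU is assumed and error
handling is omitted):

	/* Load guest PERF_GLOBAL_CTRL on entry; load the host value and
	 * save the guest value on exit, all via dedicated VMCS controls. */
	vm_entry_controls_changebit(vmx, VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
				    is_mediated_pmu);
	vm_exit_controls_changebit(vmx, VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL |
					VM_EXIT_SAVE_IA32_PERF_GLOBAL_CTRL,
				   is_mediated_pmu);

	/* The "host" value is a constant zero: keep all counters disabled
	 * after exit until KVM context switches the rest of the PMU state. */
	vmcs_write64(HOST_IA32_PERF_GLOBAL_CTRL, 0);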

Note, VM_EXIT_SAVE_IA32_PERF_GLOBAL_CTRL was introduced by Sapphire
Rapids, and is expected to be supported on all CPUs with PMU v4+.  WARN if
that expectation is not met.  Alternatively, KVM could manually save
PERF_GLOBAL_CTRL via the MSR save list, but the associated complexity and
runtime overhead is unjustified given that the feature should always be
available on relevant CPUs.

To minimize VM-Entry latency, propagate IA32_PERF_GLOBAL_CTRL to the VMCS
on-demand.  But to minimize complexity, read IA32_PERF_GLOBAL_CTRL out of
the VMCS on all non-failing VM-Exits.  I.e. partially cache the MSR.
KVM could track GLOBAL_CTRL as an EXREG and defer all reads, but writes
are rare, i.e. the dirty tracking for an EXREG is unnecessary, and it's
not obvious that shaving ~15-20 cycles per exit is meaningful given the
total overhead associated with mediated PMU context switches.
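
In code, the asymmetric caching boils down to (again condensed from the
hunks below):

	/* Guest WRMSR to PERF_GLOBAL_CTRL: propagate to the VMCS on demand. */
	if (kvm_vcpu_has_mediated_pmu(vcpu))
		kvm_pmu_call(write_global_ctrl)(data);	/* vmcs_write64() */

	/* After every non-failing VM-Exit: refresh the cached value. */
	if (!msr_write_intercepted(vmx, MSR_CORE_PERF_GLOBAL_CTRL))
		vcpu_to_pmu(vcpu)->global_ctrl =
			vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL);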

Suggested-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
Co-developed-by: Mingwei Zhang <mizhang@google.com>
Signed-off-by: Mingwei Zhang <mizhang@google.com>
Co-developed-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/include/asm/kvm-x86-pmu-ops.h |  2 ++
 arch/x86/include/asm/vmx.h             |  1 +
 arch/x86/kvm/pmu.c                     | 13 +++++++++--
 arch/x86/kvm/pmu.h                     |  3 ++-
 arch/x86/kvm/vmx/capabilities.h        |  5 +++++
 arch/x86/kvm/vmx/pmu_intel.c           | 19 +++++++++++++++-
 arch/x86/kvm/vmx/vmx.c                 | 31 +++++++++++++++++++++++++-
 arch/x86/kvm/vmx/vmx.h                 |  3 ++-
 8 files changed, 71 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/kvm-x86-pmu-ops.h b/arch/x86/include/asm/kvm-x86-pmu-ops.h
index 9159bf1a4730..ad2cc82abf79 100644
--- a/arch/x86/include/asm/kvm-x86-pmu-ops.h
+++ b/arch/x86/include/asm/kvm-x86-pmu-ops.h
@@ -23,5 +23,7 @@ KVM_X86_PMU_OP_OPTIONAL(reset)
 KVM_X86_PMU_OP_OPTIONAL(deliver_pmi)
 KVM_X86_PMU_OP_OPTIONAL(cleanup)
 
+KVM_X86_PMU_OP_OPTIONAL(write_global_ctrl)
+
 #undef KVM_X86_PMU_OP
 #undef KVM_X86_PMU_OP_OPTIONAL
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index cca7d6641287..af71666c3a37 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -106,6 +106,7 @@
 #define VM_EXIT_CLEAR_BNDCFGS                   0x00800000
 #define VM_EXIT_PT_CONCEAL_PIP			0x01000000
 #define VM_EXIT_CLEAR_IA32_RTIT_CTL		0x02000000
+#define VM_EXIT_SAVE_IA32_PERF_GLOBAL_CTRL	0x40000000
 
 #define VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR	0x00036dff
 
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 674f42d083a9..a4fe0e76df79 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -103,7 +103,7 @@ void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops)
 #undef __KVM_X86_PMU_OP
 }
 
-void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops)
+void kvm_init_pmu_capability(struct kvm_pmu_ops *pmu_ops)
 {
 	bool is_intel = boot_cpu_data.x86_vendor == X86_VENDOR_INTEL;
 	int min_nr_gp_ctrs = pmu_ops->MIN_NR_GP_COUNTERS;
@@ -137,6 +137,9 @@ void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops)
 	    !pmu_ops->is_mediated_pmu_supported(&kvm_host_pmu))
 		enable_mediated_pmu = false;
 
+	if (!enable_mediated_pmu)
+		pmu_ops->write_global_ctrl = NULL;
+
 	if (!enable_pmu) {
 		memset(&kvm_pmu_cap, 0, sizeof(kvm_pmu_cap));
 		return;
@@ -831,6 +834,9 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 			diff = pmu->global_ctrl ^ data;
 			pmu->global_ctrl = data;
 			reprogram_counters(pmu, diff);
+
+			if (kvm_vcpu_has_mediated_pmu(vcpu))
+				kvm_pmu_call(write_global_ctrl)(data);
 		}
 		break;
 	case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
@@ -921,8 +927,11 @@ void kvm_pmu_refresh(struct kvm_vcpu *vcpu)
 	 * in the global controls).  Emulate that behavior when refreshing the
 	 * PMU so that userspace doesn't need to manually set PERF_GLOBAL_CTRL.
 	 */
-	if (kvm_pmu_has_perf_global_ctrl(pmu) && pmu->nr_arch_gp_counters)
+	if (kvm_pmu_has_perf_global_ctrl(pmu) && pmu->nr_arch_gp_counters) {
 		pmu->global_ctrl = GENMASK_ULL(pmu->nr_arch_gp_counters - 1, 0);
+		if (kvm_vcpu_has_mediated_pmu(vcpu))
+			kvm_pmu_call(write_global_ctrl)(pmu->global_ctrl);
+	}
 }
 
 void kvm_pmu_init(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
index 6b95e81c078c..dcf4e2253875 100644
--- a/arch/x86/kvm/pmu.h
+++ b/arch/x86/kvm/pmu.h
@@ -38,6 +38,7 @@ struct kvm_pmu_ops {
 	void (*cleanup)(struct kvm_vcpu *vcpu);
 
 	bool (*is_mediated_pmu_supported)(struct x86_pmu_capability *host_pmu);
+	void (*write_global_ctrl)(u64 global_ctrl);
 
 	const u64 EVENTSEL_EVENT;
 	const int MAX_NR_GP_COUNTERS;
@@ -183,7 +184,7 @@ static inline bool pmc_is_locally_enabled(struct kvm_pmc *pmc)
 
 extern struct x86_pmu_capability kvm_pmu_cap;
 
-void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops);
+void kvm_init_pmu_capability(struct kvm_pmu_ops *pmu_ops);
 
 void kvm_pmu_recalc_pmc_emulation(struct kvm_pmu *pmu, struct kvm_pmc *pmc);
 
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
index 26ff606ff139..874c6dd34665 100644
--- a/arch/x86/kvm/vmx/capabilities.h
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -100,6 +100,11 @@ static inline bool cpu_has_load_perf_global_ctrl(void)
 	return vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
 }
 
+static inline bool cpu_has_save_perf_global_ctrl(void)
+{
+	return vmcs_config.vmexit_ctrl & VM_EXIT_SAVE_IA32_PERF_GLOBAL_CTRL;
+}
+
 static inline bool cpu_has_vmx_mpx(void)
 {
 	return vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS;
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index 7ab35ef4a3b1..98f7b45ea391 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -787,7 +787,23 @@ static bool intel_pmu_is_mediated_pmu_supported(struct x86_pmu_capability *host_
 	 * Require v4+ for MSR_CORE_PERF_GLOBAL_STATUS_SET, and full-width
 	 * writes so that KVM can precisely load guest counter values.
 	 */
-	return host_pmu->version >= 4 && host_perf_cap & PERF_CAP_FW_WRITES;
+	if (host_pmu->version < 4 || !(host_perf_cap & PERF_CAP_FW_WRITES))
+		return false;
+
+	/*
+	 * All CPUs that support a mediated PMU are expected to support loading
+	 * and saving PERF_GLOBAL_CTRL via dedicated VMCS fields.
+	 */
+	if (WARN_ON_ONCE(!cpu_has_load_perf_global_ctrl() ||
+			 !cpu_has_save_perf_global_ctrl()))
+		return false;
+
+	return true;
+}
+
+static void intel_pmu_write_global_ctrl(u64 global_ctrl)
+{
+	vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL, global_ctrl);
 }
 
 struct kvm_pmu_ops intel_pmu_ops __initdata = {
@@ -803,6 +819,7 @@ struct kvm_pmu_ops intel_pmu_ops __initdata = {
 	.cleanup = intel_pmu_cleanup,
 
 	.is_mediated_pmu_supported = intel_pmu_is_mediated_pmu_supported,
+	.write_global_ctrl = intel_pmu_write_global_ctrl,
 
 	.EVENTSEL_EVENT = ARCH_PERFMON_EVENTSEL_EVENT,
 	.MAX_NR_GP_COUNTERS = KVM_MAX_NR_INTEL_GP_COUNTERS,
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 2f7db32710e3..1233a0afb31e 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -4115,6 +4115,18 @@ static void vmx_recalc_msr_intercepts(struct kvm_vcpu *vcpu)
 		vmx_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W,
 					  !guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D));
 
+	if (enable_mediated_pmu) {
+		bool is_mediated_pmu = kvm_vcpu_has_mediated_pmu(vcpu);
+		struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+		vm_entry_controls_changebit(vmx,
+					    VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, is_mediated_pmu);
+
+		vm_exit_controls_changebit(vmx,
+					   VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL |
+					   VM_EXIT_SAVE_IA32_PERF_GLOBAL_CTRL, is_mediated_pmu);
+	}
+
 	/*
 	 * x2APIC and LBR MSR intercepts are modified on-demand and cannot be
 	 * filtered by userspace.
@@ -4282,6 +4294,16 @@ void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
 
 	if (cpu_has_load_ia32_efer())
 		vmcs_write64(HOST_IA32_EFER, kvm_host.efer);
+
+	/*
+	 * When running a guest with a mediated PMU, guest state is resident in
+	 * hardware after VM-Exit.  Zero PERF_GLOBAL_CTRL on exit so that host
+	 * activity doesn't bleed into the guest counters.  When running with
+	 * an emulated PMU, PERF_GLOBAL_CTRL is dynamically computed on every
+	 * entry/exit to merge guest and host PMU usage.
+	 */
+	if (enable_mediated_pmu)
+		vmcs_write64(HOST_IA32_PERF_GLOBAL_CTRL, 0);
 }
 
 void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
@@ -4349,7 +4371,8 @@ static u32 vmx_get_initial_vmexit_ctrl(void)
 				 VM_EXIT_CLEAR_IA32_RTIT_CTL);
 	/* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */
 	return vmexit_ctrl &
-		~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER);
+		~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER |
+		  VM_EXIT_SAVE_IA32_PERF_GLOBAL_CTRL);
 }
 
 void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
@@ -7087,6 +7110,9 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
 	struct perf_guest_switch_msr *msrs;
 	struct kvm_pmu *pmu = vcpu_to_pmu(&vmx->vcpu);
 
+	if (kvm_vcpu_has_mediated_pmu(&vmx->vcpu))
+		return;
+
 	pmu->host_cross_mapped_mask = 0;
 	if (pmu->pebs_enable & pmu->global_ctrl)
 		intel_pmu_cross_mapped_check(pmu);
@@ -7407,6 +7433,9 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
 
 	vmx->loaded_vmcs->launched = 1;
 
+	if (!msr_write_intercepted(vmx, MSR_CORE_PERF_GLOBAL_CTRL))
+		vcpu_to_pmu(vcpu)->global_ctrl = vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL);
+
 	vmx_recover_nmi_blocking(vmx);
 	vmx_complete_interrupts(vmx);
 
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index a4e5bcd1d023..7eb57f5cb975 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -506,7 +506,8 @@ static inline u8 vmx_get_rvi(void)
 	       VM_EXIT_LOAD_IA32_EFER |					\
 	       VM_EXIT_CLEAR_BNDCFGS |					\
 	       VM_EXIT_PT_CONCEAL_PIP |					\
-	       VM_EXIT_CLEAR_IA32_RTIT_CTL)
+	       VM_EXIT_CLEAR_IA32_RTIT_CTL |				\
+	       VM_EXIT_SAVE_IA32_PERF_GLOBAL_CTRL)
 
 #define KVM_REQUIRED_VMX_PIN_BASED_VM_EXEC_CONTROL			\
 	(PIN_BASED_EXT_INTR_MASK |					\
-- 
2.50.1.565.gc32cd1483b-goog
Re: [PATCH v5 28/44] KVM: x86/pmu: Load/save GLOBAL_CTRL via entry/exit fields for mediated PMU
Posted by Sean Christopherson 2 months, 2 weeks ago
On Wed, Aug 06, 2025, Sean Christopherson wrote:
> From: Dapeng Mi <dapeng1.mi@linux.intel.com>
> 
> When running a guest with a mediated PMU, context switch PERF_GLOBAL_CTRL
> via the dedicated VMCS fields for both host and guest.  For the host,
> always zero GLOBAL_CTRL on exit as the guest's state will still be loaded
> in hardware (KVM will context switch the bulk of PMU state outside of the
> inner run loop).  For the guest, use the dedicated fields to atomically
> load and save PERF_GLOBAL_CTRL on all entry/exits.
> 
> Note, VM_EXIT_SAVE_IA32_PERF_GLOBAL_CTRL was introduced by Sapphire
> Rapids, and is expected to be supported on all CPUs with PMU v4+.  WARN if
> that expectation is not met.  Alternatively, KVM could manually save
> PERF_GLOBAL_CTRL via the MSR save list, but the associated complexity and
> runtime overhead is unjustified given that the feature should always be
> available on relevant CPUs.

This is wrong: PMU v4 has been supported since Skylake.

> diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
> index 7ab35ef4a3b1..98f7b45ea391 100644
> --- a/arch/x86/kvm/vmx/pmu_intel.c
> +++ b/arch/x86/kvm/vmx/pmu_intel.c
> @@ -787,7 +787,23 @@ static bool intel_pmu_is_mediated_pmu_supported(struct x86_pmu_capability *host_
>  	 * Require v4+ for MSR_CORE_PERF_GLOBAL_STATUS_SET, and full-width
>  	 * writes so that KVM can precisely load guest counter values.
>  	 */
> -	return host_pmu->version >= 4 && host_perf_cap & PERF_CAP_FW_WRITES;
> +	if (host_pmu->version < 4 || !(host_perf_cap & PERF_CAP_FW_WRITES))
> +		return false;
> +
> +	/*
> +	 * All CPUs that support a mediated PMU are expected to support loading
> +	 * and saving PERF_GLOBAL_CTRL via dedicated VMCS fields.
> +	 */
> +	if (WARN_ON_ONCE(!cpu_has_load_perf_global_ctrl() ||
> +			 !cpu_has_save_perf_global_ctrl()))
> +		return false;

And so this WARN fires due to cpu_has_save_perf_global_ctrl() being false.  The
bad changelog is mine, but the code isn't entirely my fault.  I did suggest the
WARN in v3[1], probably because I forgot when PMU v4 was introduced and no one
corrected me.

v4 of the series[2] then made cpu_has_save_perf_global_ctrl() a hard requirement,
based on my misguided feedback.

   * Only support GLOBAL_CTRL save/restore with VMCS exec_ctrl, drop the MSR
     save/restore list support for GLOBAL_CTRL, thus the support of mediated
     vPMU is constrained to SapphireRapids and later CPUs on Intel side.

Doubly frustrating is that this was discussed in the original RFC, where Jim
pointed out[3] that requiring VM_EXIT_SAVE_IA32_PERF_GLOBAL_CTRL would prevent
enabling the mediated PMU on Skylake+, and I completely forgot that conversation
by the time v3 of the series rolled around :-(

As mentioned in the discussion with Jim, _if_ PMU v4 was introduced with ICX (or
later), then I'd be in favor of making VM_EXIT_SAVE_IA32_PERF_GLOBAL_CTRL a hard
requirement.  But losing support for Skylake+ is a bit much.

There are a few warts with nVMX's use of the auto-store list that need to be
cleaned up, but on the plus side it's also a good excuse to clean up
{add,clear}_atomic_switch_msr(), which have accumulated some cruft and quite a
bit of duplicate code.  And while I still dislike using the auto-store list, the
code isn't as ugly as it was back in v3 because we _can_ make the "load" VMCS
controls mandatory without losing support for any CPUs (they predate PMU v4).
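
For reference, a rough sketch of what the auto-store fallback could look
like (hypothetical code, not the WIP mentioned above; it reuses the existing
msr_autostore machinery and skips the MAX_NR_LOADSTORE_MSRS bounds check):

	/*
	 * Hypothetical: on CPUs without VM_EXIT_SAVE_IA32_PERF_GLOBAL_CTRL
	 * (Skylake/Icelake), have hardware save guest PERF_GLOBAL_CTRL via
	 * the VM-exit MSR-store area instead of the dedicated VMCS field.
	 */
	static void vmx_setup_perf_global_ctrl_autostore(struct vcpu_vmx *vmx)
	{
		struct vmx_msrs *autostore = &vmx->msr_autostore.guest;
		int slot = vmx_find_loadstore_msr_slot(autostore,
						       MSR_CORE_PERF_GLOBAL_CTRL);

		if (slot < 0) {
			slot = autostore->nr++;
			autostore->val[slot].index = MSR_CORE_PERF_GLOBAL_CTRL;
			vmcs_write32(VM_EXIT_MSR_STORE_COUNT, autostore->nr);
		}
	}

	/* On exit, KVM would then read the saved value out of
	 * vmx->msr_autostore.guest.val[slot].value instead of
	 * GUEST_IA32_PERF_GLOBAL_CTRL. */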

[1] https://lore.kernel.org/all/ZzyWKTMdNi5YjvEM@google.com
[2] https://lore.kernel.org/all/20250324173121.1275209-1-mizhang@google.com
[3] https://lore.kernel.org/all/CALMp9eQ+-wcj8QMmFR07zvxFF22-bWwQgV-PZvD04ruQ=0NBBA@mail.gmail.com
Re: [PATCH v5 28/44] KVM: x86/pmu: Load/save GLOBAL_CTRL via entry/exit fields for mediated PMU
Posted by Mi, Dapeng 2 months, 2 weeks ago
On 11/25/2025 9:48 AM, Sean Christopherson wrote:
> On Wed, Aug 06, 2025, Sean Christopherson wrote:
>> From: Dapeng Mi <dapeng1.mi@linux.intel.com>
>>
>> When running a guest with a mediated PMU, context switch PERF_GLOBAL_CTRL
>> via the dedicated VMCS fields for both host and guest.  For the host,
>> always zero GLOBAL_CTRL on exit as the guest's state will still be loaded
>> in hardware (KVM will context switch the bulk of PMU state outside of the
>> inner run loop).  For the guest, use the dedicated fields to atomically
>> load and save PERF_GLOBAL_CTRL on all entry/exits.
>>
>> Note, VM_EXIT_SAVE_IA32_PERF_GLOBAL_CTRL was introduced by Sapphire
>> Rapids, and is expected to be supported on all CPUs with PMU v4+.  WARN if
>> that expectation is not met.  Alternatively, KVM could manually save
>> PERF_GLOBAL_CTRL via the MSR save list, but the associated complexity and
>> runtime overhead is unjustified given that the feature should always be
>> available on relevant CPUs.
> This is wrong: PMU v4 has been supported since Skylake.

Yes, the v4+ restriction is there to guarantee the existence of the
IA32_PERF_GLOBAL_STATUS_SET MSR, which is needed to restore the guest's
PERF_GLOBAL_STATUS.
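
For illustration, a minimal sketch of the restore step that motivates the
v4+ check (hypothetical helper; only the MSR names are taken from
<asm/msr-index.h>):

	/*
	 * PERF_GLOBAL_STATUS itself is read-only, and GLOBAL_OVF_CTRL (a.k.a.
	 * GLOBAL_STATUS_RESET) can only clear bits, so re-injecting the
	 * guest's pending overflow bits requires the v4-only _SET MSR.
	 */
	static void restore_guest_global_status(struct kvm_pmu *pmu)
	{
		u64 status;

		/* Clear whatever overflow bits the host left behind... */
		rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
		wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, status);

		/* ...then set exactly the bits the guest had pending. */
		wrmsrl(MSR_CORE_PERF_GLOBAL_STATUS_SET, pmu->global_status);
	}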


>
>> diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
>> index 7ab35ef4a3b1..98f7b45ea391 100644
>> --- a/arch/x86/kvm/vmx/pmu_intel.c
>> +++ b/arch/x86/kvm/vmx/pmu_intel.c
>> @@ -787,7 +787,23 @@ static bool intel_pmu_is_mediated_pmu_supported(struct x86_pmu_capability *host_
>>  	 * Require v4+ for MSR_CORE_PERF_GLOBAL_STATUS_SET, and full-width
>>  	 * writes so that KVM can precisely load guest counter values.
>>  	 */
>> -	return host_pmu->version >= 4 && host_perf_cap & PERF_CAP_FW_WRITES;
>> +	if (host_pmu->version < 4 || !(host_perf_cap & PERF_CAP_FW_WRITES))
>> +		return false;
>> +
>> +	/*
>> +	 * All CPUs that support a mediated PMU are expected to support loading
>> +	 * and saving PERF_GLOBAL_CTRL via dedicated VMCS fields.
>> +	 */
>> +	if (WARN_ON_ONCE(!cpu_has_load_perf_global_ctrl() ||
>> +			 !cpu_has_save_perf_global_ctrl()))
>> +		return false;
> And so this WARN fires due to cpu_has_save_perf_global_ctrl() being false.  The
> bad changelog is mine, but the code isn't entirely my fault.  I did suggest the
> WARN in v3[1], probably because I forgot when PMU v4 was introduced and no one
> corrected me.
>
> v4 of the series[2] then made cpu_has_save_perf_global_ctrl() a hard requirement,
> based on my misguided feedback.
>
>    * Only support GLOBAL_CTRL save/restore with VMCS exec_ctrl, drop the MSR
>      save/restore list support for GLOBAL_CTRL, thus the support of mediated
>      vPMU is constrained to SapphireRapids and later CPUs on Intel side.
>
> Doubly frustrating is that this was discussed in the original RFC, where Jim
> pointed out[3] that requiring VM_EXIT_SAVE_IA32_PERF_GLOBAL_CTRL would prevent
> enabling the mediated PMU on Skylake+, and I completely forgot that conversation
> by the time v3 of the series rolled around :-(

VM_EXIT_SAVE_IA32_PERF_GLOBAL_CTRL was only introduced with SPR and later
CPUs. I remember the original requirements included supporting Skylake and
Icelake, but I thought there had been some offline sync and that the
requirement had changed...

My bad, I should have double-checked this at the time.


>
> As mentioned in the discussion with Jim, _if_ PMU v4 was introduced with ICX (or
> later), then I'd be in favor of making VM_EXIT_SAVE_IA32_PERF_GLOBAL_CTRL a hard
> requirement.  But losing support for Skylake+ is a bit much.
>
> There are a few warts with nVMX's use of the auto-store list that need to be
> cleaned up, but on the plus side it's also a good excuse to clean up
> {add,clear}_atomic_switch_msr(), which have accumulated some cruft and quite a
> bit of duplicate code.  And while I still dislike using the auto-store list, the
> code isn't as ugly as it was back in v3 because we _can_ make the "load" VMCS
> controls mandatory without losing support for any CPUs (they predate PMU v4).

Yes, the xxx_atomic_switch_msr() helpers need to be cleaned up and optimized.
I suppose we could have an independent patch set to do the cleanup and to
support global_ctrl via the auto-store list for Skylake and Icelake.


>
> [1] https://lore.kernel.org/all/ZzyWKTMdNi5YjvEM@google.com
> [2] https://lore.kernel.org/all/20250324173121.1275209-1-mizhang@google.com
> [3] https://lore.kernel.org/all/CALMp9eQ+-wcj8QMmFR07zvxFF22-bWwQgV-PZvD04ruQ=0NBBA@mail.gmail.com
Re: [PATCH v5 28/44] KVM: x86/pmu: Load/save GLOBAL_CTRL via entry/exit fields for mediated PMU
Posted by Sean Christopherson 2 months, 2 weeks ago
On Tue, Nov 25, 2025, Dapeng Mi wrote:
> On 11/25/2025 9:48 AM, Sean Christopherson wrote:
> >> +	if (host_pmu->version < 4 || !(host_perf_cap & PERF_CAP_FW_WRITES))
> >> +		return false;
> >> +
> >> +	/*
> >> +	 * All CPUs that support a mediated PMU are expected to support loading
> >> +	 * and saving PERF_GLOBAL_CTRL via dedicated VMCS fields.
> >> +	 */
> >> +	if (WARN_ON_ONCE(!cpu_has_load_perf_global_ctrl() ||
> >> +			 !cpu_has_save_perf_global_ctrl()))
> >> +		return false;
> > And so this WARN fires due to cpu_has_save_perf_global_ctrl() being false.  The
> > bad changelog is mine, but the code isn't entirely my fault.  I did suggest the
> > WARN in v3[1], probably because I forgot when PMU v4 was introduced and no one
> > corrected me.
> >
> > v4 of the series[2] then made cpu_has_save_perf_global_ctrl() a hard requirement,
> > based on my misguided feedback.
> >
> >    * Only support GLOBAL_CTRL save/restore with VMCS exec_ctrl, drop the MSR
> >      save/restore list support for GLOBAL_CTRL, thus the support of mediated
> >      vPMU is constrained to SapphireRapids and later CPUs on Intel side.
> >
> > Doubly frustrating is that this was discussed in the original RFC, where Jim
> > pointed out[3] that requiring VM_EXIT_SAVE_IA32_PERF_GLOBAL_CTRL would prevent
> > enabling the mediated PMU on Skylake+, and I completely forgot that conversation
> > by the time v3 of the series rolled around :-(
> 
> VM_EXIT_SAVE_IA32_PERF_GLOBAL_CTRL was only introduced with SPR and later
> CPUs. I remember the original requirements included supporting Skylake and
> Icelake, but I thought there had been some offline sync and that the
> requirement had changed...

Two things:

 1) Upstream's "requirements" are not the same as Google's requirements (or those
    of any company/individual).  Upstream most definitely is influenced by the
    needs and desires of end users, but ultimately the decision to do something
    (or not) is one that needs to be made by the upstream community.

 2) Decisions made off-list need to be summarized and communicated on-list,
    especially in cases like this where it's a relatively minor detail in a
    large series/feature, and thus easy to overlook.

I'll follow-up internally to make sure these points are well-understood by Google
folks as well (at least, those working on KVM).

> My bad,

Eh, this was a group "effort".  I'm as much to blame as anyone else.

> I should have double-checked this at the time.

No need, as above, Google's requirements (assuming the requirements you're referring
to are coming from Google people) are effectively just one data point.  At this
point, I want to drive the decision to support Sylake+ (or not) purely through
discussion of upstream patches.

> > As mentioned in the discussion with Jim, _if_ PMU v4 was introduced with ICX (or
> > later), then I'd be in favor of making VM_EXIT_SAVE_IA32_PERF_GLOBAL_CTRL a hard
> > requirement.  But losing support for Skylake+ is a bit much.
> >
> > There are a few warts with nVMX's use of the auto-store list that need to be
> > cleaned up, but on the plus side it's also a good excuse to clean up
> > {add,clear}_atomic_switch_msr(), which have accumulated some cruft and quite a
> > bit of duplicate code.  And while I still dislike using the auto-store list, the
> > code isn't as ugly as it was back in v3 because we _can_ make the "load" VMCS
> > controls mandatory without losing support for any CPUs (they predate PMU v4).
> 
> Yes, the xxx_atomic_switch_msr() helpers need to be cleaned up and optimized.
> I suppose we could have an independent patch set to do the cleanup and to
> support global_ctrl via the auto-store list for Skylake and Icelake.

I have the code written (I wanted to see how much complexity it would add before
re-opening this discussion).  My plan is to put the Skylake+ support at the end
of the series, not a separate series, so that it can be reviewed in one shot.
E.g. if we can make a change in the "main" series that would simplify Skylake+
support, then I'd prefer to find and implement any such change right away.
Re: [PATCH v5 28/44] KVM: x86/pmu: Load/save GLOBAL_CTRL via entry/exit fields for mediated PMU
Posted by Mi, Dapeng 2 months, 2 weeks ago
On 11/26/2025 1:08 AM, Sean Christopherson wrote:
> On Tue, Nov 25, 2025, Dapeng Mi wrote:
>> On 11/25/2025 9:48 AM, Sean Christopherson wrote:
>>>> +	if (host_pmu->version < 4 || !(host_perf_cap & PERF_CAP_FW_WRITES))
>>>> +		return false;
>>>> +
>>>> +	/*
>>>> +	 * All CPUs that support a mediated PMU are expected to support loading
>>>> +	 * and saving PERF_GLOBAL_CTRL via dedicated VMCS fields.
>>>> +	 */
>>>> +	if (WARN_ON_ONCE(!cpu_has_load_perf_global_ctrl() ||
>>>> +			 !cpu_has_save_perf_global_ctrl()))
>>>> +		return false;
>>> And so this WARN fires due to cpu_has_save_perf_global_ctrl() being false.  The
>>> bad changelog is mine, but the code isn't entirely my fault.  I did suggest the
>>> WARN in v3[1], probably because I forgot when PMU v4 was introduced and no one
>>> corrected me.
>>>
>>> v4 of the series[2] then made cpu_has_save_perf_global_ctrl() a hard requirement,
>>> based on my misguided feedback.
>>>
>>>    * Only support GLOBAL_CTRL save/restore with VMCS exec_ctrl, drop the MSR
>>>      save/restore list support for GLOBAL_CTRL, thus the support of mediated
>>>      vPMU is constrained to SapphireRapids and later CPUs on Intel side.
>>>
>>> Doubly frustrating is that this was discussed in the original RFC, where Jim
>>> pointed out[3] that requiring VM_EXIT_SAVE_IA32_PERF_GLOBAL_CTRL would prevent
>>> enabling the mediated PMU on Skylake+, and I completely forgot that conversation
>>> by the time v3 of the series rolled around :-(
>> VM_EXIT_SAVE_IA32_PERF_GLOBAL_CTRL was only introduced with SPR and later
>> CPUs. I remember the original requirements included supporting Skylake and
>> Icelake, but I thought there had been some offline sync and that the
>> requirement had changed...
> Two things:
>
>  1) Upstream's "requirements" are not the same as Google's requirements (or those
>     of any company/individual).  Upstream most definitely is influenced by the
>     needs and desires of end users, but ultimately the decision to do something
>     (or not) is one that needs to be made by the upstream community.
>
>  2) Decisions made off-list need to be summarized and communicated on-list,
>     especially in cases like this where it's a relatively minor detail in a
>     large series/feature, and thus easy to overlook.
>
> I'll follow-up internally to make sure these points are well-understood by Google
> folks as well (at least, those working on KVM).

Understood and would follow.


>
>> My bad,
> Eh, this was a group "effort".  I'm as much to blame as anyone else.
>
>> I should have double-checked this at the time.
> No need, as above, Google's requirements (assuming the requirements you're referring
> to are coming from Google people) are effectively just one data point.  At this
> point, I want to drive the decision to support Skylake+ (or not) purely through
> discussion of upstream patches.
>
>>> As mentioned in the discussion with Jim, _if_ PMU v4 was introduced with ICX (or
>>> later), then I'd be in favor of making VM_EXIT_SAVE_IA32_PERF_GLOBAL_CTRL a hard
>>> requirement.  But losing support for Skylake+ is a bit much.
>>>
>>> There are a few warts with nVMX's use of the auto-store list that need to be
>>> cleaned up, but on the plus side it's also a good excuse to clean up
>>> {add,clear}_atomic_switch_msr(), which have accumulated some cruft and quite a
>>> bit of duplicate code.  And while I still dislike using the auto-store list, the
>>> code isn't as ugly as it was back in v3 because we _can_ make the "load" VMCS
>>> controls mandatory without losing support for any CPUs (they predate PMU v4).
>> Yes, the xxx_atomic_switch_msr() helpers need to be cleaned up and optimized.
>> I suppose we could have an independent patch set to do the cleanup and to
>> support global_ctrl via the auto-store list for Skylake and Icelake.
> I have the code written (I wanted to see how much complexity it would add before
> re-opening this discussion).  My plan is to put the Skylake+ support at the end
> of the series, not a separate series, so that it can be reviewed in one shot.
> E.g. if we can make a change in the "main" series that would simplify Skylake+
> support, then I'd prefer to find and implement any such change right away.

Sure. Thanks.