[PATCH v5 17/24] KVM: arm64: Context swap Partitioned PMU guest registers

Posted by Colton Lewis 1 week, 1 day ago
Save and restore newly untrapped registers that can be directly
accessed by the guest when the PMU is partitioned.

* PMEVCNTRn_EL0
* PMCCNTR_EL0
* PMICNTR_EL0
* PMUSERENR_EL0
* PMSELR_EL0
* PMCR_EL0
* PMCNTEN_EL0
* PMINTEN_EL1

If we are not using FGT (that is, we are trapping everything), return
immediately: either the PMU is not partitioned, or it is partitioned
but all register writes are written through the VCPU fields to
hardware, so all values are already fresh.

Since we are taking over context switching, avoid the writes to
PMSELR_EL0 and PMUSERENR_EL0 that would normally occur in
__{,de}activate_traps_common().

Signed-off-by: Colton Lewis <coltonlewis@google.com>
---
 arch/arm64/include/asm/kvm_pmu.h        |   4 +
 arch/arm64/kvm/arm.c                    |   2 +
 arch/arm64/kvm/hyp/include/hyp/switch.h |   4 +-
 arch/arm64/kvm/pmu-direct.c             | 112 ++++++++++++++++++++++++
 4 files changed, 120 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_pmu.h b/arch/arm64/include/asm/kvm_pmu.h
index 8b634112eded2..25a5eb8c623da 100644
--- a/arch/arm64/include/asm/kvm_pmu.h
+++ b/arch/arm64/include/asm/kvm_pmu.h
@@ -103,6 +103,8 @@ void kvm_pmu_host_counters_disable(void);
 
 u8 kvm_pmu_guest_num_counters(struct kvm_vcpu *vcpu);
 u8 kvm_pmu_hpmn(struct kvm_vcpu *vcpu);
+void kvm_pmu_load(struct kvm_vcpu *vcpu);
+void kvm_pmu_put(struct kvm_vcpu *vcpu);
 
 #if !defined(__KVM_NVHE_HYPERVISOR__)
 bool kvm_vcpu_pmu_is_partitioned(struct kvm_vcpu *vcpu);
@@ -184,6 +186,8 @@ static inline u8 kvm_pmu_hpmn(struct kvm_vcpu *vcpu)
 {
 	return 0;
 }
+static inline void kvm_pmu_load(struct kvm_vcpu *vcpu) {}
+static inline void kvm_pmu_put(struct kvm_vcpu *vcpu) {}
 static inline void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu,
 					     u64 select_idx, u64 val) {}
 static inline void kvm_pmu_set_counter_value_user(struct kvm_vcpu *vcpu,
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 43e92f35f56ab..1750df5944f6d 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -629,6 +629,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 		kvm_vcpu_load_vhe(vcpu);
 	kvm_arch_vcpu_load_fp(vcpu);
 	kvm_vcpu_pmu_restore_guest(vcpu);
+	kvm_pmu_load(vcpu);
 	if (kvm_arm_is_pvtime_enabled(&vcpu->arch))
 		kvm_make_request(KVM_REQ_RECORD_STEAL, vcpu);
 
@@ -671,6 +672,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 	kvm_timer_vcpu_put(vcpu);
 	kvm_vgic_put(vcpu);
 	kvm_vcpu_pmu_restore_host(vcpu);
+	kvm_pmu_put(vcpu);
 	if (vcpu_has_nv(vcpu))
 		kvm_vcpu_put_hw_mmu(vcpu);
 	kvm_arm_vmid_clear_active();
diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h
index 40bd00df6c58f..bde79ec1a1836 100644
--- a/arch/arm64/kvm/hyp/include/hyp/switch.h
+++ b/arch/arm64/kvm/hyp/include/hyp/switch.h
@@ -311,7 +311,7 @@ static inline void __activate_traps_common(struct kvm_vcpu *vcpu)
 	 * counter, which could make a PMXEVCNTR_EL0 access UNDEF at
 	 * EL1 instead of being trapped to EL2.
 	 */
-	if (system_supports_pmuv3()) {
+	if (system_supports_pmuv3() && !kvm_vcpu_pmu_is_partitioned(vcpu)) {
 		write_sysreg(0, pmselr_el0);
 
 		ctxt_sys_reg(hctxt, PMUSERENR_EL0) = read_sysreg(pmuserenr_el0);
@@ -340,7 +340,7 @@ static inline void __deactivate_traps_common(struct kvm_vcpu *vcpu)
 	struct kvm_cpu_context *hctxt = host_data_ptr(host_ctxt);
 
 	write_sysreg(0, hstr_el2);
-	if (system_supports_pmuv3()) {
+	if (system_supports_pmuv3() && !kvm_vcpu_pmu_is_partitioned(vcpu)) {
 		write_sysreg(ctxt_sys_reg(hctxt, PMUSERENR_EL0), pmuserenr_el0);
 		vcpu_clear_flag(vcpu, PMUSERENR_ON_CPU);
 	}
diff --git a/arch/arm64/kvm/pmu-direct.c b/arch/arm64/kvm/pmu-direct.c
index 7fb4fb5c22e2a..71977d24f489a 100644
--- a/arch/arm64/kvm/pmu-direct.c
+++ b/arch/arm64/kvm/pmu-direct.c
@@ -9,6 +9,7 @@
 #include <linux/perf/arm_pmuv3.h>
 
 #include <asm/arm_pmuv3.h>
+#include <asm/kvm_emulate.h>
 #include <asm/kvm_pmu.h>
 
 /**
@@ -219,3 +220,114 @@ u8 kvm_pmu_hpmn(struct kvm_vcpu *vcpu)
 
 	return nr_host_cnt_max;
 }
+
+/**
+ * kvm_pmu_load() - Load untrapped PMU registers
+ * @vcpu: Pointer to struct kvm_vcpu
+ *
+ * Load all untrapped PMU registers from the VCPU into the PCPU. Mask
+ * to only bits belonging to guest-reserved counters and leave
+ * host-reserved counters alone in bitmask registers.
+ */
+void kvm_pmu_load(struct kvm_vcpu *vcpu)
+{
+	struct arm_pmu *pmu;
+	u64 mask;
+	u8 i;
+	u64 val;
+
+	/*
+	 * If we aren't using FGT then we are trapping everything
+	 * anyway, so no need to bother with the swap.
+	 */
+	if (!kvm_vcpu_pmu_use_fgt(vcpu))
+		return;
+
+	pmu = vcpu->kvm->arch.arm_pmu;
+
+	for (i = 0; i < pmu->hpmn_max; i++) {
+		val = __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i);
+		write_pmevcntrn(i, val);
+	}
+
+	val = __vcpu_sys_reg(vcpu, PMCCNTR_EL0);
+	write_pmccntr(val);
+
+	val = __vcpu_sys_reg(vcpu, PMUSERENR_EL0);
+	write_pmuserenr(val);
+
+	val = __vcpu_sys_reg(vcpu, PMSELR_EL0);
+	write_pmselr(val);
+
+	/* Save only the stateful writable bits. */
+	val = __vcpu_sys_reg(vcpu, PMCR_EL0);
+	mask = ARMV8_PMU_PMCR_MASK &
+		~(ARMV8_PMU_PMCR_P | ARMV8_PMU_PMCR_C);
+	write_pmcr(val & mask);
+
+	/*
+	 * When handling these:
+	 * 1. Apply only the bits for guest counters (indicated by mask)
+	 * 2. Use the different registers for set and clear
+	 */
+	mask = kvm_pmu_guest_counter_mask(pmu);
+
+	val = __vcpu_sys_reg(vcpu, PMCNTENSET_EL0);
+	write_pmcntenset(val & mask);
+	write_pmcntenclr(~val & mask);
+
+	val = __vcpu_sys_reg(vcpu, PMINTENSET_EL1);
+	write_pmintenset(val & mask);
+	write_pmintenclr(~val & mask);
+}
+
+/**
+ * kvm_pmu_put() - Put untrapped PMU registers
+ * @vcpu: Pointer to struct kvm_vcpu
+ *
+ * Put all untrapped PMU registers from the VCPU into the PCPU. Mask
+ * to only bits belonging to guest-reserved counters and leave
+ * host-reserved counters alone in bitmask registers.
+ */
+void kvm_pmu_put(struct kvm_vcpu *vcpu)
+{
+	struct arm_pmu *pmu;
+	u64 mask;
+	u8 i;
+	u64 val;
+
+	/*
+	 * If we aren't using FGT then we are trapping everything
+	 * anyway, so no need to bother with the swap.
+	 */
+	if (!kvm_vcpu_pmu_use_fgt(vcpu))
+		return;
+
+	pmu = vcpu->kvm->arch.arm_pmu;
+
+	for (i = 0; i < pmu->hpmn_max; i++) {
+		val = read_pmevcntrn(i);
+		__vcpu_assign_sys_reg(vcpu, PMEVCNTR0_EL0 + i, val);
+	}
+
+	val = read_pmccntr();
+	__vcpu_assign_sys_reg(vcpu, PMCCNTR_EL0, val);
+
+	val = read_pmuserenr();
+	__vcpu_assign_sys_reg(vcpu, PMUSERENR_EL0, val);
+
+	val = read_pmselr();
+	__vcpu_assign_sys_reg(vcpu, PMSELR_EL0, val);
+
+	val = read_pmcr();
+	__vcpu_assign_sys_reg(vcpu, PMCR_EL0, val);
+
+	/* Mask these to only save the guest relevant bits. */
+	mask = kvm_pmu_guest_counter_mask(pmu);
+
+	val = read_pmcntenset();
+	__vcpu_assign_sys_reg(vcpu, PMCNTENSET_EL0, val & mask);
+
+	val = read_pmintenset();
+	__vcpu_assign_sys_reg(vcpu, PMINTENSET_EL1, val & mask);
+}
-- 
2.52.0.239.gd5f0c6e74e-goog
Re: [PATCH v5 17/24] KVM: arm64: Context swap Partitioned PMU guest registers
Posted by Oliver Upton 1 week, 1 day ago
On Tue, Dec 09, 2025 at 08:51:14PM +0000, Colton Lewis wrote:
> +/**
> + * kvm_pmu_load() - Load untrapped PMU registers
> + * @vcpu: Pointer to struct kvm_vcpu
> + *
> + * Load all untrapped PMU registers from the VCPU into the PCPU. Mask
> + * to only bits belonging to guest-reserved counters and leave
> + * host-reserved counters alone in bitmask registers.
> + */
> +void kvm_pmu_load(struct kvm_vcpu *vcpu)
> +{
> +	struct arm_pmu *pmu;
> +	u64 mask;
> +	u8 i;
> +	u64 val;
> +

Assert that preemption is disabled.
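
Something like this at the top of kvm_pmu_load() and kvm_pmu_put()
would do (a minimal sketch using the generic lockdep helper;
WARN_ON_ONCE(preemptible()) would work just as well):

	/* Called from vcpu load/put, so preemption must already be off. */
	lockdep_assert_preemption_disabled();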

> +	/*
> +	 * If we aren't using FGT then we are trapping everything
> +	 * anyway, so no need to bother with the swap.
> +	 */
> +	if (!kvm_vcpu_pmu_use_fgt(vcpu))
> +		return;

Uhh... Then how do events count in this case?

The absence of FEAT_FGT shouldn't affect the residence of the guest PMU
context. We just need to handle the extra traps, ideally by reading the
PMU registers directly as a fast path exit handler.
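
For one register, such a handler could look roughly like this (the
function name and how it gets wired into the existing trap handlers
are made up here, only to illustrate servicing the access from
hardware without a full exit):

	/* Hypothetical: complete a trapped guest read of PMCCNTR_EL0 in place. */
	static bool kvm_hyp_handle_pmccntr_read(struct kvm_vcpu *vcpu)
	{
		int rt = kvm_vcpu_sys_get_rt(vcpu);

		/* While partitioned, the counter lives in hardware; read it directly. */
		vcpu_set_reg(vcpu, rt, read_pmccntr());
		__kvm_skip_instr(vcpu);

		return true;
	}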

> +	pmu = vcpu->kvm->arch.arm_pmu;
> +
> +	for (i = 0; i < pmu->hpmn_max; i++) {
> +		val = __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i);
> +		write_pmevcntrn(i, val);
> +	}
> +
> +	val = __vcpu_sys_reg(vcpu, PMCCNTR_EL0);
> +	write_pmccntr(val);
> +
> +	val = __vcpu_sys_reg(vcpu, PMUSERENR_EL0);
> +	write_pmuserenr(val);

What about the host's value for PMUSERENR?
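
One conceivable shape for preserving it (only a sketch, reusing the
per-CPU host context the trap code already uses; whether this lives
here or stays in __activate_traps_common() is the real question):

	struct kvm_cpu_context *hctxt = host_data_ptr(host_ctxt);

	/* Stash the host's PMUSERENR before loading the guest's value... */
	ctxt_sys_reg(hctxt, PMUSERENR_EL0) = read_pmuserenr();

	val = __vcpu_sys_reg(vcpu, PMUSERENR_EL0);
	write_pmuserenr(val);

with kvm_pmu_put() writing ctxt_sys_reg(hctxt, PMUSERENR_EL0) back to
hardware on the way out.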

> +	val = __vcpu_sys_reg(vcpu, PMSELR_EL0);
> +	write_pmselr(val);

PMSELR_EL0 needs to be switched late, e.g. at sysreg_restore_guest_state_vhe().
Even though the host doesn't currently use the selector-based accessor,
I'd prefer we not load things that'd affect the host context until we're
about to enter the guest.

> +	/* Save only the stateful writable bits. */
> +	val = __vcpu_sys_reg(vcpu, PMCR_EL0);
> +	mask = ARMV8_PMU_PMCR_MASK &
> +		~(ARMV8_PMU_PMCR_P | ARMV8_PMU_PMCR_C);
> +	write_pmcr(val & mask);
> +
> +	/*
> +	 * When handling these:
> +	 * 1. Apply only the bits for guest counters (indicated by mask)
> +	 * 2. Use the different registers for set and clear
> +	 */
> +	mask = kvm_pmu_guest_counter_mask(pmu);
> +
> +	val = __vcpu_sys_reg(vcpu, PMCNTENSET_EL0);
> +	write_pmcntenset(val & mask);
> +	write_pmcntenclr(~val & mask);
> +
> +	val = __vcpu_sys_reg(vcpu, PMINTENSET_EL1);
> +	write_pmintenset(val & mask);
> +	write_pmintenclr(~val & mask);

Is this safe? What happens if we put the PMU into an overflow condition?

> +}
> +
> +/**
> + * kvm_pmu_put() - Put untrapped PMU registers
> + * @vcpu: Pointer to struct kvm_vcpu
> + *
> + * Put all untrapped PMU registers from the VCPU into the PCPU. Mask
> + * to only bits belonging to guest-reserved counters and leave
> + * host-reserved counters alone in bitmask registers.
> + */
> +void kvm_pmu_put(struct kvm_vcpu *vcpu)
> +{
> +	struct arm_pmu *pmu;
> +	u64 mask;
> +	u8 i;
> +	u64 val;
> +
> +	/*
> +	 * If we aren't using FGT then we are trapping everything
> +	 * anyway, so no need to bother with the swap.
> +	 */
> +	if (!kvm_vcpu_pmu_use_fgt(vcpu))
> +		return;
> +
> +	pmu = vcpu->kvm->arch.arm_pmu;
> +
> +	for (i = 0; i < pmu->hpmn_max; i++) {
> +		val = read_pmevcntrn(i);
> +		__vcpu_assign_sys_reg(vcpu, PMEVCNTR0_EL0 + i, val);
> +	}
> +
> +	val = read_pmccntr();
> +	__vcpu_assign_sys_reg(vcpu, PMCCNTR_EL0, val);
> +
> +	val = read_pmuserenr();
> +	__vcpu_assign_sys_reg(vcpu, PMUSERENR_EL0, val);
> +
> +	val = read_pmselr();
> +	__vcpu_assign_sys_reg(vcpu, PMSELR_EL0, val);
> +
> +	val = read_pmcr();
> +	__vcpu_assign_sys_reg(vcpu, PMCR_EL0, val);
> +
> +	/* Mask these to only save the guest relevant bits. */
> +	mask = kvm_pmu_guest_counter_mask(pmu);
> +
> +	val = read_pmcntenset();
> +	__vcpu_assign_sys_reg(vcpu, PMCNTENSET_EL0, val & mask);
> +
> +	val = read_pmintenset();
> +	__vcpu_assign_sys_reg(vcpu, PMINTENSET_EL1, val & mask);

What if the PMU is in an overflow state at this point?

Thanks,
Oliver
Re: [PATCH v5 17/24] KVM: arm64: Context swap Partitioned PMU guest registers
Posted by Colton Lewis 5 days, 10 hours ago
Oliver Upton <oupton@kernel.org> writes:

> On Tue, Dec 09, 2025 at 08:51:14PM +0000, Colton Lewis wrote:
>> +/**
>> + * kvm_pmu_load() - Load untrapped PMU registers
>> + * @vcpu: Pointer to struct kvm_vcpu
>> + *
>> + * Load all untrapped PMU registers from the VCPU into the PCPU. Mask
>> + * to only bits belonging to guest-reserved counters and leave
>> + * host-reserved counters alone in bitmask registers.
>> + */
>> +void kvm_pmu_load(struct kvm_vcpu *vcpu)
>> +{
>> +	struct arm_pmu *pmu;
>> +	u64 mask;
>> +	u8 i;
>> +	u64 val;
>> +

> Assert that preemption is disabled.

Will do.

>> +	/*
>> +	 * If we aren't using FGT then we are trapping everything
>> +	 * anyway, so no need to bother with the swap.
>> +	 */
>> +	if (!kvm_vcpu_pmu_use_fgt(vcpu))
>> +		return;

> Uhh... Then how do events count in this case?

> The absence of FEAT_FGT shouldn't affect the residence of the guest PMU
> context. We just need to handle the extra traps, ideally by reading the
> PMU registers directly as a fast path exit handler.

Agreed. Yeah, I fixed this in my internal backports, but it looks like
I skipped incorporating the fix here.

>> +	pmu = vcpu->kvm->arch.arm_pmu;
>> +
>> +	for (i = 0; i < pmu->hpmn_max; i++) {
>> +		val = __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i);
>> +		write_pmevcntrn(i, val);
>> +	}
>> +
>> +	val = __vcpu_sys_reg(vcpu, PMCCNTR_EL0);
>> +	write_pmccntr(val);
>> +
>> +	val = __vcpu_sys_reg(vcpu, PMUSERENR_EL0);
>> +	write_pmuserenr(val);

> What about the host's value for PMUSERENR?
>> +	val = __vcpu_sys_reg(vcpu, PMSELR_EL0);
>> +	write_pmselr(val);

> PMSELR_EL0 needs to be switched late, e.g. at  
> sysreg_restore_guest_state_vhe().
> Even though the host doesn't currently use the selector-based accessor,
> I'd prefer we not load things that'd affect the host context until we're
> about to enter the guest.


There's a spot in __activate_traps_common() where the host value for
PMUSERENR is saved and PMSELR is zeroed. I skipped that branch when
partitioning because it was clobbering my loaded values, but I can
modify it instead so these registers are handled as they should be.

>> +	/* Save only the stateful writable bits. */
>> +	val = __vcpu_sys_reg(vcpu, PMCR_EL0);
>> +	mask = ARMV8_PMU_PMCR_MASK &
>> +		~(ARMV8_PMU_PMCR_P | ARMV8_PMU_PMCR_C);
>> +	write_pmcr(val & mask);
>> +
>> +	/*
>> +	 * When handling these:
>> +	 * 1. Apply only the bits for guest counters (indicated by mask)
>> +	 * 2. Use the different registers for set and clear
>> +	 */
>> +	mask = kvm_pmu_guest_counter_mask(pmu);
>> +
>> +	val = __vcpu_sys_reg(vcpu, PMCNTENSET_EL0);
>> +	write_pmcntenset(val & mask);
>> +	write_pmcntenclr(~val & mask);
>> +
>> +	val = __vcpu_sys_reg(vcpu, PMINTENSET_EL1);
>> +	write_pmintenset(val & mask);
>> +	write_pmintenclr(~val & mask);

> Is this safe? What happens if we put the PMU into an overflow condition?

It gets handled by the host the same as any other PMU interrupt.
Though I remember from our conversation that you don't want the
latency of an additional interrupt, so I can handle that here.
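
A sketch of what that could look like (hypothetical; whether it
belongs in kvm_pmu_load(), kvm_pmu_put(), or both, and how it should
feed the vPMU's virtual interrupt path, isn't settled here; mask is
the same kvm_pmu_guest_counter_mask(pmu) value used above):

	u64 ovf;

	/* Capture pending guest overflows before the host can see them... */
	ovf = read_pmovsclr() & mask;
	if (ovf) {
		__vcpu_assign_sys_reg(vcpu, PMOVSSET_EL0,
				      __vcpu_sys_reg(vcpu, PMOVSSET_EL0) | ovf);
		/* ...and clear them in hardware before the PMINTEN bits are written. */
		write_pmovsclr(ovf);
	}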

>> +}
>> +
>> +/**
>> + * kvm_pmu_put() - Put untrapped PMU registers
>> + * @vcpu: Pointer to struct kvm_vcpu
>> + *
>> + * Put all untrapped PMU registers from the VCPU into the PCPU. Mask
>> + * to only bits belonging to guest-reserved counters and leave
>> + * host-reserved counters alone in bitmask registers.
>> + */
>> +void kvm_pmu_put(struct kvm_vcpu *vcpu)
>> +{
>> +	struct arm_pmu *pmu;
>> +	u64 mask;
>> +	u8 i;
>> +	u64 val;
>> +
>> +	/*
>> +	 * If we aren't using FGT then we are trapping everything
>> +	 * anyway, so no need to bother with the swap.
>> +	 */
>> +	if (!kvm_vcpu_pmu_use_fgt(vcpu))
>> +		return;
>> +
>> +	pmu = vcpu->kvm->arch.arm_pmu;
>> +
>> +	for (i = 0; i < pmu->hpmn_max; i++) {
>> +		val = read_pmevcntrn(i);
>> +		__vcpu_assign_sys_reg(vcpu, PMEVCNTR0_EL0 + i, val);
>> +	}
>> +
>> +	val = read_pmccntr();
>> +	__vcpu_assign_sys_reg(vcpu, PMCCNTR_EL0, val);
>> +
>> +	val = read_pmuserenr();
>> +	__vcpu_assign_sys_reg(vcpu, PMUSERENR_EL0, val);
>> +
>> +	val = read_pmselr();
>> +	__vcpu_assign_sys_reg(vcpu, PMSELR_EL0, val);
>> +
>> +	val = read_pmcr();
>> +	__vcpu_assign_sys_reg(vcpu, PMCR_EL0, val);
>> +
>> +	/* Mask these to only save the guest relevant bits. */
>> +	mask = kvm_pmu_guest_counter_mask(pmu);
>> +
>> +	val = read_pmcntenset();
>> +	__vcpu_assign_sys_reg(vcpu, PMCNTENSET_EL0, val & mask);
>> +
>> +	val = read_pmintenset();
>> +	__vcpu_assign_sys_reg(vcpu, PMINTENSET_EL1, val & mask);

> What if the PMU is in an overflow state at this point?

Is this a separate concern from the point above? It gets loaded back
that way and the normal interrupt machinery handles it.