[PATCH v5 19/24] KVM: arm64: Implement lazy PMU context swaps
Posted by Colton Lewis 1 week, 1 day ago
Since many guests will never touch the PMU, they need not pay the cost
of context swapping those registers.

Use an enum to implement a simple state machine for PMU register
access: a vCPU accesses PMU registers either virtually or physically.

Virtual access means all PMU registers are trapped coarsely by
MDCR_EL2.TPM and therefore do not need to be context swapped. Physical
access means some registers are untrapped through FGT and do need to
be context swapped. All vCPUs use virtual access by default and
transition to physical access if the PMU is partitioned and the guest
actually tries a PMU access.
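
To illustrate the two states (a sketch of the idea only, not the
exact mdcr_el2 code in this patch; the local variable is made up):

	if (vcpu->arch.pmu.access == VCPU_PMU_ACCESS_PHYSICAL)
		mdcr &= ~MDCR_EL2_TPM;	/* untrapped; FGT covers the rest */
	else
		mdcr |= MDCR_EL2_TPM;	/* trap all PMU accesses coarsely */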

Signed-off-by: Colton Lewis <coltonlewis@google.com>
---
 arch/arm64/include/asm/kvm_host.h       |  1 +
 arch/arm64/include/asm/kvm_pmu.h        |  4 ++++
 arch/arm64/include/asm/kvm_types.h      |  7 ++++++-
 arch/arm64/kvm/debug.c                  |  2 +-
 arch/arm64/kvm/hyp/include/hyp/switch.h |  2 ++
 arch/arm64/kvm/pmu-direct.c             | 21 +++++++++++++++++++++
 arch/arm64/kvm/pmu.c                    |  7 +++++++
 arch/arm64/kvm/sys_regs.c               |  4 ++++
 8 files changed, 46 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index c7e52aaf469dc..f92027d8fdfd0 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -1373,6 +1373,7 @@ static inline bool kvm_system_needs_idmapped_vectors(void)
 	return cpus_have_final_cap(ARM64_SPECTRE_V3A);
 }
 
+void kvm_arm_setup_mdcr_el2(struct kvm_vcpu *vcpu);
 void kvm_init_host_debug_data(void);
 void kvm_debug_init_vhe(void);
 void kvm_vcpu_load_debug(struct kvm_vcpu *vcpu);
diff --git a/arch/arm64/include/asm/kvm_pmu.h b/arch/arm64/include/asm/kvm_pmu.h
index 25a5eb8c623da..43aa334dce517 100644
--- a/arch/arm64/include/asm/kvm_pmu.h
+++ b/arch/arm64/include/asm/kvm_pmu.h
@@ -38,6 +38,7 @@ struct kvm_pmu {
 	int irq_num;
 	bool created;
 	bool irq_level;
+	enum vcpu_pmu_register_access access;
 };
 
 struct arm_pmu_entry {
@@ -106,6 +107,8 @@ u8 kvm_pmu_hpmn(struct kvm_vcpu *vcpu);
 void kvm_pmu_load(struct kvm_vcpu *vcpu);
 void kvm_pmu_put(struct kvm_vcpu *vcpu);
 
+void kvm_pmu_set_physical_access(struct kvm_vcpu *vcpu);
+
 #if !defined(__KVM_NVHE_HYPERVISOR__)
 bool kvm_vcpu_pmu_is_partitioned(struct kvm_vcpu *vcpu);
 bool kvm_vcpu_pmu_use_fgt(struct kvm_vcpu *vcpu);
@@ -188,6 +191,7 @@ static inline u8 kvm_pmu_hpmn(struct kvm_vcpu *vcpu)
 }
 static inline void kvm_pmu_load(struct kvm_vcpu *vcpu) {}
 static inline void kvm_pmu_put(struct kvm_vcpu *vcpu) {}
+static inline void kvm_pmu_set_physical_access(struct kvm_vcpu *vcpu) {}
 static inline void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu,
 					     u64 select_idx, u64 val) {}
 static inline void kvm_pmu_set_counter_value_user(struct kvm_vcpu *vcpu,
diff --git a/arch/arm64/include/asm/kvm_types.h b/arch/arm64/include/asm/kvm_types.h
index 9a126b9e2d7c9..9f67165359f5c 100644
--- a/arch/arm64/include/asm/kvm_types.h
+++ b/arch/arm64/include/asm/kvm_types.h
@@ -4,5 +4,10 @@
 
 #define KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE 40
 
-#endif /* _ASM_ARM64_KVM_TYPES_H */
+enum vcpu_pmu_register_access {
+	VCPU_PMU_ACCESS_UNSET,
+	VCPU_PMU_ACCESS_VIRTUAL,
+	VCPU_PMU_ACCESS_PHYSICAL,
+};
 
+#endif /* _ASM_ARM64_KVM_TYPES_H */
diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c
index 0ab89c91e19cb..c2cf6b308ec60 100644
--- a/arch/arm64/kvm/debug.c
+++ b/arch/arm64/kvm/debug.c
@@ -34,7 +34,7 @@ static int cpu_has_spe(u64 dfr0)
  *  - Self-hosted Trace Filter controls (MDCR_EL2_TTRF)
  *  - Self-hosted Trace (MDCR_EL2_TTRF/MDCR_EL2_E2TB)
  */
-static void kvm_arm_setup_mdcr_el2(struct kvm_vcpu *vcpu)
+void kvm_arm_setup_mdcr_el2(struct kvm_vcpu *vcpu)
 {
 	int hpmn = kvm_pmu_hpmn(vcpu);
 
diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h
index bde79ec1a1836..ea288a712bb5d 100644
--- a/arch/arm64/kvm/hyp/include/hyp/switch.h
+++ b/arch/arm64/kvm/hyp/include/hyp/switch.h
@@ -963,6 +963,8 @@ static bool kvm_hyp_handle_pmu_regs(struct kvm_vcpu *vcpu)
 	if (ret)
 		__kvm_skip_instr(vcpu);
 
+	kvm_pmu_set_physical_access(vcpu);
+
 	return ret;
 }
 
diff --git a/arch/arm64/kvm/pmu-direct.c b/arch/arm64/kvm/pmu-direct.c
index 8d0d6d1a0d851..c5767e2ebc651 100644
--- a/arch/arm64/kvm/pmu-direct.c
+++ b/arch/arm64/kvm/pmu-direct.c
@@ -73,6 +73,7 @@ bool kvm_vcpu_pmu_use_fgt(struct kvm_vcpu *vcpu)
 	u8 hpmn = vcpu->kvm->arch.nr_pmu_counters;
 
 	return kvm_vcpu_pmu_is_partitioned(vcpu) &&
+		vcpu->arch.pmu.access == VCPU_PMU_ACCESS_PHYSICAL &&
 		cpus_have_final_cap(ARM64_HAS_FGT) &&
 		(hpmn != 0 || cpus_have_final_cap(ARM64_HAS_HPMN0));
 }
@@ -92,6 +93,26 @@ u64 kvm_pmu_fgt2_bits(void)
 		| HDFGRTR2_EL2_nPMICNTR_EL0;
 }
 
+/**
+ * kvm_pmu_set_physical_access() - Switch the guest to physical PMU access
+ * @vcpu: Pointer to vcpu struct
+ *
+ * Reconfigure the guest for physical access of PMU hardware if
+ * allowed. This means reconfiguring mdcr_el2 and loading the vCPU
+ * state onto hardware.
+ *
+ */
+
+void kvm_pmu_set_physical_access(struct kvm_vcpu *vcpu)
+{
+	if (kvm_vcpu_pmu_is_partitioned(vcpu)
+	    && vcpu->arch.pmu.access == VCPU_PMU_ACCESS_VIRTUAL) {
+		vcpu->arch.pmu.access = VCPU_PMU_ACCESS_PHYSICAL;
+		kvm_arm_setup_mdcr_el2(vcpu);
+		kvm_pmu_load(vcpu);
+	}
+}
+
 /**
  * kvm_pmu_host_counter_mask() - Compute bitmask of host-reserved counters
  * @pmu: Pointer to arm_pmu struct
diff --git a/arch/arm64/kvm/pmu.c b/arch/arm64/kvm/pmu.c
index 48b39f096fa12..c9862e55a4049 100644
--- a/arch/arm64/kvm/pmu.c
+++ b/arch/arm64/kvm/pmu.c
@@ -471,6 +471,12 @@ int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu)
 	return 0;
 }
 
+static void kvm_pmu_register_init(struct kvm_vcpu *vcpu)
+{
+	if (vcpu->arch.pmu.access == VCPU_PMU_ACCESS_UNSET)
+		vcpu->arch.pmu.access = VCPU_PMU_ACCESS_VIRTUAL;
+}
+
 static int kvm_arm_pmu_v3_init(struct kvm_vcpu *vcpu)
 {
 	if (irqchip_in_kernel(vcpu->kvm)) {
@@ -496,6 +502,7 @@ static int kvm_arm_pmu_v3_init(struct kvm_vcpu *vcpu)
 	init_irq_work(&vcpu->arch.pmu.overflow_work,
 		      kvm_pmu_perf_overflow_notify_vcpu);
 
+	kvm_pmu_register_init(vcpu);
 	vcpu->arch.pmu.created = true;
 	return 0;
 }
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index f2ae761625a66..d73218706b834 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -1197,6 +1197,8 @@ static bool access_pmu_evtyper(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
 		p->regval = __vcpu_sys_reg(vcpu, reg);
 	}
 
+	kvm_pmu_set_physical_access(vcpu);
+
 	return true;
 }
 
@@ -1302,6 +1304,8 @@ static bool access_pmovs(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
 		p->regval = __vcpu_sys_reg(vcpu, PMOVSSET_EL0);
 	}
 
+	kvm_pmu_set_physical_access(vcpu);
+
 	return true;
 }
 
-- 
2.52.0.239.gd5f0c6e74e-goog
Re: [PATCH v5 19/24] KVM: arm64: Implement lazy PMU context swaps
Posted by Oliver Upton 1 week, 1 day ago
On Tue, Dec 09, 2025 at 08:51:16PM +0000, Colton Lewis wrote:
> +enum vcpu_pmu_register_access {
> +	VCPU_PMU_ACCESS_UNSET,
> +	VCPU_PMU_ACCESS_VIRTUAL,
> +	VCPU_PMU_ACCESS_PHYSICAL,
> +};

This is confusing. Even when the guest is accessing registers directly
on the CPU I'd still call that "hardware assisted virtualization" and
not "physical".

> +#endif /* _ASM_ARM64_KVM_TYPES_H */
> diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c
> index 0ab89c91e19cb..c2cf6b308ec60 100644
> --- a/arch/arm64/kvm/debug.c
> +++ b/arch/arm64/kvm/debug.c
> @@ -34,7 +34,7 @@ static int cpu_has_spe(u64 dfr0)
>   *  - Self-hosted Trace Filter controls (MDCR_EL2_TTRF)
>   *  - Self-hosted Trace (MDCR_EL2_TTRF/MDCR_EL2_E2TB)
>   */
> -static void kvm_arm_setup_mdcr_el2(struct kvm_vcpu *vcpu)
> +void kvm_arm_setup_mdcr_el2(struct kvm_vcpu *vcpu)
>  {
>  	int hpmn = kvm_pmu_hpmn(vcpu);
>  
> diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h
> index bde79ec1a1836..ea288a712bb5d 100644
> --- a/arch/arm64/kvm/hyp/include/hyp/switch.h
> +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h
> @@ -963,6 +963,8 @@ static bool kvm_hyp_handle_pmu_regs(struct kvm_vcpu *vcpu)
>  	if (ret)
>  		__kvm_skip_instr(vcpu);
>  
> +	kvm_pmu_set_physical_access(vcpu);
> +
>  	return ret;
>  }
>  
> diff --git a/arch/arm64/kvm/pmu-direct.c b/arch/arm64/kvm/pmu-direct.c
> index 8d0d6d1a0d851..c5767e2ebc651 100644
> --- a/arch/arm64/kvm/pmu-direct.c
> +++ b/arch/arm64/kvm/pmu-direct.c
> @@ -73,6 +73,7 @@ bool kvm_vcpu_pmu_use_fgt(struct kvm_vcpu *vcpu)
>  	u8 hpmn = vcpu->kvm->arch.nr_pmu_counters;
>  
>  	return kvm_vcpu_pmu_is_partitioned(vcpu) &&
> +		vcpu->arch.pmu.access == VCPU_PMU_ACCESS_PHYSICAL &&
>  		cpus_have_final_cap(ARM64_HAS_FGT) &&
>  		(hpmn != 0 || cpus_have_final_cap(ARM64_HAS_HPMN0));
>  }
> @@ -92,6 +93,26 @@ u64 kvm_pmu_fgt2_bits(void)
>  		| HDFGRTR2_EL2_nPMICNTR_EL0;
>  }
>  
> +/**
> + * kvm_pmu_set_physical_access() - Switch the guest to physical PMU access
> + * @vcpu: Pointer to vcpu struct
> + *
> + * Reconfigure the guest for physical access of PMU hardware if
> + * allowed. This means reconfiguring mdcr_el2 and loading the vCPU
> + * state onto hardware.
> + *
> + */
> +
> +void kvm_pmu_set_physical_access(struct kvm_vcpu *vcpu)
> +{
> +	if (kvm_vcpu_pmu_is_partitioned(vcpu)
> +	    && vcpu->arch.pmu.access == VCPU_PMU_ACCESS_VIRTUAL) {
> +		vcpu->arch.pmu.access = VCPU_PMU_ACCESS_PHYSICAL;
> +		kvm_arm_setup_mdcr_el2(vcpu);
> +		kvm_pmu_load(vcpu);
> +	}

It isn't immediately obvious how this guards against preemption.

Also, the general approach for these context-loading situations is to do
a full load/put on the vCPU rather than a directed load.
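
One way to make both explicit (a sketch only, in kernel rather than
hyp context, reusing the existing arch load/put hooks):

	/* Full put/load cycle instead of a directed kvm_pmu_load() */
	preempt_disable();
	kvm_arch_vcpu_put(vcpu);
	vcpu->arch.pmu.access = VCPU_PMU_ACCESS_PHYSICAL;
	kvm_arch_vcpu_load(vcpu, smp_processor_id());
	preempt_enable();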

> +static void kvm_pmu_register_init(struct kvm_vcpu *vcpu)
> +{
> +	if (vcpu->arch.pmu.access == VCPU_PMU_ACCESS_UNSET)
> +		vcpu->arch.pmu.access = VCPU_PMU_ACCESS_VIRTUAL;

This is confusing. The zero value of the enum should be consistent with
the "unloaded" state.
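
i.e. something along these lines (illustrative):

	enum vcpu_pmu_register_access {
		VCPU_PMU_ACCESS_VIRTUAL = 0,	/* zero value == nothing loaded */
		VCPU_PMU_ACCESS_PHYSICAL,
	};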

> diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
> index f2ae761625a66..d73218706b834 100644
> --- a/arch/arm64/kvm/sys_regs.c
> +++ b/arch/arm64/kvm/sys_regs.c
> @@ -1197,6 +1197,8 @@ static bool access_pmu_evtyper(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
>  		p->regval = __vcpu_sys_reg(vcpu, reg);
>  	}
>  
> +	kvm_pmu_set_physical_access(vcpu);
> +
>  	return true;
>  }
>  
> @@ -1302,6 +1304,8 @@ static bool access_pmovs(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
>  		p->regval = __vcpu_sys_reg(vcpu, PMOVSSET_EL0);
>  	}
>  
> +	kvm_pmu_set_physical_access(vcpu);
> +
>  	return true;
>  }

Aren't there a ton of other registers the guest may access before
these two? Having generic PMU register accessors would allow you to
manage residence of PMU registers from a single spot.
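
For example (hypothetical helper, not something in this series):

	/* Funnel every PMU register trap through one accessor so the
	 * residence transition lives in a single place. */
	static bool access_pmu_reg(struct kvm_vcpu *vcpu,
				   struct sys_reg_params *p,
				   const struct sys_reg_desc *r)
	{
		kvm_pmu_set_physical_access(vcpu);

		/* ... register-specific read/write handling here ... */

		return true;
	}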

Thanks,
Oliver
Re: [PATCH v5 19/24] KVM: arm64: Implement lazy PMU context swaps
Posted by Colton Lewis 5 days, 10 hours ago
Oliver Upton <oupton@kernel.org> writes:

> On Tue, Dec 09, 2025 at 08:51:16PM +0000, Colton Lewis wrote:
>> +enum vcpu_pmu_register_access {
>> +	VCPU_PMU_ACCESS_UNSET,
>> +	VCPU_PMU_ACCESS_VIRTUAL,
>> +	VCPU_PMU_ACCESS_PHYSICAL,
>> +};

> This is confusing. Even when the guest is accessing registers directly
> on the CPU I'd still call that "hardware assisted virtualization" and
> not "physical".

It was what I thought described the access pattern. Do you have another
naming suggestion?

>> +#endif /* _ASM_ARM64_KVM_TYPES_H */
>> diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c
>> index 0ab89c91e19cb..c2cf6b308ec60 100644
>> --- a/arch/arm64/kvm/debug.c
>> +++ b/arch/arm64/kvm/debug.c
>> @@ -34,7 +34,7 @@ static int cpu_has_spe(u64 dfr0)
>>    *  - Self-hosted Trace Filter controls (MDCR_EL2_TTRF)
>>    *  - Self-hosted Trace (MDCR_EL2_TTRF/MDCR_EL2_E2TB)
>>    */
>> -static void kvm_arm_setup_mdcr_el2(struct kvm_vcpu *vcpu)
>> +void kvm_arm_setup_mdcr_el2(struct kvm_vcpu *vcpu)
>>   {
>>   	int hpmn = kvm_pmu_hpmn(vcpu);

>> diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h
>> index bde79ec1a1836..ea288a712bb5d 100644
>> --- a/arch/arm64/kvm/hyp/include/hyp/switch.h
>> +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h
>> @@ -963,6 +963,8 @@ static bool kvm_hyp_handle_pmu_regs(struct kvm_vcpu *vcpu)
>>   	if (ret)
>>   		__kvm_skip_instr(vcpu);

>> +	kvm_pmu_set_physical_access(vcpu);
>> +
>>   	return ret;
>>   }

>> diff --git a/arch/arm64/kvm/pmu-direct.c b/arch/arm64/kvm/pmu-direct.c
>> index 8d0d6d1a0d851..c5767e2ebc651 100644
>> --- a/arch/arm64/kvm/pmu-direct.c
>> +++ b/arch/arm64/kvm/pmu-direct.c
>> @@ -73,6 +73,7 @@ bool kvm_vcpu_pmu_use_fgt(struct kvm_vcpu *vcpu)
>>   	u8 hpmn = vcpu->kvm->arch.nr_pmu_counters;

>>   	return kvm_vcpu_pmu_is_partitioned(vcpu) &&
>> +		vcpu->arch.pmu.access == VCPU_PMU_ACCESS_PHYSICAL &&
>>   		cpus_have_final_cap(ARM64_HAS_FGT) &&
>>   		(hpmn != 0 || cpus_have_final_cap(ARM64_HAS_HPMN0));
>>   }
>> @@ -92,6 +93,26 @@ u64 kvm_pmu_fgt2_bits(void)
>>   		| HDFGRTR2_EL2_nPMICNTR_EL0;
>>   }

>> +/**
>> + * kvm_pmu_set_physical_access() - Switch the guest to physical PMU access
>> + * @vcpu: Pointer to vcpu struct
>> + *
>> + * Reconfigure the guest for physical access of PMU hardware if
>> + * allowed. This means reconfiguring mdcr_el2 and loading the vCPU
>> + * state onto hardware.
>> + *
>> + */
>> +
>> +void kvm_pmu_set_physical_access(struct kvm_vcpu *vcpu)
>> +{
>> +	if (kvm_vcpu_pmu_is_partitioned(vcpu)
>> +	    && vcpu->arch.pmu.access == VCPU_PMU_ACCESS_VIRTUAL) {
>> +		vcpu->arch.pmu.access = VCPU_PMU_ACCESS_PHYSICAL;
>> +		kvm_arm_setup_mdcr_el2(vcpu);
>> +		kvm_pmu_load(vcpu);
>> +	}

> It isn't immediately obvious how this guards against preemption.

> Also, the general approach for these context-loading situations is to do
> a full load/put on the vCPU rather than a directed load.

Understood. Will fix.

>> +static void kvm_pmu_register_init(struct kvm_vcpu *vcpu)
>> +{
>> +	if (vcpu->arch.pmu.access == VCPU_PMU_ACCESS_UNSET)
>> +		vcpu->arch.pmu.access = VCPU_PMU_ACCESS_VIRTUAL;

> This is confusing. The zero value of the enum should be consistent with
> the "unloaded" state.

That's the way I initially wrote it, but it had a problem on a
different kernel. I forget the exact issue, but I never saw the
problem on upstream so I'm happy to go back to it.

>> diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
>> index f2ae761625a66..d73218706b834 100644
>> --- a/arch/arm64/kvm/sys_regs.c
>> +++ b/arch/arm64/kvm/sys_regs.c
>> @@ -1197,6 +1197,8 @@ static bool access_pmu_evtyper(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
>>   		p->regval = __vcpu_sys_reg(vcpu, reg);
>>   	}

>> +	kvm_pmu_set_physical_access(vcpu);
>> +
>>   	return true;
>>   }

>> @@ -1302,6 +1304,8 @@ static bool access_pmovs(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
>>   		p->regval = __vcpu_sys_reg(vcpu, PMOVSSET_EL0);
>>   	}

>> +	kvm_pmu_set_physical_access(vcpu);
>> +
>>   	return true;
>>   }

> Aren't there a ton of other registers the guest may access before
> these two? Having generic PMU register accessors would allow you to
> manage residence of PMU registers from a single spot.

Yes, but these are the only two that use the old trap handlers. I also
call set_physical_access from my fast path handler that handles all the
other registers when partitioned.

Agreed on having some generic accessors, which you mention in an
earlier patch.
Re: [PATCH v5 19/24] KVM: arm64: Implement lazy PMU context swaps
Posted by Oliver Upton 2 days, 14 hours ago
On Fri, Dec 12, 2025 at 10:25:44PM +0000, Colton Lewis wrote:
> Oliver Upton <oupton@kernel.org> writes:
> 
> > On Tue, Dec 09, 2025 at 08:51:16PM +0000, Colton Lewis wrote:
> > > +enum vcpu_pmu_register_access {
> > > +	VCPU_PMU_ACCESS_UNSET,
> > > +	VCPU_PMU_ACCESS_VIRTUAL,
> > > +	VCPU_PMU_ACCESS_PHYSICAL,
> > > +};
> 
> > This is confusing. Even when the guest is accessing registers directly
> > on the CPU I'd still call that "hardware assisted virtualization" and
> > not "physical".
> 
> It was what I thought described the access pattern. Do you have another
> naming suggestion?

	PMU_STATE_FREE,
	PMU_STATE_GUEST_OWNED,

> > > +	kvm_pmu_set_physical_access(vcpu);
> > > +
> > >   	return true;
> > >   }
> 
> > Aren't there a ton of other registers the guest may access before
> > these two? Having generic PMU register accessors would allow you to
> > manage residence of PMU registers from a single spot.
> 
> Yes but these are the only two that use the old trap handlers. I also
> call set_physical_access from my fast path handler that handles all the
> other registers when partitioned.

The fast path accessors should only be accessing state already loaded
on the CPU. If the guest's PMU context isn't loaded on the CPU then it
should return to a kernel context and do a full put/load on the vCPU.
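
Something like this at the top of the fast path (a sketch using the
state names suggested above; the field name is illustrative):

	/* PMU context not resident: decline the fast path so the exit
	 * reaches the kernel, which can do a full vcpu put/load. */
	if (vcpu->arch.pmu.state != PMU_STATE_GUEST_OWNED)
		return false;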

I'm not seeing how this all fits together, but for lazy loading to work
correctly you need to evaluate the state of the vPMU at vcpu_load(). If
there exists an enabled PMC, set PMU_STATE_GUEST_OWNED and load it
upfront.

Otherwise, default to PMU_STATE_FREE until the next register access,
and this whole thing resets when the vCPU is scheduled out. I had
suggested to you a while back that you should follow a similar model
to the debug registers; this is how they behave.

Thanks,
Oliver