[RFC PATCH v3 5/8] KVM: arm64: Introduce module param to partition the PMU

Colton Lewis posted 8 patches 10 months, 1 week ago
[RFC PATCH v3 5/8] KVM: arm64: Introduce module param to partition the PMU
Posted by Colton Lewis 10 months, 1 week ago
For PMUv3, the register MDCR_EL2.HPMN partitions the PMU counters
into two ranges where counters 0..HPMN-1 are accessible by EL1 and, if
allowed, EL0 while counters HPMN..N are only accessible by EL2.
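
Illustrated with index masks (not part of this patch; GENMASK_ULL()
comes from <linux/bits.h>), the split looks roughly like:

/*
 * Counters 0..hpmn-1 form the EL1/EL0-accessible range, counters
 * hpmn..nr_counters-1 are reserved for EL2.
 */
static inline u64 guest_range_mask(u8 hpmn)
{
	return hpmn ? GENMASK_ULL(hpmn - 1, 0) : 0;
}

static inline u64 host_range_mask(u8 hpmn, u8 nr_counters)
{
	return hpmn < nr_counters ? GENMASK_ULL(nr_counters - 1, hpmn) : 0;
}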

Introduce a module parameter in KVM to set this register. The name
reserved_host_counters reflects the intent to reserve some counters
for the host so the guest may eventually be allowed direct access to a
subset of PMU functionality for increased performance.

Track HPMN and whether the pmu is partitioned in struct arm_pmu
because both KVM and the PMUv3 driver will need to know that to handle
guests correctly.

Due to the difficulty this feature would create for the driver running
at EL1 on the host, partitioning is only allowed in VHE mode.
Supporting nVHE mode would require a hypercall for every register access
because the counters reserved for the host by HPMN are now only
accessible to EL2.

The parameter is only configurable at boot time. Making the parameter
configurable on a running system is dangerous due to the difficulty of
knowing for sure no counters are in use anywhere, which must be true
before it is safe to reprogram HPMN.

Signed-off-by: Colton Lewis <coltonlewis@google.com>
---
 arch/arm64/include/asm/kvm_pmu.h |  4 +++
 arch/arm64/kvm/Makefile          |  2 +-
 arch/arm64/kvm/debug.c           |  9 ++++--
 arch/arm64/kvm/pmu-part.c        | 47 ++++++++++++++++++++++++++++++++
 arch/arm64/kvm/pmu.c             |  2 ++
 include/linux/perf/arm_pmu.h     |  2 ++
 6 files changed, 62 insertions(+), 4 deletions(-)
 create mode 100644 arch/arm64/kvm/pmu-part.c

diff --git a/arch/arm64/include/asm/kvm_pmu.h b/arch/arm64/include/asm/kvm_pmu.h
index 613cddbdbdd8..174b7f376d95 100644
--- a/arch/arm64/include/asm/kvm_pmu.h
+++ b/arch/arm64/include/asm/kvm_pmu.h
@@ -22,6 +22,10 @@ bool kvm_set_pmuserenr(u64 val);
 void kvm_vcpu_pmu_resync_el0(void);
 void kvm_host_pmu_init(struct arm_pmu *pmu);
 
+u8 kvm_pmu_get_reserved_counters(void);
+u8 kvm_pmu_hpmn(u8 nr_counters);
+void kvm_pmu_partition(struct arm_pmu *pmu);
+
 #else
 
 static inline void kvm_set_pmu_events(u64 set, struct perf_event_attr *attr) {}
diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
index 3cf7adb2b503..065a6b804c84 100644
--- a/arch/arm64/kvm/Makefile
+++ b/arch/arm64/kvm/Makefile
@@ -25,7 +25,7 @@ kvm-y += arm.o mmu.o mmio.o psci.o hypercalls.o pvtime.o \
 	 vgic/vgic-mmio-v3.o vgic/vgic-kvm-device.o \
 	 vgic/vgic-its.o vgic/vgic-debug.o
 
-kvm-$(CONFIG_HW_PERF_EVENTS)  += pmu-emul.o pmu.o
+kvm-$(CONFIG_HW_PERF_EVENTS)  += pmu-emul.o pmu-part.o pmu.o
 kvm-$(CONFIG_ARM64_PTR_AUTH)  += pauth.o
 kvm-$(CONFIG_PTDUMP_STAGE2_DEBUGFS) += ptdump.o
 
diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c
index 7fb1d9e7180f..b5ac5a213877 100644
--- a/arch/arm64/kvm/debug.c
+++ b/arch/arm64/kvm/debug.c
@@ -31,15 +31,18 @@
  */
 static void kvm_arm_setup_mdcr_el2(struct kvm_vcpu *vcpu)
 {
+	u8 counters = *host_data_ptr(nr_event_counters);
+	u8 hpmn = kvm_pmu_hpmn(counters);
+
 	preempt_disable();
 
 	/*
 	 * This also clears MDCR_EL2_E2PB_MASK and MDCR_EL2_E2TB_MASK
 	 * to disable guest access to the profiling and trace buffers
 	 */
-	vcpu->arch.mdcr_el2 = FIELD_PREP(MDCR_EL2_HPMN,
-					 *host_data_ptr(nr_event_counters));
-	vcpu->arch.mdcr_el2 |= (MDCR_EL2_TPM |
+	vcpu->arch.mdcr_el2 = FIELD_PREP(MDCR_EL2_HPMN, hpmn);
+	vcpu->arch.mdcr_el2 |= (MDCR_EL2_HPMD |
+				MDCR_EL2_TPM |
 				MDCR_EL2_TPMS |
 				MDCR_EL2_TTRF |
 				MDCR_EL2_TPMCR |
diff --git a/arch/arm64/kvm/pmu-part.c b/arch/arm64/kvm/pmu-part.c
new file mode 100644
index 000000000000..e74fecc67e37
--- /dev/null
+++ b/arch/arm64/kvm/pmu-part.c
@@ -0,0 +1,47 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2025 Google LLC
+ * Author: Colton Lewis <coltonlewis@google.com>
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/perf/arm_pmu.h>
+
+#include <asm/kvm_pmu.h>
+
+static u8 reserved_host_counters __read_mostly;
+
+module_param(reserved_host_counters, byte, 0);
+MODULE_PARM_DESC(reserved_host_counters,
+		 "Partition the PMU into host and guest counters");
+
+u8 kvm_pmu_get_reserved_counters(void)
+{
+	return reserved_host_counters;
+}
+
+u8 kvm_pmu_hpmn(u8 nr_counters)
+{
+	if (reserved_host_counters >= nr_counters) {
+		if (this_cpu_has_cap(ARM64_HAS_HPMN0))
+			return 0;
+
+		return 1;
+	}
+
+	return nr_counters - reserved_host_counters;
+}
+
+void kvm_pmu_partition(struct arm_pmu *pmu)
+{
+	u8 nr_counters = *host_data_ptr(nr_event_counters);
+	u8 hpmn = kvm_pmu_hpmn(nr_counters);
+
+	if (hpmn < nr_counters) {
+		pmu->hpmn = hpmn;
+		pmu->partitioned = true;
+	} else {
+		pmu->hpmn = nr_counters;
+		pmu->partitioned = false;
+	}
+}
diff --git a/arch/arm64/kvm/pmu.c b/arch/arm64/kvm/pmu.c
index 85b5cb432c4f..7169c1a24dd6 100644
--- a/arch/arm64/kvm/pmu.c
+++ b/arch/arm64/kvm/pmu.c
@@ -243,6 +243,8 @@ void kvm_host_pmu_init(struct arm_pmu *pmu)
 	entry->arm_pmu = pmu;
 	list_add_tail(&entry->entry, &arm_pmus);
 
+	kvm_pmu_partition(pmu);
+
 	if (list_is_singular(&arm_pmus))
 		static_branch_enable(&kvm_arm_pmu_available);
 
diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h
index 35c3a85bee43..ee4fc2e26bff 100644
--- a/include/linux/perf/arm_pmu.h
+++ b/include/linux/perf/arm_pmu.h
@@ -125,6 +125,8 @@ struct arm_pmu {
 
 	/* Only to be used by ACPI probing code */
 	unsigned long acpi_cpuid;
+	u8		hpmn; /* MDCR_EL2.HPMN: counter partition pivot */
+	bool		partitioned;
 };
 
 #define to_arm_pmu(p) (container_of(p, struct arm_pmu, pmu))
-- 
2.48.1.601.g30ceb7b040-goog
Re: [RFC PATCH v3 5/8] KVM: arm64: Introduce module param to partition the PMU
Posted by James Clark 8 months, 4 weeks ago

On 13/02/2025 6:03 pm, Colton Lewis wrote:
> For PMUv3, the register MDCR_EL2.HPMN partitions the PMU counters
> into two ranges where counters 0..HPMN-1 are accessible by EL1 and, if
> allowed, EL0 while counters HPMN..N are only accessible by EL2.
> 
> Introduce a module parameter in KVM to set this register. The name
> reserved_host_counters reflects the intent to reserve some counters
> for the host so the guest may eventually be allowed direct access to a
> subset of PMU functionality for increased performance.
> 
> Track HPMN and whether the pmu is partitioned in struct arm_pmu
> because both KVM and the PMUv3 driver will need to know that to handle
> guests correctly.
> 
> Due to the difficulty this feature would create for the driver running
> at EL1 on the host, partitioning is only allowed in VHE mode. Working
> on nVHE mode would require a hypercall for every register access
> because the counters reserved for the host by HPMN are now only
> accessible to EL2.
> 
> The parameter is only configurable at boot time. Making the parameter
> configurable on a running system is dangerous due to the difficulty of
> knowing for sure no counters are in use anywhere so it is safe to
> reprogram HPMN.
> 

Hi Colton,

For some high level feedback for the RFC, it probably makes sense to 
include the other half of the feature at the same time. I think there is 
a risk that it requires something slightly different than what's here 
and there ends up being some churn.

Other than that I think it looks ok apart from some minor code review nits.

I was also thinking about how BRBE interacts with this. Alex has done 
some analysis that finds that it's difficult to use BRBE in guests with 
virtualized counters due to the fact that BRBE freezes on any counter 
overflow, rather than just guest ones. That leaves the guest with branch 
blackout windows in the delay between a host counter overflowing and the 
interrupt being taken and BRBE being restarted.

But with HPMN, BRBE does allow freeze on overflow of only one partition 
or the other (or both, but I don't think we'd want that) e.g.:

  RNXCWF: If EL2 is implemented, a BRBE freeze event occurs when all of
  the following are true:

  * BRBCR_EL1.FZP is 1.
  * Generation of Branch records is not paused.
  * PMOVSCLR_EL0[(MDCR_EL2.HPMN-1):0] is nonzero.
  * The PE is in a BRBE Non-prohibited region.

Unfortunately that means we could only let guests use BRBE with a 
partitioned PMU, which would massively reduce flexibility if hosts have 
to lose counters just so the guest can use BRBE.
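
Restated as a predicate (just a sketch of the rule quoted above, with
plain parameters standing in for BRBCR_EL1.FZP, the pause state,
PMOVSCLR_EL0 and MDCR_EL2.HPMN; this is not driver code):

static bool brbe_would_freeze(bool fzp, bool paused, u64 pmovsclr,
			      u8 hpmn, bool prohibited_region)
{
	/* Only overflows in the guest partition (below HPMN) count. */
	u64 guest_ovf = hpmn ? (pmovsclr & GENMASK_ULL(hpmn - 1, 0)) : 0;

	return fzp && !paused && guest_ovf && !prohibited_region;
}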

I don't know if this is a stupid idea, but instead of having a fixed 
number for the partition, wouldn't it be nice if we could trap and 
increment HPMN on the first guest use of a counter, then decrement it on 
guest exit depending on what's still in use? The host would always 
assign its counters from the top down, and guests go bottom up if they 
want PMU passthrough. Maybe it's too complicated or won't work for 
various reasons, but because of BRBE the counter partitioning changes go 
from an optimization to almost a necessity.

> Signed-off-by: Colton Lewis <coltonlewis@google.com>
> ---
>   arch/arm64/include/asm/kvm_pmu.h |  4 +++
>   arch/arm64/kvm/Makefile          |  2 +-
>   arch/arm64/kvm/debug.c           |  9 ++++--
>   arch/arm64/kvm/pmu-part.c        | 47 ++++++++++++++++++++++++++++++++
>   arch/arm64/kvm/pmu.c             |  2 ++
>   include/linux/perf/arm_pmu.h     |  2 ++
>   6 files changed, 62 insertions(+), 4 deletions(-)
>   create mode 100644 arch/arm64/kvm/pmu-part.c
> 
> diff --git a/arch/arm64/include/asm/kvm_pmu.h b/arch/arm64/include/asm/kvm_pmu.h
> index 613cddbdbdd8..174b7f376d95 100644
> --- a/arch/arm64/include/asm/kvm_pmu.h
> +++ b/arch/arm64/include/asm/kvm_pmu.h
> @@ -22,6 +22,10 @@ bool kvm_set_pmuserenr(u64 val);
>   void kvm_vcpu_pmu_resync_el0(void);
>   void kvm_host_pmu_init(struct arm_pmu *pmu);
>   
> +u8 kvm_pmu_get_reserved_counters(void);
> +u8 kvm_pmu_hpmn(u8 nr_counters);
> +void kvm_pmu_partition(struct arm_pmu *pmu);
> +
>   #else
>   
>   static inline void kvm_set_pmu_events(u64 set, struct perf_event_attr *attr) {}
> diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
> index 3cf7adb2b503..065a6b804c84 100644
> --- a/arch/arm64/kvm/Makefile
> +++ b/arch/arm64/kvm/Makefile
> @@ -25,7 +25,7 @@ kvm-y += arm.o mmu.o mmio.o psci.o hypercalls.o pvtime.o \
>   	 vgic/vgic-mmio-v3.o vgic/vgic-kvm-device.o \
>   	 vgic/vgic-its.o vgic/vgic-debug.o
>   
> -kvm-$(CONFIG_HW_PERF_EVENTS)  += pmu-emul.o pmu.o
> +kvm-$(CONFIG_HW_PERF_EVENTS)  += pmu-emul.o pmu-part.o pmu.o
>   kvm-$(CONFIG_ARM64_PTR_AUTH)  += pauth.o
>   kvm-$(CONFIG_PTDUMP_STAGE2_DEBUGFS) += ptdump.o
>   
> diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c
> index 7fb1d9e7180f..b5ac5a213877 100644
> --- a/arch/arm64/kvm/debug.c
> +++ b/arch/arm64/kvm/debug.c
> @@ -31,15 +31,18 @@
>    */
>   static void kvm_arm_setup_mdcr_el2(struct kvm_vcpu *vcpu)
>   {
> +	u8 counters = *host_data_ptr(nr_event_counters);
> +	u8 hpmn = kvm_pmu_hpmn(counters);
> +
>   	preempt_disable();
>   

Would you not need to use vcpu->cpu here to access host_data? The 
preempt_disable() after the access seems suspicious. I think you'll end 
up with the same issue as here:

https://lore.kernel.org/kvmarm/5edb7c69-f548-4651-8b63-1643c5b13dac@linaro.org/

>   	/*
>   	 * This also clears MDCR_EL2_E2PB_MASK and MDCR_EL2_E2TB_MASK
>   	 * to disable guest access to the profiling and trace buffers
>   	 */
> -	vcpu->arch.mdcr_el2 = FIELD_PREP(MDCR_EL2_HPMN,
> -					 *host_data_ptr(nr_event_counters));
> -	vcpu->arch.mdcr_el2 |= (MDCR_EL2_TPM |
> +	vcpu->arch.mdcr_el2 = FIELD_PREP(MDCR_EL2_HPMN, hpmn);
> +	vcpu->arch.mdcr_el2 |= (MDCR_EL2_HPMD |
> +				MDCR_EL2_TPM |
>   				MDCR_EL2_TPMS |
>   				MDCR_EL2_TTRF |
>   				MDCR_EL2_TPMCR |
> diff --git a/arch/arm64/kvm/pmu-part.c b/arch/arm64/kvm/pmu-part.c
> new file mode 100644
> index 000000000000..e74fecc67e37
> --- /dev/null
> +++ b/arch/arm64/kvm/pmu-part.c
> @@ -0,0 +1,47 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Copyright (C) 2025 Google LLC
> + * Author: Colton Lewis <coltonlewis@google.com>
> + */
> +
> +#include <linux/kvm_host.h>
> +#include <linux/perf/arm_pmu.h>
> +
> +#include <asm/kvm_pmu.h>
> +
> +static u8 reserved_host_counters __read_mostly;
> +
> +module_param(reserved_host_counters, byte, 0);
> +MODULE_PARM_DESC(reserved_host_counters,
> +		 "Partition the PMU into host and guest counters");
> +
> +u8 kvm_pmu_get_reserved_counters(void)
> +{
> +	return reserved_host_counters;
> +}
> +
> +u8 kvm_pmu_hpmn(u8 nr_counters)
> +{
> +	if (reserved_host_counters >= nr_counters) {
> +		if (this_cpu_has_cap(ARM64_HAS_HPMN0))
> +			return 0;
> +
> +		return 1;
> +	}
> +
> +	return nr_counters - reserved_host_counters;
> +}
> +
> +void kvm_pmu_partition(struct arm_pmu *pmu)
> +{
> +	u8 nr_counters = *host_data_ptr(nr_event_counters);
> +	u8 hpmn = kvm_pmu_hpmn(nr_counters);
> +
> +	if (hpmn < nr_counters) {
> +		pmu->hpmn = hpmn;
> +		pmu->partitioned = true;

Looks like Rob's point about pmu->partitioned being duplicate data 
stands again. On the previous version you mentioned that saving it was 
to avoid reading PMCR.N, but now it's not based on PMCR.N anymore.

Thanks
James
Re: [RFC PATCH v3 5/8] KVM: arm64: Introduce module param to partition the PMU
Posted by Colton Lewis 8 months, 4 weeks ago
Hi James,

Thanks for the review.

James Clark <james.clark@linaro.org> writes:

> On 13/02/2025 6:03 pm, Colton Lewis wrote:
>> For PMUv3, the register MDCR_EL2.HPMN partitions the PMU counters
>> into two ranges where counters 0..HPMN-1 are accessible by EL1 and, if
>> allowed, EL0 while counters HPMN..N are only accessible by EL2.

>> Introduce a module parameter in KVM to set this register. The name
>> reserved_host_counters reflects the intent to reserve some counters
>> for the host so the guest may eventually be allowed direct access to a
>> subset of PMU functionality for increased performance.

>> Track HPMN and whether the pmu is partitioned in struct arm_pmu
>> because both KVM and the PMUv3 driver will need to know that to handle
>> guests correctly.

>> Due to the difficulty this feature would create for the driver running
>> at EL1 on the host, partitioning is only allowed in VHE mode. Working
>> on nVHE mode would require a hypercall for every register access
>> because the counters reserved for the host by HPMN are now only
>> accessible to EL2.

>> The parameter is only configurable at boot time. Making the parameter
>> configurable on a running system is dangerous due to the difficulty of
>> knowing for sure no counters are in use anywhere so it is safe to
>> reprogram HPMN.


> Hi Colton,

> For some high level feedback for the RFC, it probably makes sense to
> include the other half of the feature at the same time. I think there is
> a risk that it requires something slightly different than what's here
> and there ends up being some churn.

I agree. That's what I'm working on now. I just wanted an iteration or
two in public so I'm not building on something that needs drastic change
later.

> Other than that I think it looks ok apart from some minor code review  
> nits.

Thank you

> I was also thinking about how BRBE interacts with this. Alex has done
> some analysis that finds that it's difficult to use BRBE in guests with
> virtualized counters due to the fact that BRBE freezes on any counter
> overflow, rather than just guest ones. That leaves the guest with branch
> blackout windows in the delay between a host counter overflowing and the
> interrupt being taken and BRBE being restarted.

> But with HPMN, BRBE does allow freeze on overflow of only one partition
> or the other (or both, but I don't think we'd want that) e.g.:

>    RNXCWF: If EL2 is implemented, a BRBE freeze event occurs when all of
>    the following are true:

>    * BRBCR_EL1.FZP is 1.
>    * Generation of Branch records is not paused.
>    * PMOVSCLR_EL0[(MDCR_EL2.HPMN-1):0] is nonzero.
>    * The PE is in a BRBE Non-prohibited region.

> Unfortunately that means we could only let guests use BRBE with a
> partitioned PMU, which would massively reduce flexibility if hosts have
> to lose counters just so the guest can use BRBE.

> I don't know if this is a stupid idea, but instead of having a fixed
> number for the partition, wouldn't it be nice if we could trap and
> increment HPMN on the first guest use of a counter, then decrement it on
> guest exit depending on what's still in use? The host would always
> assign its counters from the top down, and guests go bottom up if they
> want PMU passthrough. Maybe it's too complicated or won't work for
> various reasons, but because of BRBE the counter partitioning changes go
> from an optimization to almost a necessity.

This is a cool idea that would enable useful things. I can think of a
few potential problems.

1. Partitioning will give guests direct access to some PMU counter
registers. There is no reliable way for KVM to determine what is in use
from that state. A counter that is disabled at guest exit might only be
so temporarily, which could lead to a lot of thrashing allocating and
deallocating counters.

2. HPMN affects reads of PMCR_EL0.N, which is the standard way to
determine how many counters there are. If HPMN starts as a low number,
guests have no way of knowing there are more counters
available. Dynamically changing the counters available could be
confusing for guests.

3. If guests were aware they could write beyond HPMN and get the
counters allocated to them, nothing stops them from writing at counter
N and taking as many counters as possible to starve the host.

>> Signed-off-by: Colton Lewis <coltonlewis@google.com>
>> ---
>>    arch/arm64/include/asm/kvm_pmu.h |  4 +++
>>    arch/arm64/kvm/Makefile          |  2 +-
>>    arch/arm64/kvm/debug.c           |  9 ++++--
>>    arch/arm64/kvm/pmu-part.c        | 47 ++++++++++++++++++++++++++++++++
>>    arch/arm64/kvm/pmu.c             |  2 ++
>>    include/linux/perf/arm_pmu.h     |  2 ++
>>    6 files changed, 62 insertions(+), 4 deletions(-)
>>    create mode 100644 arch/arm64/kvm/pmu-part.c

>> diff --git a/arch/arm64/include/asm/kvm_pmu.h  
>> b/arch/arm64/include/asm/kvm_pmu.h
>> index 613cddbdbdd8..174b7f376d95 100644
>> --- a/arch/arm64/include/asm/kvm_pmu.h
>> +++ b/arch/arm64/include/asm/kvm_pmu.h
>> @@ -22,6 +22,10 @@ bool kvm_set_pmuserenr(u64 val);
>>    void kvm_vcpu_pmu_resync_el0(void);
>>    void kvm_host_pmu_init(struct arm_pmu *pmu);

>> +u8 kvm_pmu_get_reserved_counters(void);
>> +u8 kvm_pmu_hpmn(u8 nr_counters);
>> +void kvm_pmu_partition(struct arm_pmu *pmu);
>> +
>>    #else

>>    static inline void kvm_set_pmu_events(u64 set, struct perf_event_attr  
>> *attr) {}
>> diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
>> index 3cf7adb2b503..065a6b804c84 100644
>> --- a/arch/arm64/kvm/Makefile
>> +++ b/arch/arm64/kvm/Makefile
>> @@ -25,7 +25,7 @@ kvm-y += arm.o mmu.o mmio.o psci.o hypercalls.o  
>> pvtime.o \
>>    	 vgic/vgic-mmio-v3.o vgic/vgic-kvm-device.o \
>>    	 vgic/vgic-its.o vgic/vgic-debug.o

>> -kvm-$(CONFIG_HW_PERF_EVENTS)  += pmu-emul.o pmu.o
>> +kvm-$(CONFIG_HW_PERF_EVENTS)  += pmu-emul.o pmu-part.o pmu.o
>>    kvm-$(CONFIG_ARM64_PTR_AUTH)  += pauth.o
>>    kvm-$(CONFIG_PTDUMP_STAGE2_DEBUGFS) += ptdump.o

>> diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c
>> index 7fb1d9e7180f..b5ac5a213877 100644
>> --- a/arch/arm64/kvm/debug.c
>> +++ b/arch/arm64/kvm/debug.c
>> @@ -31,15 +31,18 @@
>>     */
>>    static void kvm_arm_setup_mdcr_el2(struct kvm_vcpu *vcpu)
>>    {
>> +	u8 counters = *host_data_ptr(nr_event_counters);
>> +	u8 hpmn = kvm_pmu_hpmn(counters);
>> +
>>    	preempt_disable();


> Would you not need to use vcpu->cpu here to access host_data? The
> preempt_disable() after the access seems suspicious. I think you'll end
> up with the same issue as here:

> https://lore.kernel.org/kvmarm/5edb7c69-f548-4651-8b63-1643c5b13dac@linaro.org/

I think that's right. I should use the host_data for vcpu->cpu

>>    	/*
>>    	 * This also clears MDCR_EL2_E2PB_MASK and MDCR_EL2_E2TB_MASK
>>    	 * to disable guest access to the profiling and trace buffers
>>    	 */
>> -	vcpu->arch.mdcr_el2 = FIELD_PREP(MDCR_EL2_HPMN,
>> -					 *host_data_ptr(nr_event_counters));
>> -	vcpu->arch.mdcr_el2 |= (MDCR_EL2_TPM |
>> +	vcpu->arch.mdcr_el2 = FIELD_PREP(MDCR_EL2_HPMN, hpmn);
>> +	vcpu->arch.mdcr_el2 |= (MDCR_EL2_HPMD |
>> +				MDCR_EL2_TPM |
>>    				MDCR_EL2_TPMS |
>>    				MDCR_EL2_TTRF |
>>    				MDCR_EL2_TPMCR |
>> diff --git a/arch/arm64/kvm/pmu-part.c b/arch/arm64/kvm/pmu-part.c
>> new file mode 100644
>> index 000000000000..e74fecc67e37
>> --- /dev/null
>> +++ b/arch/arm64/kvm/pmu-part.c
>> @@ -0,0 +1,47 @@
>> +// SPDX-License-Identifier: GPL-2.0-only
>> +/*
>> + * Copyright (C) 2025 Google LLC
>> + * Author: Colton Lewis <coltonlewis@google.com>
>> + */
>> +
>> +#include <linux/kvm_host.h>
>> +#include <linux/perf/arm_pmu.h>
>> +
>> +#include <asm/kvm_pmu.h>
>> +
>> +static u8 reserved_host_counters __read_mostly;
>> +
>> +module_param(reserved_host_counters, byte, 0);
>> +MODULE_PARM_DESC(reserved_host_counters,
>> +		 "Partition the PMU into host and guest counters");
>> +
>> +u8 kvm_pmu_get_reserved_counters(void)
>> +{
>> +	return reserved_host_counters;
>> +}
>> +
>> +u8 kvm_pmu_hpmn(u8 nr_counters)
>> +{
>> +	if (reserved_host_counters >= nr_counters) {
>> +		if (this_cpu_has_cap(ARM64_HAS_HPMN0))
>> +			return 0;
>> +
>> +		return 1;
>> +	}
>> +
>> +	return nr_counters - reserved_host_counters;
>> +}
>> +
>> +void kvm_pmu_partition(struct arm_pmu *pmu)
>> +{
>> +	u8 nr_counters = *host_data_ptr(nr_event_counters);
>> +	u8 hpmn = kvm_pmu_hpmn(nr_counters);
>> +
>> +	if (hpmn < nr_counters) {
>> +		pmu->hpmn = hpmn;
>> +		pmu->partitioned = true;

> Looks like Rob's point about pmu->partitioned being duplicate data
> stands again. On the previous version you mentioned that saving it was
> to avoid reading PMCR.N, but now it's not based on PMCR.N anymore.

I will make it a function instead so the meaning of hpmn < nr_counters
is clear.
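
Something like this, roughly (just a sketch, reusing the same
host_data_ptr() lookup the partition code already does):

static inline bool kvm_pmu_is_partitioned(struct arm_pmu *pmu)
{
	return pmu->hpmn < *host_data_ptr(nr_event_counters);
}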
Re: [RFC PATCH v3 5/8] KVM: arm64: Introduce module param to partition the PMU
Posted by James Clark 8 months, 3 weeks ago

On 25/03/2025 6:32 pm, Colton Lewis wrote:
> Hi James,
> 
> Thanks for the review.
> 
> James Clark <james.clark@linaro.org> writes:
> 
>> On 13/02/2025 6:03 pm, Colton Lewis wrote:
>>> For PMUv3, the register MDCR_EL2.HPMN partitions the PMU counters
>>> into two ranges where counters 0..HPMN-1 are accessible by EL1 and, if
>>> allowed, EL0 while counters HPMN..N are only accessible by EL2.
> 
>>> Introduce a module parameter in KVM to set this register. The name
>>> reserved_host_counters reflects the intent to reserve some counters
>>> for the host so the guest may eventually be allowed direct access to a
>>> subset of PMU functionality for increased performance.
> 
>>> Track HPMN and whether the pmu is partitioned in struct arm_pmu
>>> because both KVM and the PMUv3 driver will need to know that to handle
>>> guests correctly.
> 
>>> Due to the difficulty this feature would create for the driver running
>>> at EL1 on the host, partitioning is only allowed in VHE mode. Working
>>> on nVHE mode would require a hypercall for every register access
>>> because the counters reserved for the host by HPMN are now only
>>> accessible to EL2.
> 
>>> The parameter is only configurable at boot time. Making the parameter
>>> configurable on a running system is dangerous due to the difficulty of
>>> knowing for sure no counters are in use anywhere so it is safe to
>>> reprogram HPMN.
> 
> 
>> Hi Colton,
> 
>> For some high level feedback for the RFC, it probably makes sense to
>> include the other half of the feature at the same time. I think there is
>> a risk that it requires something slightly different than what's here
>> and there ends up being some churn.
> 
> I agree. That's what I'm working on now. I just wanted an iteration or
> two in public so I'm not building on something that needs drastic change
> later.
> 
>> Other than that I think it looks ok apart from some minor code review 
>> nits.
> 
> Thank you
> 
>> I was also thinking about how BRBE interacts with this. Alex has done
>> some analysis that finds that it's difficult to use BRBE in guests with
>> virtualized counters due to the fact that BRBE freezes on any counter
>> overflow, rather than just guest ones. That leaves the guest with branch
>> blackout windows in the delay between a host counter overflowing and the
>> interrupt being taken and BRBE being restarted.
> 
>> But with HPMN, BRBE does allow freeze on overflow of only one partition
>> or the other (or both, but I don't think we'd want that) e.g.:
> 
>>    RNXCWF: If EL2 is implemented, a BRBE freeze event occurs when all of
>>    the following are true:
> 
>>    * BRBCR_EL1.FZP is 1.
>>    * Generation of Branch records is not paused.
>>    * PMOVSCLR_EL0[(MDCR_EL2.HPMN-1):0] is nonzero.
>>    * The PE is in a BRBE Non-prohibited region.
> 
>> Unfortunately that means we could only let guests use BRBE with a
>> partitioned PMU, which would massively reduce flexibility if hosts have
>> to lose counters just so the guest can use BRBE.
> 
>> I don't know if this is a stupid idea, but instead of having a fixed
>> number for the partition, wouldn't it be nice if we could trap and
>> increment HPMN on the first guest use of a counter, then decrement it on
>> guest exit depending on what's still in use? The host would always
>> assign its counters from the top down, and guests go bottom up if they
>> want PMU passthrough. Maybe it's too complicated or won't work for
>> various reasons, but because of BRBE the counter partitioning changes go
>> from an optimization to almost a necessity.
> 
> This is a cool idea that would enable useful things. I can think of a
> few potential problems.
> 
> 1. Partitioning will give guests direct access to some PMU counter
> registers. There is no reliable way for KVM to determine what is in use
> from that state. A counter that is disabled at guest exit might only be
> so temporarily, which could lead to a lot of thrashing allocating and
> deallocating counters.
> 
> 2. HPMN affects reads of PMCR_EL0.N, which is the standard way to
> determine how many counters there are. If HPMN starts as a low number,
> guests have no way of knowing there are more counters
> available. Dynamically changing the counters available could be
> confusing for guests.
> 

Yes I was expecting that PMCR would have to be trapped and N reported to 
be the number of physical counters rather than how many are in the guest 
partition.

> 3. If guests were aware they could write beyond HPMN and get the
> counters allocated to them, nothing stops them from writing at counter
> N and taking as many counters as possible to starve the host.
> 

Is that much different than how it is now with virtualized PMUs? As in, 
the guest can use all of the counters and the host's events will have to 
contend with them.

You can still have a module param, except it's more of a limit to the 
size of the partition rather than fixing it upfront. The default value 
would be the max number of counters, allowing the most flexibility for 
the common use case where it's unlikely that both host and guests are 
contending for all counters. But if you really want to make sure the 
host doesn't get starved, you can set it to a lower value.
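
Roughly (a sketch of that limit semantics with made-up names, where
"requested" is however many counters the guest has touched):

static u8 guest_partition_size(u8 requested, u8 nr_counters, u8 limit)
{
	/* Never hand out more than the hardware has or the limit allows. */
	return min3(requested, nr_counters, limit);
}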

All this does sound a bit like it could be done on top of the simple 
partitioning though. And it's mainly for making BRBE more accessible, 
though I'm not 100% convinced that the blackout windows are that big of a
problem. We could say BRBE may have some holes if the host happens to be 
using counters at the same time, and if you want to be certain of no 
holes, use a host with partitioned counters.

James

Re: [RFC PATCH v3 5/8] KVM: arm64: Introduce module param to partition the PMU
Posted by Oliver Upton 8 months, 3 weeks ago
On Wed, Mar 26, 2025 at 05:38:34PM +0000, James Clark wrote:
> On 25/03/2025 6:32 pm, Colton Lewis wrote:
> > > I don't know if this is a stupid idea, but instead of having a fixed
> > > number for the partition, wouldn't it be nice if we could trap and
> > > increment HPMN on the first guest use of a counter, then decrement it on
> > > guest exit depending on what's still in use? The host would always
> > > assign its counters from the top down, and guests go bottom up if they
> > > want PMU passthrough. Maybe it's too complicated or won't work for
> > > various reasons, but because of BRBE the counter partitioning changes go
> > > from an optimization to almost a necessity.
> > 
> > This is a cool idea that would enable useful things. I can think of a
> > few potential problems.
> > 
> > 1. Partitioning will give guests direct access to some PMU counter
> > registers. There is no reliable way for KVM to determine what is in use
> > from that state. A counter that is disabled at guest exit might only be
> > so temporarily, which could lead to a lot of thrashing allocating and
> > deallocating counters.

KVM must always have a reliable way to determine if the PMU is in use.
If there's any counter in the vPMU for which kvm_pmu_counter_is_enabled()
returns true, that would do the trick...

Generally speaking, I would like to see the guest/host context switch in
KVM modeled in a way similar to the debug registers, where the vPMU
registers are loaded onto hardware lazily if either:

  1) The above definition of an in-use PMU is satisfied

  2) The guest accessed a PMU register since the last vcpu_load()
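
A rough sketch of that "in use" test, assuming the existing
kvm_pmu_counter_is_enabled() and a kvm_vcpu_idx_to_pmc()-style accessor
from pmu-emul.c were made usable here:

static bool kvm_vcpu_pmu_in_use(struct kvm_vcpu *vcpu)
{
	int i;

	/* 32 covers the architectural event counters plus the cycle counter. */
	for (i = 0; i < 32; i++) {
		if (kvm_pmu_counter_is_enabled(kvm_vcpu_idx_to_pmc(vcpu, i)))
			return true;
	}

	return false;
}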

> > 2. HPMN affects reads of PMCR_EL0.N, which is the standard way to
> > determine how many counters there are. If HPMN starts as a low number,
> > guests have no way of knowing there are more counters
> > available. Dynamically changing the counters available could be
> > confusing for guests.
> > 
> 
> Yes I was expecting that PMCR would have to be trapped and N reported to be
> the number of physical counters rather than how many are in the guest
> partition.

I'm not sure this is aligned with the spirit of the feature.

Colton's aim is to minimize the overheads of trapping the PMU *and*
relying on the perf subsystem for event scheduling. To do dynamic
partitioning as you've described, KVM would need to unconditionally trap
the PMU registers so it can pack the guest counters into the guest
partition. We cannot assume the VM will allocate counters sequentially.

Dynamic counter allocation can be had with the existing PMU
implementation. The partitioned PMU is an alternative userspace can
select, not a replacement for what we already have.

Thanks,
Oliver
Re: [RFC PATCH v3 5/8] KVM: arm64: Introduce module param to partition the PMU
Posted by James Clark 8 months, 3 weeks ago

On 26/03/2025 8:40 pm, Oliver Upton wrote:
> On Wed, Mar 26, 2025 at 05:38:34PM +0000, James Clark wrote:
>> On 25/03/2025 6:32 pm, Colton Lewis wrote:
>>>> I don't know if this is a stupid idea, but instead of having a fixed
>>>> number for the partition, wouldn't it be nice if we could trap and
>>>> increment HPMN on the first guest use of a counter, then decrement it on
>>>> guest exit depending on what's still in use? The host would always
>>>> assign its counters from the top down, and guests go bottom up if they
>>>> want PMU passthrough. Maybe it's too complicated or won't work for
>>>> various reasons, but because of BRBE the counter partitioning changes go
>>>> from an optimization to almost a necessity.
>>>
>>> This is a cool idea that would enable useful things. I can think of a
>>> few potential problems.
>>>
>>> 1. Partitioning will give guests direct access to some PMU counter
>>> registers. There is no reliable way for KVM to determine what is in use
>>> from that state. A counter that is disabled at guest exit might only be
>>> so temporarily, which could lead to a lot of thrashing allocating and
>>> deallocating counters.
> 
> KVM must always have a reliable way to determine if the PMU is in use.
> If there's any counter in the vPMU for which kvm_pmu_counter_is_enabled()
> returns true, that would do the trick...
> 
> Generally speaking, I would like to see the guest/host context switch in
> KVM modeled in a way similar to the debug registers, where the vPMU
> registers are loaded onto hardware lazily if either:
> 
>    1) The above definition of an in-use PMU is satisfied
> 
>    2) The guest accessed a PMU register since the last vcpu_load()
> 
>>> 2. HPMN affects reads of PMCR_EL0.N, which is the standard way to
>>> determine how many counters there are. If HPMN starts as a low number,
>>> guests have no way of knowing there are more counters
>>> available. Dynamically changing the counters available could be
>>> confusing for guests.
>>>
>>
>> Yes I was expecting that PMCR would have to be trapped and N reported to be
>> the number of physical counters rather than how many are in the guest
>> partition.
> 
> I'm not sure this is aligned with the spirit of the feature.
> 
> Colton's aim is to minimize the overheads of trapping the PMU *and*
> relying on the perf subsystem for event scheduling. To do dynamic
> partitioning as you've described, KVM would need to unconditionally trap
> the PMU registers so it can pack the guest counters into the guest
> partition. We cannot assume the VM will allocate counters sequentially.

Yeah I agree, requiring cooperation from the guest probably makes it a 
non-starter.

> 
> Dynamic counter allocation can be had with the existing PMU
> implementation. The partitioned PMU is an alternative userspace can
> select, not a replacement for what we already have.
> 
> Thanks,
> Oliver


It's just a shame that it doesn't look like there's a way to make BRBE 
work properly in guests with the existing implementation. Maybe we're 
stuck with only allowing it in a partition for now.

Thanks
James
Re: [RFC PATCH v3 5/8] KVM: arm64: Introduce module param to partition the PMU
Posted by Colton Lewis 10 months, 1 week ago
Colton Lewis <coltonlewis@google.com> writes:

> For PMUv3, the register MDCR_EL2.HPMN partitions the PMU counters
> into two ranges where counters 0..HPMN-1 are accessible by EL1 and, if
> allowed, EL0 while counters HPMN..N are only accessible by EL2.

> Introduce a module parameter in KVM to set this register. The name
> reserved_host_counters reflects the intent to reserve some counters
> for the host so the guest may eventually be allowed direct access to a
> subset of PMU functionality for increased performance.

> Track HPMN and whether the pmu is partitioned in struct arm_pmu
> because both KVM and the PMUv3 driver will need to know that to handle
> guests correctly.

> Due to the difficulty this feature would create for the driver running
> at EL1 on the host, partitioning is only allowed in VHE mode. Working
> on nVHE mode would require a hypercall for every register access
> because the counters reserved for the host by HPMN are now only
> accessible to EL2.

> The parameter is only configurable at boot time. Making the parameter
> configurable on a running system is dangerous due to the difficulty of
> knowing for sure no counters are in use anywhere so it is safe to
> reprogram HPMN.

> Signed-off-by: Colton Lewis <coltonlewis@google.com>
> ---
>   arch/arm64/include/asm/kvm_pmu.h |  4 +++
>   arch/arm64/kvm/Makefile          |  2 +-
>   arch/arm64/kvm/debug.c           |  9 ++++--
>   arch/arm64/kvm/pmu-part.c        | 47 ++++++++++++++++++++++++++++++++
>   arch/arm64/kvm/pmu.c             |  2 ++
>   include/linux/perf/arm_pmu.h     |  2 ++
>   6 files changed, 62 insertions(+), 4 deletions(-)
>   create mode 100644 arch/arm64/kvm/pmu-part.c

> diff --git a/arch/arm64/include/asm/kvm_pmu.h  
> b/arch/arm64/include/asm/kvm_pmu.h
> index 613cddbdbdd8..174b7f376d95 100644
> --- a/arch/arm64/include/asm/kvm_pmu.h
> +++ b/arch/arm64/include/asm/kvm_pmu.h
> @@ -22,6 +22,10 @@ bool kvm_set_pmuserenr(u64 val);
>   void kvm_vcpu_pmu_resync_el0(void);
>   void kvm_host_pmu_init(struct arm_pmu *pmu);

> +u8 kvm_pmu_get_reserved_counters(void);
> +u8 kvm_pmu_hpmn(u8 nr_counters);
> +void kvm_pmu_partition(struct arm_pmu *pmu);
> +
>   #else

>   static inline void kvm_set_pmu_events(u64 set, struct perf_event_attr  
> *attr) {}
> diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
> index 3cf7adb2b503..065a6b804c84 100644
> --- a/arch/arm64/kvm/Makefile
> +++ b/arch/arm64/kvm/Makefile
> @@ -25,7 +25,7 @@ kvm-y += arm.o mmu.o mmio.o psci.o hypercalls.o  
> pvtime.o \
>   	 vgic/vgic-mmio-v3.o vgic/vgic-kvm-device.o \
>   	 vgic/vgic-its.o vgic/vgic-debug.o

> -kvm-$(CONFIG_HW_PERF_EVENTS)  += pmu-emul.o pmu.o
> +kvm-$(CONFIG_HW_PERF_EVENTS)  += pmu-emul.o pmu-part.o pmu.o
>   kvm-$(CONFIG_ARM64_PTR_AUTH)  += pauth.o
>   kvm-$(CONFIG_PTDUMP_STAGE2_DEBUGFS) += ptdump.o

> diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c
> index 7fb1d9e7180f..b5ac5a213877 100644
> --- a/arch/arm64/kvm/debug.c
> +++ b/arch/arm64/kvm/debug.c
> @@ -31,15 +31,18 @@
>    */
>   static void kvm_arm_setup_mdcr_el2(struct kvm_vcpu *vcpu)
>   {
> +	u8 counters = *host_data_ptr(nr_event_counters);
> +	u8 hpmn = kvm_pmu_hpmn(counters);
> +
>   	preempt_disable();

>   	/*
>   	 * This also clears MDCR_EL2_E2PB_MASK and MDCR_EL2_E2TB_MASK
>   	 * to disable guest access to the profiling and trace buffers
>   	 */
> -	vcpu->arch.mdcr_el2 = FIELD_PREP(MDCR_EL2_HPMN,
> -					 *host_data_ptr(nr_event_counters));
> -	vcpu->arch.mdcr_el2 |= (MDCR_EL2_TPM |
> +	vcpu->arch.mdcr_el2 = FIELD_PREP(MDCR_EL2_HPMN, hpmn);
> +	vcpu->arch.mdcr_el2 |= (MDCR_EL2_HPMD |
> +				MDCR_EL2_TPM |
>   				MDCR_EL2_TPMS |
>   				MDCR_EL2_TTRF |
>   				MDCR_EL2_TPMCR |
> diff --git a/arch/arm64/kvm/pmu-part.c b/arch/arm64/kvm/pmu-part.c
> new file mode 100644
> index 000000000000..e74fecc67e37
> --- /dev/null
> +++ b/arch/arm64/kvm/pmu-part.c
> @@ -0,0 +1,47 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Copyright (C) 2025 Google LLC
> + * Author: Colton Lewis <coltonlewis@google.com>
> + */
> +
> +#include <linux/kvm_host.h>
> +#include <linux/perf/arm_pmu.h>
> +
> +#include <asm/kvm_pmu.h>
> +
> +static u8 reserved_host_counters __read_mostly;
> +
> +module_param(reserved_host_counters, byte, 0);
> +MODULE_PARM_DESC(reserved_host_counters,
> +		 "Partition the PMU into host and guest counters");
> +
> +u8 kvm_pmu_get_reserved_counters(void)
> +{
> +	return reserved_host_counters;
> +}
> +
> +u8 kvm_pmu_hpmn(u8 nr_counters)
> +{
> +	if (reserved_host_counters >= nr_counters) {
> +		if (this_cpu_has_cap(ARM64_HAS_HPMN0))
> +			return 0;
> +
> +		return 1;
> +	}
> +
> +	return nr_counters - reserved_host_counters;
> +}
> +
> +void kvm_pmu_partition(struct arm_pmu *pmu)
> +{
> +	u8 nr_counters = *host_data_ptr(nr_event_counters);
> +	u8 hpmn = kvm_pmu_hpmn(nr_counters);
> +
> +	if (hpmn < nr_counters) {
> +		pmu->hpmn = hpmn;
> +		pmu->partitioned = true;
> +	} else {
> +		pmu->hpmn = nr_counters;
> +		pmu->partitioned = false;
> +	}
> +}

There should be a VHE check in here. I thought I wouldn't need it after
moving MDCR_EL2 writes out of the driver, but I just remembered there are
two spots in patch 7 where I still need to write that register.
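
i.e. roughly (untested sketch of the guard, on top of the function as
posted):

void kvm_pmu_partition(struct arm_pmu *pmu)
{
	u8 nr_counters = *host_data_ptr(nr_event_counters);
	u8 hpmn = kvm_pmu_hpmn(nr_counters);

	/* Partitioning is only supported with VHE. */
	if (has_vhe() && hpmn < nr_counters) {
		pmu->hpmn = hpmn;
		pmu->partitioned = true;
	} else {
		pmu->hpmn = nr_counters;
		pmu->partitioned = false;
	}
}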

> diff --git a/arch/arm64/kvm/pmu.c b/arch/arm64/kvm/pmu.c
> index 85b5cb432c4f..7169c1a24dd6 100644
> --- a/arch/arm64/kvm/pmu.c
> +++ b/arch/arm64/kvm/pmu.c
> @@ -243,6 +243,8 @@ void kvm_host_pmu_init(struct arm_pmu *pmu)
>   	entry->arm_pmu = pmu;
>   	list_add_tail(&entry->entry, &arm_pmus);

> +	kvm_pmu_partition(pmu);
> +
>   	if (list_is_singular(&arm_pmus))
>   		static_branch_enable(&kvm_arm_pmu_available);

> diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h
> index 35c3a85bee43..ee4fc2e26bff 100644
> --- a/include/linux/perf/arm_pmu.h
> +++ b/include/linux/perf/arm_pmu.h
> @@ -125,6 +125,8 @@ struct arm_pmu {

>   	/* Only to be used by ACPI probing code */
>   	unsigned long acpi_cpuid;
> +	u8		hpmn; /* MDCR_EL2.HPMN: counter partition pivot */
> +	bool		partitioned;
>   };

>   #define to_arm_pmu(p) (container_of(p, struct arm_pmu, pmu))
> --
> 2.48.1.601.g30ceb7b040-goog