[PATCH 06/17] KVM: arm64: Introduce method to partition the PMU

Colton Lewis posted 17 patches 6 months, 2 weeks ago
There is a newer version of this series
[PATCH 06/17] KVM: arm64: Introduce method to partition the PMU
Posted by Colton Lewis 6 months, 2 weeks ago
For PMUv3, the register field MDCR_EL2.HPMN partitiones the PMU
counters into two ranges where counters 0..HPMN-1 are accessible by
EL1 and, if allowed, EL0 while counters HPMN..N are only accessible by
EL2.

Track HPMN and in a variable in struct arm_pmu because both KVM and
the PMUv3 driver will need to know that to handle guests
correctly. Introduce the function kvm_pmu_partition() to set this
variable and modify the PMU driver's cntr_mask of available counters
to exclude the counters being reserved for the guest. Finally, make
sure HPMN is set with this value when setting up the MDCR_EL2
register.

Create a module parameter reserved_host_counters to set a default
value. A more flexible uAPI will be added in a later commit.

Due to the difficulty this feature would create for the driver running
at EL1 on the host, partitioning is only allowed in VHE mode. Working
on nVHE mode would require a hypercall for every counter access in the
driver because the counters reserved for the host by HPMN are only
accessible to EL2.

Signed-off-by: Colton Lewis <coltonlewis@google.com>
---
 arch/arm64/include/asm/kvm_pmu.h |  19 +++++
 arch/arm64/kvm/Makefile          |   2 +-
 arch/arm64/kvm/debug.c           |   9 ++-
 arch/arm64/kvm/pmu-part.c        | 117 +++++++++++++++++++++++++++++++
 arch/arm64/kvm/pmu.c             |  13 ++++
 include/linux/perf/arm_pmu.h     |   1 +
 6 files changed, 157 insertions(+), 4 deletions(-)
 create mode 100644 arch/arm64/kvm/pmu-part.c

diff --git a/arch/arm64/include/asm/kvm_pmu.h b/arch/arm64/include/asm/kvm_pmu.h
index 613cddbdbdd8..83b81e7829bf 100644
--- a/arch/arm64/include/asm/kvm_pmu.h
+++ b/arch/arm64/include/asm/kvm_pmu.h
@@ -22,6 +22,10 @@ bool kvm_set_pmuserenr(u64 val);
 void kvm_vcpu_pmu_resync_el0(void);
 void kvm_host_pmu_init(struct arm_pmu *pmu);
 
+bool kvm_pmu_partition_supported(void);
+u8 kvm_pmu_hpmn(u8 host_counters);
+int kvm_pmu_partition(struct arm_pmu *pmu, u8 host_counters);
+
 #else
 
 static inline void kvm_set_pmu_events(u64 set, struct perf_event_attr *attr) {}
@@ -33,6 +37,21 @@ static inline bool kvm_set_pmuserenr(u64 val)
 static inline void kvm_vcpu_pmu_resync_el0(void) {}
 static inline void kvm_host_pmu_init(struct arm_pmu *pmu) {}
 
+static inline bool kvm_pmu_partiton_supported(void)
+{
+	return false;
+}
+
+static inline u8 kvm_pmu_hpmn(u8 nr_counters)
+{
+	return -1;
+}
+
+static inline int kvm_pmu_partition(struct arm_pmu *pmu)
+{
+	return -EPERM;
+}
+
 #endif
 
 #endif
diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
index 7c329e01c557..8161dfb123d7 100644
--- a/arch/arm64/kvm/Makefile
+++ b/arch/arm64/kvm/Makefile
@@ -25,7 +25,7 @@ kvm-y += arm.o mmu.o mmio.o psci.o hypercalls.o pvtime.o \
 	 vgic/vgic-mmio-v3.o vgic/vgic-kvm-device.o \
 	 vgic/vgic-its.o vgic/vgic-debug.o vgic/vgic-v3-nested.o
 
-kvm-$(CONFIG_HW_PERF_EVENTS)  += pmu-emul.o pmu.o
+kvm-$(CONFIG_HW_PERF_EVENTS)  += pmu-emul.o pmu-part.o pmu.o
 kvm-$(CONFIG_ARM64_PTR_AUTH)  += pauth.o
 kvm-$(CONFIG_PTDUMP_STAGE2_DEBUGFS) += ptdump.o
 
diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c
index 7fb1d9e7180f..41746a498a45 100644
--- a/arch/arm64/kvm/debug.c
+++ b/arch/arm64/kvm/debug.c
@@ -9,6 +9,7 @@
 
 #include <linux/kvm_host.h>
 #include <linux/hw_breakpoint.h>
+#include <linux/perf/arm_pmu.h>
 #include <linux/perf/arm_pmuv3.h>
 
 #include <asm/debug-monitors.h>
@@ -31,15 +32,17 @@
  */
 static void kvm_arm_setup_mdcr_el2(struct kvm_vcpu *vcpu)
 {
+	u8 hpmn = vcpu->kvm->arch.arm_pmu->hpmn;
+
 	preempt_disable();
 
 	/*
 	 * This also clears MDCR_EL2_E2PB_MASK and MDCR_EL2_E2TB_MASK
 	 * to disable guest access to the profiling and trace buffers
 	 */
-	vcpu->arch.mdcr_el2 = FIELD_PREP(MDCR_EL2_HPMN,
-					 *host_data_ptr(nr_event_counters));
-	vcpu->arch.mdcr_el2 |= (MDCR_EL2_TPM |
+	vcpu->arch.mdcr_el2 = FIELD_PREP(MDCR_EL2_HPMN, hpmn);
+	vcpu->arch.mdcr_el2 |= (MDCR_EL2_HPMD |
+				MDCR_EL2_TPM |
 				MDCR_EL2_TPMS |
 				MDCR_EL2_TTRF |
 				MDCR_EL2_TPMCR |
diff --git a/arch/arm64/kvm/pmu-part.c b/arch/arm64/kvm/pmu-part.c
new file mode 100644
index 000000000000..7252a58f085c
--- /dev/null
+++ b/arch/arm64/kvm/pmu-part.c
@@ -0,0 +1,117 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2025 Google LLC
+ * Author: Colton Lewis <coltonlewis@google.com>
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/perf/arm_pmu.h>
+#include <linux/perf/arm_pmuv3.h>
+
+#include <asm/kvm_pmu.h>
+#include <asm/arm_pmuv3.h>
+
+/**
+ * kvm_pmu_reservation_is_valid() - Determine if reservation is allowed
+ * @host_counters: Number of host counters to reserve
+ *
+ * Determine if the number of host counters in the argument is
+ * allowed. It is allowed if it will produce a valid value for
+ * register field MDCR_EL2.HPMN.
+ *
+ * Return: True if reservation allowed, false otherwise
+ */
+static bool kvm_pmu_reservation_is_valid(u8 host_counters)
+{
+	u8 nr_counters = *host_data_ptr(nr_event_counters);
+
+	return host_counters < nr_counters ||
+		(host_counters == nr_counters
+		 && cpus_have_final_cap(ARM64_HAS_HPMN0));
+}
+
+/**
+ * kvm_pmu_hpmn() - Compute HPMN value
+ * @host_counters: Number of host counters to reserve
+ *
+ * This function computes the value of HPMN, the partition pivot
+ * value, such that counters 0..HPMN are reserved for the guest and
+ * counters HPMN..N are reserved for the host.
+ *
+ * If the requested @host_counters would create an invalid partition,
+ * return the value of HPMN that creates no partition.
+ *
+ * Return: Value of HPMN
+ */
+u8 kvm_pmu_hpmn(u8 host_counters)
+{
+	u8 nr_counters = *host_data_ptr(nr_event_counters);
+
+	if (likely(kvm_pmu_reservation_is_valid(host_counters)))
+		return nr_counters - host_counters;
+	else
+		return nr_counters;
+}
+
+/**
+ * kvm_pmu_partition_supported() - Determine if partitioning is possible
+ *
+ * Partitioning is only supported in VHE mode where we have PMUv3 and
+ * Fine Grain Traps (FGT).
+ *
+ * Return: True if partitioning is possible, false otherwise
+ */
+bool kvm_pmu_partition_supported(void)
+{
+	return has_vhe()
+		&& pmuv3_implemented(kvm_arm_pmu_get_pmuver_limit())
+		&& cpus_have_final_cap(ARM64_HAS_FGT);
+}
+
+/**
+ * kvm_pmu_partition() - Partition the PMU
+ * @pmu: Pointer to pmu being partitioned
+ * @host_counters: Number of host counters to reserve
+ *
+ * Partition the given PMU by taking a number of host counters to
+ * reserve and, if it is a valid reservation, recording the
+ * corresponding HPMN value in the hpmn field of the PMU and clearing
+ * the guest-reserved counters from the counter mask.
+ *
+ * Passing 0 for @host_counters has the effect of disabling partitioning.
+ *
+ * Return: 0 on success, -ERROR otherwise
+ */
+int kvm_pmu_partition(struct arm_pmu *pmu, u8 host_counters)
+{
+	u8 nr_counters;
+	u8 hpmn;
+
+	if (!kvm_pmu_reservation_is_valid(host_counters))
+		return -EINVAL;
+
+	nr_counters = *host_data_ptr(nr_event_counters);
+	hpmn = kvm_pmu_hpmn(host_counters);
+
+	if (hpmn < nr_counters) {
+		pmu->hpmn = hpmn;
+		/* Inform host driver of available counters */
+		bitmap_clear(pmu->cntr_mask, 0, hpmn);
+		bitmap_set(pmu->cntr_mask, hpmn, nr_counters);
+		clear_bit(ARMV8_PMU_CYCLE_IDX, pmu->cntr_mask);
+		if (pmuv3_has_icntr())
+			clear_bit(ARMV8_PMU_INSTR_IDX, pmu->cntr_mask);
+
+		kvm_debug("Partitioned PMU with HPMN %u", hpmn);
+	} else {
+		pmu->hpmn = nr_counters;
+		bitmap_set(pmu->cntr_mask, 0, nr_counters);
+		set_bit(ARMV8_PMU_CYCLE_IDX, pmu->cntr_mask);
+		if (pmuv3_has_icntr())
+			set_bit(ARMV8_PMU_INSTR_IDX, pmu->cntr_mask);
+
+		kvm_debug("Unpartitioned PMU");
+	}
+
+	return 0;
+}
diff --git a/arch/arm64/kvm/pmu.c b/arch/arm64/kvm/pmu.c
index 4f0152e67ff3..2dcfac3ea9c6 100644
--- a/arch/arm64/kvm/pmu.c
+++ b/arch/arm64/kvm/pmu.c
@@ -15,6 +15,12 @@ static LIST_HEAD(arm_pmus);
 static DEFINE_MUTEX(arm_pmus_lock);
 static DEFINE_PER_CPU(struct kvm_pmu_events, kvm_pmu_events);
 
+static u8 reserved_host_counters __read_mostly;
+
+module_param(reserved_host_counters, byte, 0);
+MODULE_PARM_DESC(reserved_host_counters,
+		 "Partition the PMU into host and guest counters");
+
 #define kvm_arm_pmu_irq_initialized(v)	((v)->arch.pmu.irq_num >= VGIC_NR_SGIS)
 
 bool kvm_supports_guest_pmuv3(void)
@@ -239,6 +245,13 @@ void kvm_host_pmu_init(struct arm_pmu *pmu)
 	if (!pmuv3_implemented(kvm_arm_pmu_get_pmuver_limit()))
 		return;
 
+	if (reserved_host_counters) {
+		if (kvm_pmu_partition_supported())
+			WARN_ON(kvm_pmu_partition(pmu, reserved_host_counters));
+		else
+			kvm_err("PMU Partition is not supported");
+	}
+
 	guard(mutex)(&arm_pmus_lock);
 
 	entry = kmalloc(sizeof(*entry), GFP_KERNEL);
diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h
index 1de206b09616..3843d66b7328 100644
--- a/include/linux/perf/arm_pmu.h
+++ b/include/linux/perf/arm_pmu.h
@@ -130,6 +130,7 @@ struct arm_pmu {
 
 	/* Only to be used by ACPI probing code */
 	unsigned long acpi_cpuid;
+	u8		hpmn; /* MDCR_EL2.HPMN: counter partition pivot */
 };
 
 #define to_arm_pmu(p) (container_of(p, struct arm_pmu, pmu))
-- 
2.49.0.1204.g71687c7c1d-goog
Re: [PATCH 06/17] KVM: arm64: Introduce method to partition the PMU
Posted by Oliver Upton 6 months, 2 weeks ago
On Mon, Jun 02, 2025 at 07:26:51PM +0000, Colton Lewis wrote:
>  static void kvm_arm_setup_mdcr_el2(struct kvm_vcpu *vcpu)
>  {
> +	u8 hpmn = vcpu->kvm->arch.arm_pmu->hpmn;
> +
>  	preempt_disable();
>  
>  	/*
>  	 * This also clears MDCR_EL2_E2PB_MASK and MDCR_EL2_E2TB_MASK
>  	 * to disable guest access to the profiling and trace buffers
>  	 */
> -	vcpu->arch.mdcr_el2 = FIELD_PREP(MDCR_EL2_HPMN,
> -					 *host_data_ptr(nr_event_counters));
> -	vcpu->arch.mdcr_el2 |= (MDCR_EL2_TPM |
> +	vcpu->arch.mdcr_el2 = FIELD_PREP(MDCR_EL2_HPMN, hpmn);
> +	vcpu->arch.mdcr_el2 |= (MDCR_EL2_HPMD |
> +				MDCR_EL2_TPM |

This isn't safe, as there's no guarantee that kvm_arch::arm_pmu is
pointing that the PMU for this CPU. KVM needs to derive HPMN from some
per-CPU state, not anything tied to the VM/vCPU.

> +/**
> + * kvm_pmu_partition() - Partition the PMU
> + * @pmu: Pointer to pmu being partitioned
> + * @host_counters: Number of host counters to reserve
> + *
> + * Partition the given PMU by taking a number of host counters to
> + * reserve and, if it is a valid reservation, recording the
> + * corresponding HPMN value in the hpmn field of the PMU and clearing
> + * the guest-reserved counters from the counter mask.
> + *
> + * Passing 0 for @host_counters has the effect of disabling partitioning.
> + *
> + * Return: 0 on success, -ERROR otherwise
> + */
> +int kvm_pmu_partition(struct arm_pmu *pmu, u8 host_counters)
> +{
> +	u8 nr_counters;
> +	u8 hpmn;
> +
> +	if (!kvm_pmu_reservation_is_valid(host_counters))
> +		return -EINVAL;
> +
> +	nr_counters = *host_data_ptr(nr_event_counters);
> +	hpmn = kvm_pmu_hpmn(host_counters);
> +
> +	if (hpmn < nr_counters) {
> +		pmu->hpmn = hpmn;
> +		/* Inform host driver of available counters */
> +		bitmap_clear(pmu->cntr_mask, 0, hpmn);
> +		bitmap_set(pmu->cntr_mask, hpmn, nr_counters);
> +		clear_bit(ARMV8_PMU_CYCLE_IDX, pmu->cntr_mask);
> +		if (pmuv3_has_icntr())
> +			clear_bit(ARMV8_PMU_INSTR_IDX, pmu->cntr_mask);
> +
> +		kvm_debug("Partitioned PMU with HPMN %u", hpmn);
> +	} else {
> +		pmu->hpmn = nr_counters;
> +		bitmap_set(pmu->cntr_mask, 0, nr_counters);
> +		set_bit(ARMV8_PMU_CYCLE_IDX, pmu->cntr_mask);
> +		if (pmuv3_has_icntr())
> +			set_bit(ARMV8_PMU_INSTR_IDX, pmu->cntr_mask);
> +
> +		kvm_debug("Unpartitioned PMU");
> +	}
> +
> +	return 0;
> +}

Hmm... Just in terms of code organization I'm not sure I like having KVM
twiddling with *host* support for PMUv3. Feels like the ARM PMU driver
should own partitioning and KVM just takes what it can get.

> @@ -239,6 +245,13 @@ void kvm_host_pmu_init(struct arm_pmu *pmu)
>  	if (!pmuv3_implemented(kvm_arm_pmu_get_pmuver_limit()))
>  		return;
>  
> +	if (reserved_host_counters) {
> +		if (kvm_pmu_partition_supported())
> +			WARN_ON(kvm_pmu_partition(pmu, reserved_host_counters));
> +		else
> +			kvm_err("PMU Partition is not supported");
> +	}
> +

Hasn't the ARM PMU been registered with perf at this point? Surely the
driver wouldn't be very pleased with us ripping counters out from under
its feet.

Thanks,
Oliver
Re: [PATCH 06/17] KVM: arm64: Introduce method to partition the PMU
Posted by Colton Lewis 6 months, 2 weeks ago
Oliver Upton <oliver.upton@linux.dev> writes:

> On Mon, Jun 02, 2025 at 07:26:51PM +0000, Colton Lewis wrote:
>>   static void kvm_arm_setup_mdcr_el2(struct kvm_vcpu *vcpu)
>>   {
>> +	u8 hpmn = vcpu->kvm->arch.arm_pmu->hpmn;
>> +
>>   	preempt_disable();

>>   	/*
>>   	 * This also clears MDCR_EL2_E2PB_MASK and MDCR_EL2_E2TB_MASK
>>   	 * to disable guest access to the profiling and trace buffers
>>   	 */
>> -	vcpu->arch.mdcr_el2 = FIELD_PREP(MDCR_EL2_HPMN,
>> -					 *host_data_ptr(nr_event_counters));
>> -	vcpu->arch.mdcr_el2 |= (MDCR_EL2_TPM |
>> +	vcpu->arch.mdcr_el2 = FIELD_PREP(MDCR_EL2_HPMN, hpmn);
>> +	vcpu->arch.mdcr_el2 |= (MDCR_EL2_HPMD |
>> +				MDCR_EL2_TPM |

> This isn't safe, as there's no guarantee that kvm_arch::arm_pmu is
> pointing that the PMU for this CPU. KVM needs to derive HPMN from some
> per-CPU state, not anything tied to the VM/vCPU.

I'm confused. Isn't this function preparing to run the vCPU on this
CPU? Why would it be pointing at a different PMU?

And HPMN is something that we only want set when running a vCPU, so
there isn't any per-CPU state saying it should be anything but the
default value (number of counters) outside that context.

Unless you just mean I should check the number of counters again and
make sure HPMN is not an invalid value.

>> +/**
>> + * kvm_pmu_partition() - Partition the PMU
>> + * @pmu: Pointer to pmu being partitioned
>> + * @host_counters: Number of host counters to reserve
>> + *
>> + * Partition the given PMU by taking a number of host counters to
>> + * reserve and, if it is a valid reservation, recording the
>> + * corresponding HPMN value in the hpmn field of the PMU and clearing
>> + * the guest-reserved counters from the counter mask.
>> + *
>> + * Passing 0 for @host_counters has the effect of disabling  
>> partitioning.
>> + *
>> + * Return: 0 on success, -ERROR otherwise
>> + */
>> +int kvm_pmu_partition(struct arm_pmu *pmu, u8 host_counters)
>> +{
>> +	u8 nr_counters;
>> +	u8 hpmn;
>> +
>> +	if (!kvm_pmu_reservation_is_valid(host_counters))
>> +		return -EINVAL;
>> +
>> +	nr_counters = *host_data_ptr(nr_event_counters);
>> +	hpmn = kvm_pmu_hpmn(host_counters);
>> +
>> +	if (hpmn < nr_counters) {
>> +		pmu->hpmn = hpmn;
>> +		/* Inform host driver of available counters */
>> +		bitmap_clear(pmu->cntr_mask, 0, hpmn);
>> +		bitmap_set(pmu->cntr_mask, hpmn, nr_counters);
>> +		clear_bit(ARMV8_PMU_CYCLE_IDX, pmu->cntr_mask);
>> +		if (pmuv3_has_icntr())
>> +			clear_bit(ARMV8_PMU_INSTR_IDX, pmu->cntr_mask);
>> +
>> +		kvm_debug("Partitioned PMU with HPMN %u", hpmn);
>> +	} else {
>> +		pmu->hpmn = nr_counters;
>> +		bitmap_set(pmu->cntr_mask, 0, nr_counters);
>> +		set_bit(ARMV8_PMU_CYCLE_IDX, pmu->cntr_mask);
>> +		if (pmuv3_has_icntr())
>> +			set_bit(ARMV8_PMU_INSTR_IDX, pmu->cntr_mask);
>> +
>> +		kvm_debug("Unpartitioned PMU");
>> +	}
>> +
>> +	return 0;
>> +}

> Hmm... Just in terms of code organization I'm not sure I like having KVM
> twiddling with *host* support for PMUv3. Feels like the ARM PMU driver
> should own partitioning and KVM just takes what it can get.

Okay. I can move the code.

>> @@ -239,6 +245,13 @@ void kvm_host_pmu_init(struct arm_pmu *pmu)
>>   	if (!pmuv3_implemented(kvm_arm_pmu_get_pmuver_limit()))
>>   		return;

>> +	if (reserved_host_counters) {
>> +		if (kvm_pmu_partition_supported())
>> +			WARN_ON(kvm_pmu_partition(pmu, reserved_host_counters));
>> +		else
>> +			kvm_err("PMU Partition is not supported");
>> +	}
>> +

> Hasn't the ARM PMU been registered with perf at this point? Surely the
> driver wouldn't be very pleased with us ripping counters out from under
> its feet.

AFAICT nothing in perf registration cares about the number of counters
the PMU has. The PMUv3 driver tracks its own available counters through
cntr_mask and I modify that during partition.

Since this is still initialization of the PMU, I don't believe anything
has had a chance to use a counter yet that will be ripped away.

Aesthetically It makes since to change this if I move the partitioning
code to the PMUv3 driver, but I think it's inconsequential to the
function.
Re: [PATCH 06/17] KVM: arm64: Introduce method to partition the PMU
Posted by Oliver Upton 6 months, 2 weeks ago
On Tue, Jun 03, 2025 at 09:32:41PM +0000, Colton Lewis wrote:
> Oliver Upton <oliver.upton@linux.dev> writes:
> 
> > On Mon, Jun 02, 2025 at 07:26:51PM +0000, Colton Lewis wrote:
> > >   static void kvm_arm_setup_mdcr_el2(struct kvm_vcpu *vcpu)
> > >   {
> > > +	u8 hpmn = vcpu->kvm->arch.arm_pmu->hpmn;
> > > +
> > >   	preempt_disable();
> 
> > >   	/*
> > >   	 * This also clears MDCR_EL2_E2PB_MASK and MDCR_EL2_E2TB_MASK
> > >   	 * to disable guest access to the profiling and trace buffers
> > >   	 */
> > > -	vcpu->arch.mdcr_el2 = FIELD_PREP(MDCR_EL2_HPMN,
> > > -					 *host_data_ptr(nr_event_counters));
> > > -	vcpu->arch.mdcr_el2 |= (MDCR_EL2_TPM |
> > > +	vcpu->arch.mdcr_el2 = FIELD_PREP(MDCR_EL2_HPMN, hpmn);
> > > +	vcpu->arch.mdcr_el2 |= (MDCR_EL2_HPMD |
> > > +				MDCR_EL2_TPM |
> 
> > This isn't safe, as there's no guarantee that kvm_arch::arm_pmu is
> > pointing that the PMU for this CPU. KVM needs to derive HPMN from some
> > per-CPU state, not anything tied to the VM/vCPU.
> 
> I'm confused. Isn't this function preparing to run the vCPU on this
> CPU? Why would it be pointing at a different PMU?

Because arm64 is a silly ecosystem and system designers can glue
together heterogenous CPU implementations. The arm_pmu that KVM is
pointing at might only match a subset of CPUs, but vCPUs migrate at the
whim of the scheduler (and userspace).

> And HPMN is something that we only want set when running a vCPU, so
> there isn't any per-CPU state saying it should be anything but the
> default value (number of counters) outside that context.
> 
> Unless you just mean I should check the number of counters again and
> make sure HPMN is not an invalid value.

As you've implemented it the host cannot schedule events in the guest
range of counters regardless of context. You need to reconcile that
global limit with the desires of the VMM on how many counters it wants
presented to this particular guest.

> > > +/**
> > > + * kvm_pmu_partition() - Partition the PMU
> > > + * @pmu: Pointer to pmu being partitioned
> > > + * @host_counters: Number of host counters to reserve
> > > + *
> > > + * Partition the given PMU by taking a number of host counters to
> > > + * reserve and, if it is a valid reservation, recording the
> > > + * corresponding HPMN value in the hpmn field of the PMU and clearing
> > > + * the guest-reserved counters from the counter mask.
> > > + *
> > > + * Passing 0 for @host_counters has the effect of disabling
> > > partitioning.
> > > + *
> > > + * Return: 0 on success, -ERROR otherwise
> > > + */
> > > +int kvm_pmu_partition(struct arm_pmu *pmu, u8 host_counters)
> > > +{
> > > +	u8 nr_counters;
> > > +	u8 hpmn;
> > > +
> > > +	if (!kvm_pmu_reservation_is_valid(host_counters))
> > > +		return -EINVAL;
> > > +
> > > +	nr_counters = *host_data_ptr(nr_event_counters);
> > > +	hpmn = kvm_pmu_hpmn(host_counters);
> > > +
> > > +	if (hpmn < nr_counters) {
> > > +		pmu->hpmn = hpmn;
> > > +		/* Inform host driver of available counters */
> > > +		bitmap_clear(pmu->cntr_mask, 0, hpmn);
> > > +		bitmap_set(pmu->cntr_mask, hpmn, nr_counters);
> > > +		clear_bit(ARMV8_PMU_CYCLE_IDX, pmu->cntr_mask);
> > > +		if (pmuv3_has_icntr())
> > > +			clear_bit(ARMV8_PMU_INSTR_IDX, pmu->cntr_mask);
> > > +
> > > +		kvm_debug("Partitioned PMU with HPMN %u", hpmn);
> > > +	} else {
> > > +		pmu->hpmn = nr_counters;
> > > +		bitmap_set(pmu->cntr_mask, 0, nr_counters);
> > > +		set_bit(ARMV8_PMU_CYCLE_IDX, pmu->cntr_mask);
> > > +		if (pmuv3_has_icntr())
> > > +			set_bit(ARMV8_PMU_INSTR_IDX, pmu->cntr_mask);
> > > +
> > > +		kvm_debug("Unpartitioned PMU");
> > > +	}
> > > +
> > > +	return 0;
> > > +}
> 
> > Hmm... Just in terms of code organization I'm not sure I like having KVM
> > twiddling with *host* support for PMUv3. Feels like the ARM PMU driver
> > should own partitioning and KVM just takes what it can get.
> 
> Okay. I can move the code.
> 
> > > @@ -239,6 +245,13 @@ void kvm_host_pmu_init(struct arm_pmu *pmu)
> > >   	if (!pmuv3_implemented(kvm_arm_pmu_get_pmuver_limit()))
> > >   		return;
> 
> > > +	if (reserved_host_counters) {
> > > +		if (kvm_pmu_partition_supported())
> > > +			WARN_ON(kvm_pmu_partition(pmu, reserved_host_counters));
> > > +		else
> > > +			kvm_err("PMU Partition is not supported");
> > > +	}
> > > +
> 
> > Hasn't the ARM PMU been registered with perf at this point? Surely the
> > driver wouldn't be very pleased with us ripping counters out from under
> > its feet.
> 
> AFAICT nothing in perf registration cares about the number of counters
> the PMU has. The PMUv3 driver tracks its own available counters through
> cntr_mask and I modify that during partition.
> 
> Since this is still initialization of the PMU, I don't believe anything
> has had a chance to use a counter yet that will be ripped away.

Given that kvm_pmu_partition() is called from an ioctl, it is entirely
possible that events have been scheduled prior to applying the
partition.

> Aesthetically It makes since to change this if I move the partitioning
> code to the PMUv3 driver, but I think it's inconsequential to the
> function.

There are two *very* distinct functions w.r.t. partitioning:

 1) Partitioning of a particular arm_pmu that says how many counters the
 host can use

 2) VMM intentions to present a subset of the KVM-owned counter
 partition to its guest

#1 is modifying *global* state, we really can't mess with that in the
context of a single VM...

Thanks,
Oliver
Re: [PATCH 06/17] KVM: arm64: Introduce method to partition the PMU
Posted by Colton Lewis 6 months, 2 weeks ago
Thank you Oliver for the additional explanation.

Oliver Upton <oliver.upton@linux.dev> writes:

> On Tue, Jun 03, 2025 at 09:32:41PM +0000, Colton Lewis wrote:
>> Oliver Upton <oliver.upton@linux.dev> writes:

>> > On Mon, Jun 02, 2025 at 07:26:51PM +0000, Colton Lewis wrote:
>> > >   static void kvm_arm_setup_mdcr_el2(struct kvm_vcpu *vcpu)
>> > >   {
>> > > +	u8 hpmn = vcpu->kvm->arch.arm_pmu->hpmn;
>> > > +
>> > >   	preempt_disable();

>> > >   	/*
>> > >   	 * This also clears MDCR_EL2_E2PB_MASK and MDCR_EL2_E2TB_MASK
>> > >   	 * to disable guest access to the profiling and trace buffers
>> > >   	 */
>> > > -	vcpu->arch.mdcr_el2 = FIELD_PREP(MDCR_EL2_HPMN,
>> > > -					 *host_data_ptr(nr_event_counters));
>> > > -	vcpu->arch.mdcr_el2 |= (MDCR_EL2_TPM |
>> > > +	vcpu->arch.mdcr_el2 = FIELD_PREP(MDCR_EL2_HPMN, hpmn);
>> > > +	vcpu->arch.mdcr_el2 |= (MDCR_EL2_HPMD |
>> > > +				MDCR_EL2_TPM |

>> > This isn't safe, as there's no guarantee that kvm_arch::arm_pmu is
>> > pointing that the PMU for this CPU. KVM needs to derive HPMN from some
>> > per-CPU state, not anything tied to the VM/vCPU.

>> I'm confused. Isn't this function preparing to run the vCPU on this
>> CPU? Why would it be pointing at a different PMU?

> Because arm64 is a silly ecosystem and system designers can glue
> together heterogenous CPU implementations. The arm_pmu that KVM is
> pointing at might only match a subset of CPUs, but vCPUs migrate at the
> whim of the scheduler (and userspace).

That means the arm_pmu field might at any time point to data that
doesn't represent the current CPU. I'm surprised that's not swapped out
anywhere. Seems like it would be useful to have an arch struct be a
reliable source of information about the current arch.

>> And HPMN is something that we only want set when running a vCPU, so
>> there isn't any per-CPU state saying it should be anything but the
>> default value (number of counters) outside that context.

>> Unless you just mean I should check the number of counters again and
>> make sure HPMN is not an invalid value.

> As you've implemented it the host cannot schedule events in the guest
> range of counters regardless of context. You need to reconcile that
> global limit with the desires of the VMM on how many counters it wants
> presented to this particular guest.

It's true that's the current implementation. I was assuming the VMM
would control that with the new partition API. Given that partitioning
untraps access to counters, there is no other way besides HPMN to
control how many counters are exposed to the guest.

>> > > +/**
>> > > + * kvm_pmu_partition() - Partition the PMU
>> > > + * @pmu: Pointer to pmu being partitioned
>> > > + * @host_counters: Number of host counters to reserve
>> > > + *
>> > > + * Partition the given PMU by taking a number of host counters to
>> > > + * reserve and, if it is a valid reservation, recording the
>> > > + * corresponding HPMN value in the hpmn field of the PMU and  
>> clearing
>> > > + * the guest-reserved counters from the counter mask.
>> > > + *
>> > > + * Passing 0 for @host_counters has the effect of disabling
>> > > partitioning.
>> > > + *
>> > > + * Return: 0 on success, -ERROR otherwise
>> > > + */
>> > > +int kvm_pmu_partition(struct arm_pmu *pmu, u8 host_counters)
>> > > +{
>> > > +	u8 nr_counters;
>> > > +	u8 hpmn;
>> > > +
>> > > +	if (!kvm_pmu_reservation_is_valid(host_counters))
>> > > +		return -EINVAL;
>> > > +
>> > > +	nr_counters = *host_data_ptr(nr_event_counters);
>> > > +	hpmn = kvm_pmu_hpmn(host_counters);
>> > > +
>> > > +	if (hpmn < nr_counters) {
>> > > +		pmu->hpmn = hpmn;
>> > > +		/* Inform host driver of available counters */
>> > > +		bitmap_clear(pmu->cntr_mask, 0, hpmn);
>> > > +		bitmap_set(pmu->cntr_mask, hpmn, nr_counters);
>> > > +		clear_bit(ARMV8_PMU_CYCLE_IDX, pmu->cntr_mask);
>> > > +		if (pmuv3_has_icntr())
>> > > +			clear_bit(ARMV8_PMU_INSTR_IDX, pmu->cntr_mask);
>> > > +
>> > > +		kvm_debug("Partitioned PMU with HPMN %u", hpmn);
>> > > +	} else {
>> > > +		pmu->hpmn = nr_counters;
>> > > +		bitmap_set(pmu->cntr_mask, 0, nr_counters);
>> > > +		set_bit(ARMV8_PMU_CYCLE_IDX, pmu->cntr_mask);
>> > > +		if (pmuv3_has_icntr())
>> > > +			set_bit(ARMV8_PMU_INSTR_IDX, pmu->cntr_mask);
>> > > +
>> > > +		kvm_debug("Unpartitioned PMU");
>> > > +	}
>> > > +
>> > > +	return 0;
>> > > +}

>> > Hmm... Just in terms of code organization I'm not sure I like having  
>> KVM
>> > twiddling with *host* support for PMUv3. Feels like the ARM PMU driver
>> > should own partitioning and KVM just takes what it can get.

>> Okay. I can move the code.

>> > > @@ -239,6 +245,13 @@ void kvm_host_pmu_init(struct arm_pmu *pmu)
>> > >   	if (!pmuv3_implemented(kvm_arm_pmu_get_pmuver_limit()))
>> > >   		return;

>> > > +	if (reserved_host_counters) {
>> > > +		if (kvm_pmu_partition_supported())
>> > > +			WARN_ON(kvm_pmu_partition(pmu, reserved_host_counters));
>> > > +		else
>> > > +			kvm_err("PMU Partition is not supported");
>> > > +	}
>> > > +

>> > Hasn't the ARM PMU been registered with perf at this point? Surely the
>> > driver wouldn't be very pleased with us ripping counters out from under
>> > its feet.

>> AFAICT nothing in perf registration cares about the number of counters
>> the PMU has. The PMUv3 driver tracks its own available counters through
>> cntr_mask and I modify that during partition.

>> Since this is still initialization of the PMU, I don't believe anything
>> has had a chance to use a counter yet that will be ripped away.

> Given that kvm_pmu_partition() is called from an ioctl, it is entirely
> possible that events have been scheduled prior to applying the
> partition.

That's true for the ioctl call. I was only saying it's not true here.

>> Aesthetically It makes since to change this if I move the partitioning
>> code to the PMUv3 driver, but I think it's inconsequential to the
>> function.

> There are two *very* distinct functions w.r.t. partitioning:

>   1) Partitioning of a particular arm_pmu that says how many counters the
>   host can use

>   2) VMM intentions to present a subset of the KVM-owned counter
>   partition to its guest

> #1 is modifying *global* state, we really can't mess with that in the
> context of a single VM...

I see the distinction more clearly now. Since KVM can only control the
number of counters presented to the guest through HPMN, why would the
VMM ever choose a subset? If the host PMU is globally partitioned to not
use anything in the guest range, presenting fewer counters to a guest is
just leaving some counters in the middle of the range unused.


> Thanks,
> Oliver
Re: [PATCH 06/17] KVM: arm64: Introduce method to partition the PMU
Posted by Oliver Upton 6 months, 2 weeks ago
On Wed, Jun 04, 2025 at 08:10:27PM +0000, Colton Lewis wrote:
> Thank you Oliver for the additional explanation.
> 
> Oliver Upton <oliver.upton@linux.dev> writes:
> 
> > On Tue, Jun 03, 2025 at 09:32:41PM +0000, Colton Lewis wrote:
> > > Oliver Upton <oliver.upton@linux.dev> writes:
> 
> > > > On Mon, Jun 02, 2025 at 07:26:51PM +0000, Colton Lewis wrote:
> > > > >   static void kvm_arm_setup_mdcr_el2(struct kvm_vcpu *vcpu)
> > > > >   {
> > > > > +	u8 hpmn = vcpu->kvm->arch.arm_pmu->hpmn;
> > > > > +
> > > > >   	preempt_disable();
> 
> > > > >   	/*
> > > > >   	 * This also clears MDCR_EL2_E2PB_MASK and MDCR_EL2_E2TB_MASK
> > > > >   	 * to disable guest access to the profiling and trace buffers
> > > > >   	 */
> > > > > -	vcpu->arch.mdcr_el2 = FIELD_PREP(MDCR_EL2_HPMN,
> > > > > -					 *host_data_ptr(nr_event_counters));
> > > > > -	vcpu->arch.mdcr_el2 |= (MDCR_EL2_TPM |
> > > > > +	vcpu->arch.mdcr_el2 = FIELD_PREP(MDCR_EL2_HPMN, hpmn);
> > > > > +	vcpu->arch.mdcr_el2 |= (MDCR_EL2_HPMD |
> > > > > +				MDCR_EL2_TPM |
> 
> > > > This isn't safe, as there's no guarantee that kvm_arch::arm_pmu is
> > > > pointing that the PMU for this CPU. KVM needs to derive HPMN from some
> > > > per-CPU state, not anything tied to the VM/vCPU.
> 
> > > I'm confused. Isn't this function preparing to run the vCPU on this
> > > CPU? Why would it be pointing at a different PMU?
> 
> > Because arm64 is a silly ecosystem and system designers can glue
> > together heterogenous CPU implementations. The arm_pmu that KVM is
> > pointing at might only match a subset of CPUs, but vCPUs migrate at the
> > whim of the scheduler (and userspace).
> 
> That means the arm_pmu field might at any time point to data that
> doesn't represent the current CPU. I'm surprised that's not swapped out
> anywhere. Seems like it would be useful to have an arch struct be a
> reliable source of information about the current arch.

There's no way to accomplish that. It is per-VM data, and you could have
vCPUs on a mix of physical CPUs.

This is mitigated somewhat when the VMM explicitly selects a PMU
implementation, as we prevent vCPUs from actually entering the guest on
an unsupported CPU (see ON_SUPPORTED_CPU flag).

> > There are two *very* distinct functions w.r.t. partitioning:
> 
> >   1) Partitioning of a particular arm_pmu that says how many counters the
> >   host can use
> 
> >   2) VMM intentions to present a subset of the KVM-owned counter
> >   partition to its guest
> 
> > #1 is modifying *global* state, we really can't mess with that in the
> > context of a single VM...
> 
> I see the distinction more clearly now. Since KVM can only control the
> number of counters presented to the guest through HPMN, why would the
> VMM ever choose a subset? If the host PMU is globally partitioned to not
> use anything in the guest range, presenting fewer counters to a guest is
> just leaving some counters in the middle of the range unused.

You may not want to give a 'full' PMU to all VMs running on a system,
but some OSes (Windows) expect to have at least the fixed CPU cycle
counter present. In this case the VMM would deliberately expose fewer
counters. FEAT_HPMN0 didn't get added to the architecture by accident...

Thanks,
Oliver