[PATCH v5] perf/arm_pmu: Skip PMCCNTR_EL0 on NVIDIA Olympus

Besar Wicaksono posted 1 patch 5 days ago
drivers/perf/arm_pmu.c       |  7 +++-
drivers/perf/arm_pmuv3.c     | 64 +++++++++++++++++++++++++++++++-----
include/linux/perf/arm_pmu.h |  2 +-
3 files changed, 62 insertions(+), 11 deletions(-)
[PATCH v5] perf/arm_pmu: Skip PMCCNTR_EL0 on NVIDIA Olympus
Posted by Besar Wicaksono 5 days ago
PMCCNTR_EL0 on NVIDIA Olympus CPUs may keep incrementing while the
PE is in WFI/WFE, whereas the CPU_CYCLES event on a programmable
counter may not. Add a MIDR range entry and refuse PMCCNTR_EL0 for
cycle events on affected parts so that perf does not mix the two
behaviors.

Also keep PMCCNTR_EL0 unavailable to EL0 direct counter reads
on affected CPUs. When userspace counter access is enabled,
avoid setting PMUSERENR_EL0.CR for PMUs that must avoid
PMCCNTR_EL0, while still allowing direct reads from programmable
event counters. For 64-bit userspace CPU_CYCLES events on PMUs
without native long event counters, reject the event if the only
valid direct-read path would be PMCCNTR_EL0.

Signed-off-by: Besar Wicaksono <bwicaksono@nvidia.com>
---

Changes from v1:
  * add CONFIG_ARM64 check to fix build error found by kernel test robot
  * add explicit include of <asm/cputype.h>
v1: https://lore.kernel.org/linux-arm-kernel/20260406232034.2566133-1-bwicaksono@nvidia.com/

Changes from v2:
  * Move the Olympus PMCCNTR avoidance check from arm_pmuv3.c to the
    common arm_pmu registration path.
  * Replace the PMUv3-only has_smt flag with avoid_pmccntr, covering both
    the existing SMT restriction and the Olympus MIDR restriction.
  * Use the cached per-CPU MIDR from cpu_data instead of calling
    is_midr_in_range_list() from armv8pmu_can_use_pmccntr().
  * Add the required asm/cpu.h include for cpu_data.
v2: https://lore.kernel.org/linux-arm-kernel/20260421203856.3539186-1-bwicaksono@nvidia.com/#t

Changes from v3:
  * Move the MIDR-based avoidance check to __armv8pmu_probe_pmu() so that
    the MIDR is read on the correct online CPU.
v3: https://lore.kernel.org/linux-arm-kernel/20260429215614.1793131-1-bwicaksono@nvidia.com/

Changes from v4:
  * Avoid granting PMCCNTR_EL0 direct userspace access by leaving
    PMUSERENR_EL0.CR clear on PMUs that must avoid PMCCNTR_EL0.
  * Keep direct userspace access available for programmable event counters.
  * Reject 64-bit userspace CPU_CYCLES events on PMUs without native long
    counters when the only valid direct-read path would be PMCCNTR_EL0.
  * Expand the Olympus comment to describe the mismatch with programmable
    CPU_CYCLES counters.
v4: https://lore.kernel.org/linux-arm-kernel/20260504175204.3122979-1-bwicaksono@nvidia.com/

---
 drivers/perf/arm_pmu.c       |  7 +++-
 drivers/perf/arm_pmuv3.c     | 64 +++++++++++++++++++++++++++++++-----
 include/linux/perf/arm_pmu.h |  2 +-
 3 files changed, 62 insertions(+), 11 deletions(-)

diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c
index 939bcbd433aa..aa1dac0b440f 100644
--- a/drivers/perf/arm_pmu.c
+++ b/drivers/perf/arm_pmu.c
@@ -931,8 +931,13 @@ int armpmu_register(struct arm_pmu *pmu)
 	/*
 	 * By this stage we know our supported CPUs on either DT/ACPI platforms,
 	 * detect the SMT implementation.
+	 * On SMT CPUs, the PMCCNTR_EL0 increments from the processor clock rather
+	 * than the PE clock (ARM DDI0487 L.b D13.1.3) which means it'll continue
+	 * counting on a WFI PE if one of its SMT siblings is not idle on a
+	 * multi-threaded implementation. So don't use it on SMT cores.
 	 */
-	pmu->has_smt = topology_core_has_smt(cpumask_first(&pmu->supported_cpus));
+	pmu->avoid_pmccntr |=
+		topology_core_has_smt(cpumask_first(&pmu->supported_cpus));
 
 	if (!pmu->set_event_filter)
 		pmu->pmu.capabilities |= PERF_PMU_CAP_NO_EXCLUDE;
diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c
index 8014ff766cff..6d4d57342352 100644
--- a/drivers/perf/arm_pmuv3.c
+++ b/drivers/perf/arm_pmuv3.c
@@ -8,6 +8,7 @@
  * This code is based heavily on the ARMv7 perf event code.
  */
 
+#include <asm/cputype.h>
 #include <asm/irq_regs.h>
 #include <asm/perf_event.h>
 #include <asm/virt.h>
@@ -795,6 +796,7 @@ static void armv8pmu_disable_user_access(void)
 static void armv8pmu_enable_user_access(struct arm_pmu *cpu_pmu)
 {
 	int i;
+	u64 userenr = ARMV8_PMU_USERENR_ER | ARMV8_PMU_USERENR_UEN;
 	struct pmu_hw_events *cpuc = this_cpu_ptr(cpu_pmu->hw_events);
 
 	if (is_pmuv3p9(cpu_pmu->pmuver)) {
@@ -817,7 +819,10 @@ static void armv8pmu_enable_user_access(struct arm_pmu *cpu_pmu)
 		}
 	}
 
-	update_pmuserenr(ARMV8_PMU_USERENR_ER | ARMV8_PMU_USERENR_CR | ARMV8_PMU_USERENR_UEN);
+	if (!cpu_pmu->avoid_pmccntr)
+		userenr |= ARMV8_PMU_USERENR_CR;
+
+	update_pmuserenr(userenr);
 }
 
 static void armv8pmu_enable_event(struct perf_event *event)
@@ -1002,13 +1007,7 @@ static bool armv8pmu_can_use_pmccntr(struct pmu_hw_events *cpuc,
 	if (has_branch_stack(event))
 		return false;
 
-	/*
-	 * The PMCCNTR_EL0 increments from the processor clock rather than
-	 * the PE clock (ARM DDI0487 L.b D13.1.3) which means it'll continue
-	 * counting on a WFI PE if one of its SMT sibling is not idle on a
-	 * multi-threaded implementation. So don't use it on SMT cores.
-	 */
-	if (cpu_pmu->has_smt)
+	if (cpu_pmu->avoid_pmccntr)
 		return false;
 
 	return true;
@@ -1250,7 +1249,8 @@ static int __armv8_pmuv3_map_event(struct perf_event *event,
 		if (!(event->attach_state & PERF_ATTACH_TASK))
 			return -EINVAL;
 		if (armv8pmu_event_is_64bit(event) &&
-		    (hw_event_id != ARMV8_PMUV3_PERFCTR_CPU_CYCLES) &&
+		    (hw_event_id != ARMV8_PMUV3_PERFCTR_CPU_CYCLES ||
+		     armpmu->avoid_pmccntr) &&
 		    !armv8pmu_has_long_event(armpmu))
 			return -EOPNOTSUPP;
 
@@ -1299,6 +1299,45 @@ static int armv8_vulcan_map_event(struct perf_event *event)
 				       &armv8_vulcan_perf_cache_map);
 }
 
+#ifdef CONFIG_ARM64
+/*
+ * List of CPUs that should avoid using PMCCNTR_EL0.
+ */
+static const struct midr_range armv8pmu_avoid_pmccntr_cpus[] = {
+	/*
+	 * NVIDIA Olympus may expose different WFI/WFE behaviour between
+	 * PMCCNTR_EL0 and the CPU_CYCLES event on programmable counters.
+	 * While the CPU is in WFI/WFE state, PMCCNTR_EL0 may still increment
+	 * but the programmable counter may not. This is implementation-specific
+	 * behaviour and not an erratum. Perf assumes those two paths are
+	 * interchangeable, so avoid using PMCCNTR_EL0 for CPU_CYCLES events.
+	 *
+	 * From ARM DDI0487 D14.4:
+	 *   It is IMPLEMENTATION SPECIFIC whether CPU_CYCLES and PMCCNTR count
+	 *   when the PE is in WFI or WFE state, even if the clocks are not stopped.
+	 *
+	 * From ARM DDI0487 D24.5.2:
+	 *   All counters are subject to any changes in clock frequency, including
+	 *   clock stopping caused by the WFI and WFE instructions.
+	 *   This means that it is CONSTRAINED UNPREDICTABLE whether or not
+	 *   PMCCNTR_EL0 continues to increment when clocks are stopped by WFI and
+	 *   WFE instructions.
+	 */
+	MIDR_ALL_VERSIONS(MIDR_NVIDIA_OLYMPUS),
+	{}
+};
+
+static bool armv8pmu_is_in_avoid_pmccntr_cpus(void)
+{
+	return is_midr_in_range_list(armv8pmu_avoid_pmccntr_cpus);
+}
+#else
+static bool armv8pmu_is_in_avoid_pmccntr_cpus(void)
+{
+	return false;
+}
+#endif
+
 struct armv8pmu_probe_info {
 	struct arm_pmu *pmu;
 	bool present;
@@ -1348,6 +1387,13 @@ static void __armv8pmu_probe_pmu(void *info)
 	else
 		cpu_pmu->reg_pmmir = 0;
 
+	/*
+	 * On some CPUs, PMCCNTR_EL0 does not behave like the CPU_CYCLES event
+	 * on a programmable counter, so do not route cycles through PMCCNTR_EL0
+	 * in order to keep the results consistent.
+	 */
+	cpu_pmu->avoid_pmccntr |= armv8pmu_is_in_avoid_pmccntr_cpus();
+
 	brbe_probe(cpu_pmu);
 }
 
diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h
index 52b37f7bdbf9..02d2c7f45b52 100644
--- a/include/linux/perf/arm_pmu.h
+++ b/include/linux/perf/arm_pmu.h
@@ -119,7 +119,7 @@ struct arm_pmu {
 
 	/* PMUv3 only */
 	int		pmuver;
-	bool		has_smt;
+	bool		avoid_pmccntr;
 	u64		reg_pmmir;
 	u64		reg_brbidr;
 #define ARMV8_PMUV3_MAX_COMMON_EVENTS		0x40
-- 
2.43.0