[PATCH 6/8] perf: add NVIDIA Tegra410 CPU Memory Latency PMU

Posted by Besar Wicaksono 1 week, 6 days ago
Add CPU Memory (CMEM) Latency PMU support for the Tegra410 SoC.

Signed-off-by: Besar Wicaksono <bwicaksono@nvidia.com>
---
 .../admin-guide/perf/nvidia-tegra410-pmu.rst  |  25 +
 drivers/perf/Kconfig                          |   7 +
 drivers/perf/Makefile                         |   1 +
 drivers/perf/nvidia_t410_cmem_latency_pmu.c   | 727 ++++++++++++++++++
 4 files changed, 760 insertions(+)
 create mode 100644 drivers/perf/nvidia_t410_cmem_latency_pmu.c

diff --git a/Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst b/Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst
index 07dc447eead7..11fc1c88346a 100644
--- a/Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst
+++ b/Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst
@@ -8,6 +8,7 @@ metrics like memory bandwidth, latency, and utilization:
 * Unified Coherence Fabric (UCF)
 * PCIE
 * PCIE-TGT
+* CPU Memory (CMEM) Latency
 
 PMU Driver
 ----------
@@ -342,3 +343,27 @@ Example usage:
   0x10000 to 0x100FF on socket 0's PCIE RC-1::
 
     perf stat -a -e nvidia_pcie_tgt_pmu_0_rc_1/event=0x1,dst_addr_base=0x10000,dst_addr_mask=0xFFF00,dst_addr_en=0x1/
+
+CPU Memory (CMEM) Latency PMU
+-----------------------------
+
+This PMU provides events for measuring the latency of memory read
+requests to local CPU DRAM:
+
+  * RD_REQ counters: count read requests (32B per request).
+  * RD_CUM_OUTS counters: accumulated outstanding-request counters, which
+    track how many cycles the read requests remain in flight.
+  * CYCLES counter: counts the number of elapsed cycles.
+
+The average latency is calculated as::
+
+   FREQ_IN_GHZ = CYCLES / ELAPSED_TIME_IN_NS
+   AVG_LATENCY_IN_CYCLES = RD_CUM_OUTS / RD_REQ
+   AVG_LATENCY_IN_NS = AVG_LATENCY_IN_CYCLES / FREQ_IN_GHZ
+
+The events and configuration options of this PMU device are described in sysfs,
+see /sys/bus/event_source/devices/nvidia_cmem_latency_pmu_<socket-id>.
+
+Example usage::
+
+  perf stat -a -e '{nvidia_cmem_latency_pmu_0/rd_req/,nvidia_cmem_latency_pmu_0/rd_cum_outs/,nvidia_cmem_latency_pmu_0/cycles/}'
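
The formula above can be applied directly to the three counts. A minimal
worked sketch in user-space C (illustrative only; the helper name and
prototype are made up, and the inputs are assumed to come from a run like
the perf stat example):

#include <stdint.h>

/*
 * Derive the average read latency in nanoseconds from the three CMEM
 * latency counts, following the documented formula:
 *
 *   freq_ghz       = cycles / elapsed_ns
 *   avg_lat_cycles = rd_cum_outs / rd_req
 *   avg_lat_ns     = avg_lat_cycles / freq_ghz
 */
static double cmem_avg_read_latency_ns(uint64_t cycles, uint64_t rd_req,
				       uint64_t rd_cum_outs, double elapsed_ns)
{
	double freq_ghz, avg_lat_cycles;

	if (!cycles || !rd_req || elapsed_ns <= 0.0)
		return 0.0;	/* nothing measured or no read traffic */

	freq_ghz = (double)cycles / elapsed_ns;
	avg_lat_cycles = (double)rd_cum_outs / (double)rd_req;

	return avg_lat_cycles / freq_ghz;
}

For example, 2e9 cycles over a 1e9 ns window gives 2 GHz; with
rd_cum_outs / rd_req = 300 cycles per request, the average latency is
150 ns.
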
diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig
index 638321fc9800..9fed3c41d5ea 100644
--- a/drivers/perf/Kconfig
+++ b/drivers/perf/Kconfig
@@ -311,4 +311,11 @@ config MARVELL_PEM_PMU
 	  Enable support for PCIe Interface performance monitoring
 	  on Marvell platform.
 
+config NVIDIA_TEGRA410_CMEM_LATENCY_PMU
+	tristate "NVIDIA Tegra410 CPU Memory Latency PMU"
+	depends on ARM64
+	help
+	  Enable perf support for CPU memory latency counter monitoring on
+	  the NVIDIA Tegra410 SoC.
+
 endmenu
diff --git a/drivers/perf/Makefile b/drivers/perf/Makefile
index ea52711a87e3..4aa6aad393c2 100644
--- a/drivers/perf/Makefile
+++ b/drivers/perf/Makefile
@@ -35,3 +35,4 @@ obj-$(CONFIG_DWC_PCIE_PMU) += dwc_pcie_pmu.o
 obj-$(CONFIG_ARM_CORESIGHT_PMU_ARCH_SYSTEM_PMU) += arm_cspmu/
 obj-$(CONFIG_MESON_DDR_PMU) += amlogic/
 obj-$(CONFIG_CXL_PMU) += cxl_pmu.o
+obj-$(CONFIG_NVIDIA_TEGRA410_CMEM_LATENCY_PMU) += nvidia_t410_cmem_latency_pmu.o
diff --git a/drivers/perf/nvidia_t410_cmem_latency_pmu.c b/drivers/perf/nvidia_t410_cmem_latency_pmu.c
new file mode 100644
index 000000000000..9b466581c8fc
--- /dev/null
+++ b/drivers/perf/nvidia_t410_cmem_latency_pmu.c
@@ -0,0 +1,727 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * NVIDIA Tegra410 CPU Memory (CMEM) Latency PMU driver.
+ *
+ * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ */
+
+#include <linux/acpi.h>
+#include <linux/bitops.h>
+#include <linux/cpumask.h>
+#include <linux/device.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/perf_event.h>
+#include <linux/platform_device.h>
+
+#define NUM_INSTANCES    14
+#define BCAST(pmu) ((pmu)->base[NUM_INSTANCES])
+
+/* Register offsets. */
+#define CG_CTRL         0x800
+#define CTRL            0x808
+#define STATUS          0x810
+#define CYCLE_CNTR      0x818
+#define MC0_REQ_CNTR    0x820
+#define MC0_AOR_CNTR    0x830
+#define MC1_REQ_CNTR    0x838
+#define MC1_AOR_CNTR    0x848
+#define MC2_REQ_CNTR    0x850
+#define MC2_AOR_CNTR    0x860
+
+/* CTRL values. */
+#define CTRL_DISABLE    0x0ULL
+#define CTRL_ENABLE     0x1ULL
+#define CTRL_CLR        0x2ULL
+
+/* CG_CTRL values. */
+#define CG_CTRL_DISABLE    0x0ULL
+#define CG_CTRL_ENABLE     0x1ULL
+
+/* STATUS register field. */
+#define STATUS_CYCLE_OVF      BIT(0)
+#define STATUS_MC0_AOR_OVF    BIT(1)
+#define STATUS_MC0_REQ_OVF    BIT(3)
+#define STATUS_MC1_AOR_OVF    BIT(4)
+#define STATUS_MC1_REQ_OVF    BIT(6)
+#define STATUS_MC2_AOR_OVF    BIT(7)
+#define STATUS_MC2_REQ_OVF    BIT(9)
+
+/* Events. */
+#define EVENT_CYCLES    0x0
+#define EVENT_REQ       0x1
+#define EVENT_AOR       0x2
+
+#define NUM_EVENTS           0x3
+#define MASK_EVENT           0x3
+#define MAX_ACTIVE_EVENTS    32
+
+#define ACTIVE_CPU_MASK        0x0
+#define ASSOCIATED_CPU_MASK    0x1
+
+static unsigned long cmem_lat_pmu_cpuhp_state;
+
+struct cmem_lat_pmu_hw_events {
+	struct perf_event *events[MAX_ACTIVE_EVENTS];
+	DECLARE_BITMAP(used_ctrs, MAX_ACTIVE_EVENTS);
+};
+
+struct cmem_lat_pmu {
+	struct pmu pmu;
+	struct device *dev;
+	const char *name;
+	const char *identifier;
+	void __iomem *base[NUM_INSTANCES + 1];
+	cpumask_t associated_cpus;
+	cpumask_t active_cpu;
+	struct hlist_node node;
+	struct cmem_lat_pmu_hw_events hw_events;
+};
+
+#define to_cmem_lat_pmu(p) \
+	container_of(p, struct cmem_lat_pmu, pmu)
+
+
+/* Get event type from perf_event. */
+static inline u32 get_event_type(struct perf_event *event)
+{
+	return (event->attr.config) & MASK_EVENT;
+}
+
+/* PMU operations. */
+static int cmem_lat_pmu_get_event_idx(struct cmem_lat_pmu_hw_events *hw_events,
+				struct perf_event *event)
+{
+	unsigned int idx;
+
+	idx = find_first_zero_bit(hw_events->used_ctrs, MAX_ACTIVE_EVENTS);
+	if (idx >= MAX_ACTIVE_EVENTS)
+		return -EAGAIN;
+
+	set_bit(idx, hw_events->used_ctrs);
+
+	return idx;
+}
+
+static bool cmem_lat_pmu_validate_event(struct pmu *pmu,
+				 struct cmem_lat_pmu_hw_events *hw_events,
+				 struct perf_event *event)
+{
+	if (is_software_event(event))
+		return true;
+
+	/* Reject groups spanning multiple HW PMUs. */
+	if (event->pmu != pmu)
+		return false;
+
+	return (cmem_lat_pmu_get_event_idx(hw_events, event) >= 0);
+}
+
+/*
+ * Make sure the group of events can be scheduled at once
+ * on the PMU.
+ */
+static bool cmem_lat_pmu_validate_group(struct perf_event *event)
+{
+	struct perf_event *sibling, *leader = event->group_leader;
+	struct cmem_lat_pmu_hw_events fake_hw_events;
+
+	if (event->group_leader == event)
+		return true;
+
+	memset(&fake_hw_events, 0, sizeof(fake_hw_events));
+
+	if (!cmem_lat_pmu_validate_event(event->pmu, &fake_hw_events, leader))
+		return false;
+
+	for_each_sibling_event(sibling, leader) {
+		if (!cmem_lat_pmu_validate_event(event->pmu, &fake_hw_events,
+						sibling))
+			return false;
+	}
+
+	return cmem_lat_pmu_validate_event(event->pmu, &fake_hw_events, event);
+}
+
+static int cmem_lat_pmu_event_init(struct perf_event *event)
+{
+	struct cmem_lat_pmu *cmem_lat_pmu = to_cmem_lat_pmu(event->pmu);
+	struct hw_perf_event *hwc = &event->hw;
+	u32 event_type = get_event_type(event);
+
+	if (event->attr.type != event->pmu->type ||
+	    event_type >= NUM_EVENTS)
+		return -ENOENT;
+
+	/*
+	 * Following other "uncore" PMUs, we do not support sampling mode or
+	 * attach to a task (per-process mode).
+	 */
+	if (is_sampling_event(event)) {
+		dev_dbg(cmem_lat_pmu->pmu.dev,
+			"Can't support sampling events\n");
+		return -EOPNOTSUPP;
+	}
+
+	if (event->cpu < 0 || event->attach_state & PERF_ATTACH_TASK) {
+		dev_dbg(cmem_lat_pmu->pmu.dev,
+			"Can't support per-task counters\n");
+		return -EINVAL;
+	}
+
+	/*
+	 * Make sure the CPU assignment is on one of the CPUs associated with
+	 * this PMU.
+	 */
+	if (!cpumask_test_cpu(event->cpu, &cmem_lat_pmu->associated_cpus)) {
+		dev_dbg(cmem_lat_pmu->pmu.dev,
+			"Requested cpu is not associated with the PMU\n");
+		return -EINVAL;
+	}
+
+	/* Enforce the current active CPU to handle the events in this PMU. */
+	event->cpu = cpumask_first(&cmem_lat_pmu->active_cpu);
+	if (event->cpu >= nr_cpu_ids)
+		return -EINVAL;
+
+	if (!cmem_lat_pmu_validate_group(event))
+		return -EINVAL;
+
+	hwc->idx = -1;
+	hwc->config = event_type;
+
+	return 0;
+}
+
+static u64 cmem_lat_pmu_read_status(struct cmem_lat_pmu *cmem_lat_pmu,
+				   unsigned int inst)
+{
+	return readq(cmem_lat_pmu->base[inst] + STATUS);
+}
+
+static u64 cmem_lat_pmu_read_cycle_counter(struct perf_event *event)
+{
+	const unsigned int instance = 0;
+	u64 status;
+	struct cmem_lat_pmu *cmem_lat_pmu = to_cmem_lat_pmu(event->pmu);
+	struct device *dev = cmem_lat_pmu->dev;
+
+	/*
+	 * Use the reading from first instance since all instances are
+	 * identical.
+	 */
+	status = cmem_lat_pmu_read_status(cmem_lat_pmu, instance);
+	if (status & STATUS_CYCLE_OVF)
+		dev_warn(dev, "Cycle counter overflow\n");
+
+	return readq(cmem_lat_pmu->base[instance] + CYCLE_CNTR);
+}
+
+static u64 cmem_lat_pmu_read_req_counter(struct perf_event *event)
+{
+	unsigned int i;
+	u64 status, val = 0;
+	struct cmem_lat_pmu *cmem_lat_pmu = to_cmem_lat_pmu(event->pmu);
+	struct device *dev = cmem_lat_pmu->dev;
+
+	/* Sum up the counts from all instances. */
+	for (i = 0; i < NUM_INSTANCES; i++) {
+		status = cmem_lat_pmu_read_status(cmem_lat_pmu, i);
+		if (status & STATUS_MC0_REQ_OVF)
+			dev_warn(dev, "MC0 request counter overflow\n");
+		if (status & STATUS_MC1_REQ_OVF)
+			dev_warn(dev, "MC1 request counter overflow\n");
+		if (status & STATUS_MC2_REQ_OVF)
+			dev_warn(dev, "MC2 request counter overflow\n");
+
+		val += readq(cmem_lat_pmu->base[i] + MC0_REQ_CNTR);
+		val += readq(cmem_lat_pmu->base[i] + MC1_REQ_CNTR);
+		val += readq(cmem_lat_pmu->base[i] + MC2_REQ_CNTR);
+	}
+
+	return val;
+}
+
+static u64 cmem_lat_pmu_read_aor_counter(struct perf_event *event)
+{
+	unsigned int i;
+	u64 status, val = 0;
+	struct cmem_lat_pmu *cmem_lat_pmu = to_cmem_lat_pmu(event->pmu);
+	struct device *dev = cmem_lat_pmu->dev;
+
+	/* Sum up the counts from all instances. */
+	for (i = 0; i < NUM_INSTANCES; i++) {
+		status = cmem_lat_pmu_read_status(cmem_lat_pmu, i);
+		if (status & STATUS_MC0_AOR_OVF)
+			dev_warn(dev, "MC0 AOR counter overflow\n");
+		if (status & STATUS_MC1_AOR_OVF)
+			dev_warn(dev, "MC1 AOR counter overflow\n");
+		if (status & STATUS_MC2_AOR_OVF)
+			dev_warn(dev, "MC2 AOR counter overflow\n");
+
+		val += readq(cmem_lat_pmu->base[i] + MC0_AOR_CNTR);
+		val += readq(cmem_lat_pmu->base[i] + MC1_AOR_CNTR);
+		val += readq(cmem_lat_pmu->base[i] + MC2_AOR_CNTR);
+	}
+
+	return val;
+}
+
+static u64 (*read_counter_fn[NUM_EVENTS])(struct perf_event *) = {
+	[EVENT_CYCLES] = cmem_lat_pmu_read_cycle_counter,
+	[EVENT_REQ] = cmem_lat_pmu_read_req_counter,
+	[EVENT_AOR] = cmem_lat_pmu_read_aor_counter,
+};
+
+static void cmem_lat_pmu_event_update(struct perf_event *event)
+{
+	u32 event_type;
+	u64 prev, now;
+	struct hw_perf_event *hwc = &event->hw;
+
+	if (hwc->state & PERF_HES_STOPPED)
+		return;
+
+	event_type = hwc->config;
+
+	do {
+		prev = local64_read(&hwc->prev_count);
+		now = read_counter_fn[event_type](event);
+	} while (local64_cmpxchg(&hwc->prev_count, prev, now) != prev);
+
+	local64_add(now - prev, &event->count);
+
+	hwc->state |= PERF_HES_UPTODATE;
+}
+
+static void cmem_lat_pmu_start(struct perf_event *event, int pmu_flags)
+{
+	event->hw.state = 0;
+}
+
+static void cmem_lat_pmu_stop(struct perf_event *event, int pmu_flags)
+{
+	event->hw.state |= PERF_HES_STOPPED;
+}
+
+static int cmem_lat_pmu_add(struct perf_event *event, int flags)
+{
+	struct cmem_lat_pmu *cmem_lat_pmu = to_cmem_lat_pmu(event->pmu);
+	struct cmem_lat_pmu_hw_events *hw_events = &cmem_lat_pmu->hw_events;
+	struct hw_perf_event *hwc = &event->hw;
+	int idx;
+
+	if (WARN_ON_ONCE(!cpumask_test_cpu(smp_processor_id(),
+					   &cmem_lat_pmu->associated_cpus)))
+		return -ENOENT;
+
+	idx = cmem_lat_pmu_get_event_idx(hw_events, event);
+	if (idx < 0)
+		return idx;
+
+	hw_events->events[idx] = event;
+	hwc->idx = idx;
+	hwc->state = PERF_HES_STOPPED | PERF_HES_UPTODATE;
+
+	if (flags & PERF_EF_START)
+		cmem_lat_pmu_start(event, PERF_EF_RELOAD);
+
+	/* Propagate changes to the userspace mapping. */
+	perf_event_update_userpage(event);
+
+	return 0;
+}
+
+static void cmem_lat_pmu_del(struct perf_event *event, int flags)
+{
+	struct cmem_lat_pmu *cmem_lat_pmu = to_cmem_lat_pmu(event->pmu);
+	struct cmem_lat_pmu_hw_events *hw_events = &cmem_lat_pmu->hw_events;
+	struct hw_perf_event *hwc = &event->hw;
+	int idx = hwc->idx;
+
+	cmem_lat_pmu_stop(event, PERF_EF_UPDATE);
+
+	hw_events->events[idx] = NULL;
+
+	clear_bit(idx, hw_events->used_ctrs);
+
+	perf_event_update_userpage(event);
+}
+
+static void cmem_lat_pmu_read(struct perf_event *event)
+{
+	cmem_lat_pmu_event_update(event);
+}
+
+static inline void cmem_lat_pmu_cg_ctrl(struct cmem_lat_pmu *cmem_lat_pmu, u64 val)
+{
+	writeq(val, BCAST(cmem_lat_pmu) + CG_CTRL);
+}
+
+static inline void cmem_lat_pmu_ctrl(struct cmem_lat_pmu *cmem_lat_pmu, u64 val)
+{
+	writeq(val, BCAST(cmem_lat_pmu) + CTRL);
+}
+
+static void cmem_lat_pmu_enable(struct pmu *pmu)
+{
+	bool disabled;
+	struct cmem_lat_pmu *cmem_lat_pmu = to_cmem_lat_pmu(pmu);
+
+	disabled = bitmap_empty(
+		cmem_lat_pmu->hw_events.used_ctrs, MAX_ACTIVE_EVENTS);
+
+	if (disabled)
+		return;
+
+	/* Enable all the counters. */
+	cmem_lat_pmu_cg_ctrl(cmem_lat_pmu, CG_CTRL_ENABLE);
+	cmem_lat_pmu_ctrl(cmem_lat_pmu, CTRL_ENABLE);
+}
+
+static void cmem_lat_pmu_disable(struct pmu *pmu)
+{
+	int idx;
+	struct perf_event *event;
+	struct cmem_lat_pmu *cmem_lat_pmu = to_cmem_lat_pmu(pmu);
+
+	/* Disable all the counters. */
+	cmem_lat_pmu_ctrl(cmem_lat_pmu, CTRL_DISABLE);
+
+	/*
+	 * The counters will start from 0 again on restart.
+	 * Update the events immediately to avoid losing the counts.
+	 */
+	for_each_set_bit(
+		idx, cmem_lat_pmu->hw_events.used_ctrs, MAX_ACTIVE_EVENTS) {
+		event = cmem_lat_pmu->hw_events.events[idx];
+
+		if (!event)
+			continue;
+
+		cmem_lat_pmu_event_update(event);
+
+		local64_set(&event->hw.prev_count, 0ULL);
+	}
+
+	cmem_lat_pmu_ctrl(cmem_lat_pmu, CTRL_CLR);
+	cmem_lat_pmu_cg_ctrl(cmem_lat_pmu, CG_CTRL_DISABLE);
+}
+
+/* PMU identifier attribute. */
+
+static ssize_t cmem_lat_pmu_identifier_show(struct device *dev,
+					 struct device_attribute *attr,
+					 char *page)
+{
+	struct cmem_lat_pmu *cmem_lat_pmu = to_cmem_lat_pmu(dev_get_drvdata(dev));
+
+	return sysfs_emit(page, "%s\n", cmem_lat_pmu->identifier);
+}
+
+static struct device_attribute cmem_lat_pmu_identifier_attr =
+	__ATTR(identifier, 0444, cmem_lat_pmu_identifier_show, NULL);
+
+static struct attribute *cmem_lat_pmu_identifier_attrs[] = {
+	&cmem_lat_pmu_identifier_attr.attr,
+	NULL,
+};
+
+static struct attribute_group cmem_lat_pmu_identifier_attr_group = {
+	.attrs = cmem_lat_pmu_identifier_attrs,
+};
+
+/* Format attributes. */
+
+#define NV_PMU_EXT_ATTR(_name, _func, _config)			\
+	(&((struct dev_ext_attribute[]){				\
+		{							\
+			.attr = __ATTR(_name, 0444, _func, NULL),	\
+			.var = (void *)_config				\
+		}							\
+	})[0].attr.attr)
+
+static struct attribute *cmem_lat_pmu_formats[] = {
+	NV_PMU_EXT_ATTR(event, device_show_string, "config:0-1"),
+	NULL,
+};
+
+static const struct attribute_group cmem_lat_pmu_format_group = {
+	.name = "format",
+	.attrs = cmem_lat_pmu_formats,
+};
+
+/* Event attributes. */
+
+static ssize_t cmem_lat_pmu_sysfs_event_show(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	struct perf_pmu_events_attr *pmu_attr;
+
+	pmu_attr = container_of(attr, typeof(*pmu_attr), attr);
+	return sysfs_emit(buf, "event=0x%llx\n", pmu_attr->id);
+}
+
+#define NV_PMU_EVENT_ATTR(_name, _config)	\
+	PMU_EVENT_ATTR_ID(_name, cmem_lat_pmu_sysfs_event_show, _config)
+
+static struct attribute *cmem_lat_pmu_events[] = {
+	NV_PMU_EVENT_ATTR(cycles, EVENT_CYCLES),
+	NV_PMU_EVENT_ATTR(rd_req, EVENT_REQ),
+	NV_PMU_EVENT_ATTR(rd_cum_outs, EVENT_AOR),
+	NULL
+};
+
+static const struct attribute_group cmem_lat_pmu_events_group = {
+	.name = "events",
+	.attrs = cmem_lat_pmu_events,
+};
+
+/* Cpumask attributes. */
+
+static ssize_t cmem_lat_pmu_cpumask_show(struct device *dev,
+			    struct device_attribute *attr, char *buf)
+{
+	struct pmu *pmu = dev_get_drvdata(dev);
+	struct cmem_lat_pmu *cmem_lat_pmu = to_cmem_lat_pmu(pmu);
+	struct dev_ext_attribute *eattr =
+		container_of(attr, struct dev_ext_attribute, attr);
+	unsigned long mask_id = (unsigned long)eattr->var;
+	const cpumask_t *cpumask;
+
+	switch (mask_id) {
+	case ACTIVE_CPU_MASK:
+		cpumask = &cmem_lat_pmu->active_cpu;
+		break;
+	case ASSOCIATED_CPU_MASK:
+		cpumask = &cmem_lat_pmu->associated_cpus;
+		break;
+	default:
+		return 0;
+	}
+	return cpumap_print_to_pagebuf(true, buf, cpumask);
+}
+
+#define NV_PMU_CPUMASK_ATTR(_name, _config)			\
+	NV_PMU_EXT_ATTR(_name, cmem_lat_pmu_cpumask_show,	\
+				(unsigned long)_config)
+
+static struct attribute *cmem_lat_pmu_cpumask_attrs[] = {
+	NV_PMU_CPUMASK_ATTR(cpumask, ACTIVE_CPU_MASK),
+	NV_PMU_CPUMASK_ATTR(associated_cpus, ASSOCIATED_CPU_MASK),
+	NULL,
+};
+
+static const struct attribute_group cmem_lat_pmu_cpumask_attr_group = {
+	.attrs = cmem_lat_pmu_cpumask_attrs,
+};
+
+/* Per PMU device attribute groups. */
+
+static const struct attribute_group *cmem_lat_pmu_attr_groups[] = {
+	&cmem_lat_pmu_identifier_attr_group,
+	&cmem_lat_pmu_format_group,
+	&cmem_lat_pmu_events_group,
+	&cmem_lat_pmu_cpumask_attr_group,
+	NULL,
+};
+
+static int cmem_lat_pmu_cpu_online(unsigned int cpu, struct hlist_node *node)
+{
+	struct cmem_lat_pmu *cmem_lat_pmu =
+		hlist_entry_safe(node, struct cmem_lat_pmu, node);
+
+	if (!cpumask_test_cpu(cpu, &cmem_lat_pmu->associated_cpus))
+		return 0;
+
+	/* If the PMU is already managed, there is nothing to do */
+	if (!cpumask_empty(&cmem_lat_pmu->active_cpu))
+		return 0;
+
+	/* Use this CPU for event counting */
+	cpumask_set_cpu(cpu, &cmem_lat_pmu->active_cpu);
+
+	return 0;
+}
+
+static int cmem_lat_pmu_cpu_teardown(unsigned int cpu, struct hlist_node *node)
+{
+	unsigned int dst;
+
+	struct cmem_lat_pmu *cmem_lat_pmu =
+		hlist_entry_safe(node, struct cmem_lat_pmu, node);
+
+	/* Nothing to do if this CPU doesn't own the PMU */
+	if (!cpumask_test_and_clear_cpu(cpu, &cmem_lat_pmu->active_cpu))
+		return 0;
+
+	/* Choose a new CPU to migrate ownership of the PMU to */
+	dst = cpumask_any_and_but(&cmem_lat_pmu->associated_cpus,
+				  cpu_online_mask, cpu);
+	if (dst >= nr_cpu_ids)
+		return 0;
+
+	/* Use this CPU for event counting */
+	perf_pmu_migrate_context(&cmem_lat_pmu->pmu, cpu, dst);
+	cpumask_set_cpu(dst, &cmem_lat_pmu->active_cpu);
+
+	return 0;
+}
+
+static int cmem_lat_pmu_get_cpus(struct cmem_lat_pmu *cmem_lat_pmu,
+				unsigned int socket)
+{
+	int ret = 0, cpu;
+
+	for_each_possible_cpu(cpu) {
+		if (cpu_to_node(cpu) == socket)
+			cpumask_set_cpu(cpu, &cmem_lat_pmu->associated_cpus);
+	}
+
+	if (cpumask_empty(&cmem_lat_pmu->associated_cpus)) {
+		dev_dbg(cmem_lat_pmu->dev,
+			"No cpu associated with PMU socket-%u\n", socket);
+		ret = -ENODEV;
+	}
+
+	return ret;
+}
+
+static int cmem_lat_pmu_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct acpi_device *acpi_dev;
+	struct cmem_lat_pmu *cmem_lat_pmu;
+	char *name, *uid_str;
+	int ret, i;
+	u32 socket;
+
+	acpi_dev = ACPI_COMPANION(dev);
+	if (!acpi_dev)
+		return -ENODEV;
+
+	uid_str = acpi_device_uid(acpi_dev);
+	if (!uid_str)
+		return -ENODEV;
+
+	ret = kstrtou32(uid_str, 0, &socket);
+	if (ret)
+		return ret;
+
+	cmem_lat_pmu = devm_kzalloc(dev, sizeof(*cmem_lat_pmu), GFP_KERNEL);
+	name = devm_kasprintf(dev, GFP_KERNEL, "nvidia_cmem_latency_pmu_%u", socket);
+	if (!cmem_lat_pmu || !name)
+		return -ENOMEM;
+
+	cmem_lat_pmu->dev = dev;
+	cmem_lat_pmu->name = name;
+	cmem_lat_pmu->identifier = acpi_device_hid(acpi_dev);
+	platform_set_drvdata(pdev, cmem_lat_pmu);
+
+	cmem_lat_pmu->pmu = (struct pmu) {
+		.parent		= &pdev->dev,
+		.task_ctx_nr	= perf_invalid_context,
+		.pmu_enable	= cmem_lat_pmu_enable,
+		.pmu_disable	= cmem_lat_pmu_disable,
+		.event_init	= cmem_lat_pmu_event_init,
+		.add		= cmem_lat_pmu_add,
+		.del		= cmem_lat_pmu_del,
+		.start		= cmem_lat_pmu_start,
+		.stop		= cmem_lat_pmu_stop,
+		.read		= cmem_lat_pmu_read,
+		.attr_groups	= cmem_lat_pmu_attr_groups,
+		.capabilities	= PERF_PMU_CAP_NO_EXCLUDE |
+					PERF_PMU_CAP_NO_INTERRUPT,
+	};
+
+	/* Map the address of all the instances plus one for the broadcast. */
+	for (i = 0; i < NUM_INSTANCES + 1; i++) {
+		cmem_lat_pmu->base[i] = devm_platform_ioremap_resource(pdev, i);
+		if (IS_ERR(cmem_lat_pmu->base[i])) {
+			dev_err(dev, "Failed map address for instance %d\n", i);
+			return PTR_ERR(cmem_lat_pmu->base[i]);
+		}
+	}
+
+	ret = cmem_lat_pmu_get_cpus(cmem_lat_pmu, socket);
+	if (ret)
+		return ret;
+
+	ret = cpuhp_state_add_instance(cmem_lat_pmu_cpuhp_state,
+				       &cmem_lat_pmu->node);
+	if (ret) {
+		dev_err(&pdev->dev, "Error %d registering hotplug\n", ret);
+		return ret;
+	}
+
+	cmem_lat_pmu_cg_ctrl(cmem_lat_pmu, CG_CTRL_ENABLE);
+	cmem_lat_pmu_ctrl(cmem_lat_pmu, CTRL_CLR);
+	cmem_lat_pmu_cg_ctrl(cmem_lat_pmu, CG_CTRL_DISABLE);
+
+	ret = perf_pmu_register(&cmem_lat_pmu->pmu, name, -1);
+	if (ret) {
+		dev_err(&pdev->dev, "Failed to register PMU: %d\n", ret);
+		cpuhp_state_remove_instance(cmem_lat_pmu_cpuhp_state,
+					    &cmem_lat_pmu->node);
+		return ret;
+	}
+
+	dev_dbg(&pdev->dev, "Registered %s PMU\n", name);
+
+	return 0;
+}
+
+static void cmem_lat_pmu_device_remove(struct platform_device *pdev)
+{
+	struct cmem_lat_pmu *cmem_lat_pmu = platform_get_drvdata(pdev);
+
+	perf_pmu_unregister(&cmem_lat_pmu->pmu);
+	cpuhp_state_remove_instance(cmem_lat_pmu_cpuhp_state,
+				    &cmem_lat_pmu->node);
+}
+
+static const struct acpi_device_id cmem_lat_pmu_acpi_match[] = {
+	{ "NVDA2021", },
+	{ }
+};
+MODULE_DEVICE_TABLE(acpi, cmem_lat_pmu_acpi_match);
+
+static struct platform_driver cmem_lat_pmu_driver = {
+	.driver = {
+		.name = "nvidia-t410-cmem-latency-pmu",
+		.acpi_match_table = ACPI_PTR(cmem_lat_pmu_acpi_match),
+		.suppress_bind_attrs = true,
+	},
+	.probe = cmem_lat_pmu_probe,
+	.remove = cmem_lat_pmu_device_remove,
+};
+
+static int __init cmem_lat_pmu_init(void)
+{
+	int ret;
+
+	ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN,
+				      "perf/nvidia/cmem_latency:online",
+				      cmem_lat_pmu_cpu_online,
+				      cmem_lat_pmu_cpu_teardown);
+	if (ret < 0)
+		return ret;
+
+	cmem_lat_pmu_cpuhp_state = ret;
+
+	return platform_driver_register(&cmem_lat_pmu_driver);
+}
+
+static void __exit cmem_lat_pmu_exit(void)
+{
+	platform_driver_unregister(&cmem_lat_pmu_driver);
+	cpuhp_remove_multi_state(cmem_lat_pmu_cpuhp_state);
+}
+
+module_init(cmem_lat_pmu_init);
+module_exit(cmem_lat_pmu_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("NVIDIA Tegra410 CPU Memory Latency PMU driver");
+MODULE_AUTHOR("Besar Wicaksono <bwicaksono@nvidia.com>");
-- 
2.43.0
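
Since the driver registers one system-wide uncore PMU per socket and
redirects events to its active CPU, the three events can also be read as a
single group through perf_event_open(). A minimal sketch, assuming the PMU
registered as nvidia_cmem_latency_pmu_0, that CPU 0 sits on socket 0, and
with most error handling trimmed; the helper names are made up:

#include <linux/perf_event.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Dynamic PMU type from /sys/bus/event_source/devices/<pmu>/type. */
static int read_pmu_type(const char *pmu)
{
	char path[256];
	int type = -1;
	FILE *f;

	snprintf(path, sizeof(path),
		 "/sys/bus/event_source/devices/%s/type", pmu);
	f = fopen(path, "r");
	if (f && fscanf(f, "%d", &type) != 1)
		type = -1;
	if (f)
		fclose(f);
	return type;
}

static int open_event(int type, uint64_t config, int group_fd)
{
	struct perf_event_attr attr = {
		.type = type,
		.size = sizeof(attr),
		.config = config,
		.read_format = PERF_FORMAT_GROUP,
	};

	/* System-wide on CPU 0; event_init() moves it to the active CPU. */
	return syscall(SYS_perf_event_open, &attr, -1, 0, group_fd, 0);
}

int main(void)
{
	int type = read_pmu_type("nvidia_cmem_latency_pmu_0");
	uint64_t buf[4];	/* nr + 3 values with PERF_FORMAT_GROUP */
	int leader, fd_req, fd_outs;

	if (type < 0)
		return 1;

	leader  = open_event(type, 0x0, -1);		/* cycles      */
	fd_req  = open_event(type, 0x1, leader);	/* rd_req      */
	fd_outs = open_event(type, 0x2, leader);	/* rd_cum_outs */
	if (leader < 0 || fd_req < 0 || fd_outs < 0)
		return 1;

	ioctl(leader, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP);
	ioctl(leader, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
	sleep(1);					/* measurement window */
	ioctl(leader, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);

	if (read(leader, buf, sizeof(buf)) == (ssize_t)sizeof(buf))
		printf("cycles=%llu rd_req=%llu rd_cum_outs=%llu\n",
		       (unsigned long long)buf[1],
		       (unsigned long long)buf[2],
		       (unsigned long long)buf[3]);
	return 0;
}

The group read returns { nr, cycles, rd_req, rd_cum_outs } in creation
order, which plugs straight into the latency formula from the
documentation. The chosen CPU must be in the PMU's associated_cpus mask,
since event_init() rejects anything else.
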
Re: [PATCH 6/8] perf: add NVIDIA Tegra410 CPU Memory Latency PMU
Posted by Ilkka Koskinen 1 week, 3 days ago
Hi Besar,

On Mon, 26 Jan 2026, Besar Wicaksono wrote:
> Add CPU Memory (CMEM) Latency PMU support for the Tegra410 SoC.
>
> Signed-off-by: Besar Wicaksono <bwicaksono@nvidia.com>

Looks good to me

Reviewed-by: Ilkka Koskinen <ilkka@os.amperecomputing.com>

Cheers, Ilkka

Re: [PATCH 6/8] perf: add NVIDIA Tegra410 CPU Memory Latency PMU
Posted by kernel test robot 1 week, 5 days ago
Hi Besar,

kernel test robot noticed the following build errors:

[auto build test ERROR on arm64/for-next/core]
[also build test ERROR on linus/master v6.19-rc7 next-20260127]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Besar-Wicaksono/perf-arm_cspmu-nvidia-Rename-doc-to-Tegra241/20260127-021604
base:   https://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux.git for-next/core
patch link:    https://lore.kernel.org/r/20260126181155.2776097-7-bwicaksono%40nvidia.com
patch subject: [PATCH 6/8] perf: add NVIDIA Tegra410 CPU Memory Latency PMU
config: arm64-randconfig-r113-20260128 (https://download.01.org/0day-ci/archive/20260128/202601280830.2IJaaITg-lkp@intel.com/config)
compiler: clang version 22.0.0git (https://github.com/llvm/llvm-project 9b8addffa70cee5b2acc5454712d9cf78ce45710)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260128/202601280830.2IJaaITg-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202601280830.2IJaaITg-lkp@intel.com/

All errors (new ones prefixed by >>):

>> drivers/perf/nvidia_t410_cmem_latency_pmu.c:604:12: error: call to undeclared function 'acpi_device_uid'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
     604 |         uid_str = acpi_device_uid(acpi_dev);
         |                   ^
   drivers/perf/nvidia_t410_cmem_latency_pmu.c:604:12: note: did you mean 'cpu_device_up'?
   include/linux/cpu.h:119:5: note: 'cpu_device_up' declared here
     119 | int cpu_device_up(struct device *dev);
         |     ^
>> drivers/perf/nvidia_t410_cmem_latency_pmu.c:604:10: error: incompatible integer to pointer conversion assigning to 'char *' from 'int' [-Wint-conversion]
     604 |         uid_str = acpi_device_uid(acpi_dev);
         |                 ^ ~~~~~~~~~~~~~~~~~~~~~~~~~
>> drivers/perf/nvidia_t410_cmem_latency_pmu.c:619:29: error: call to undeclared function 'acpi_device_hid'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
     619 |         cmem_lat_pmu->identifier = acpi_device_hid(acpi_dev);
         |                                    ^
>> drivers/perf/nvidia_t410_cmem_latency_pmu.c:619:27: error: incompatible integer to pointer conversion assigning to 'const char *' from 'int' [-Wint-conversion]
     619 |         cmem_lat_pmu->identifier = acpi_device_hid(acpi_dev);
         |                                  ^ ~~~~~~~~~~~~~~~~~~~~~~~~~
   4 errors generated.


vim +/acpi_device_uid +604 drivers/perf/nvidia_t410_cmem_latency_pmu.c

   590	
   591	static int cmem_lat_pmu_probe(struct platform_device *pdev)
   592	{
   593		struct device *dev = &pdev->dev;
   594		struct acpi_device *acpi_dev;
   595		struct cmem_lat_pmu *cmem_lat_pmu;
   596		char *name, *uid_str;
   597		int ret, i;
   598		u32 socket;
   599	
   600		acpi_dev = ACPI_COMPANION(dev);
   601		if (!acpi_dev)
   602			return -ENODEV;
   603	
 > 604		uid_str = acpi_device_uid(acpi_dev);
   605		if (!uid_str)
   606			return -ENODEV;
   607	
   608		ret = kstrtou32(uid_str, 0, &socket);
   609		if (ret)
   610			return ret;
   611	
   612		cmem_lat_pmu = devm_kzalloc(dev, sizeof(*cmem_lat_pmu), GFP_KERNEL);
   613		name = devm_kasprintf(dev, GFP_KERNEL, "nvidia_cmem_latency_pmu_%u", socket);
   614		if (!cmem_lat_pmu || !name)
   615			return -ENOMEM;
   616	
   617		cmem_lat_pmu->dev = dev;
   618		cmem_lat_pmu->name = name;
 > 619		cmem_lat_pmu->identifier = acpi_device_hid(acpi_dev);
   620		platform_set_drvdata(pdev, cmem_lat_pmu);
   621	
   622		cmem_lat_pmu->pmu = (struct pmu) {
   623			.parent		= &pdev->dev,
   624			.task_ctx_nr	= perf_invalid_context,
   625			.pmu_enable	= cmem_lat_pmu_enable,
   626			.pmu_disable	= cmem_lat_pmu_disable,
   627			.event_init	= cmem_lat_pmu_event_init,
   628			.add		= cmem_lat_pmu_add,
   629			.del		= cmem_lat_pmu_del,
   630			.start		= cmem_lat_pmu_start,
   631			.stop		= cmem_lat_pmu_stop,
   632			.read		= cmem_lat_pmu_read,
   633			.attr_groups	= cmem_lat_pmu_attr_groups,
   634			.capabilities	= PERF_PMU_CAP_NO_EXCLUDE |
   635						PERF_PMU_CAP_NO_INTERRUPT,
   636		};
   637	
   638		/* Map the address of all the instances plus one for the broadcast. */
   639		for (i = 0; i < NUM_INSTANCES + 1; i++) {
   640			cmem_lat_pmu->base[i] = devm_platform_ioremap_resource(pdev, i);
   641			if (IS_ERR(cmem_lat_pmu->base[i])) {
   642				dev_err(dev, "Failed map address for instance %d\n", i);
   643				return PTR_ERR(cmem_lat_pmu->base[i]);
   644			}
   645		}
   646	
   647		ret = cmem_lat_pmu_get_cpus(cmem_lat_pmu, socket);
   648		if (ret)
   649			return ret;
   650	
   651		ret = cpuhp_state_add_instance(cmem_lat_pmu_cpuhp_state,
   652					       &cmem_lat_pmu->node);
   653		if (ret) {
   654			dev_err(&pdev->dev, "Error %d registering hotplug\n", ret);
   655			return ret;
   656		}
   657	
   658		cmem_lat_pmu_cg_ctrl(cmem_lat_pmu, CG_CTRL_ENABLE);
   659		cmem_lat_pmu_ctrl(cmem_lat_pmu, CTRL_CLR);
   660		cmem_lat_pmu_cg_ctrl(cmem_lat_pmu, CG_CTRL_DISABLE);
   661	
   662		ret = perf_pmu_register(&cmem_lat_pmu->pmu, name, -1);
   663		if (ret) {
   664			dev_err(&pdev->dev, "Failed to register PMU: %d\n", ret);
   665			cpuhp_state_remove_instance(cmem_lat_pmu_cpuhp_state,
   666						    &cmem_lat_pmu->node);
   667			return ret;
   668		}
   669	
   670		dev_dbg(&pdev->dev, "Registered %s PMU\n", name);
   671	
   672		return 0;
   673	}
   674	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
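
For context on the errors above: acpi_device_uid() and acpi_device_hid()
are declared only when CONFIG_ACPI is enabled, so the failing randconfig
almost certainly has ACPI disabled. A minimal sketch of one way to keep
such a probe path compiling without ACPI; the wrapper is hypothetical, and
the simpler alternative would be an ACPI dependency in Kconfig (both are
assumptions, not something taken from this series):

#include <linux/acpi.h>
#include <linux/device.h>

/*
 * Sketch only: wrap the ACPI-only helper so a !CONFIG_ACPI build still
 * compiles; acpi_device_uid() has no !ACPI stub, hence the guard.
 */
static const char *cmem_lat_pmu_uid(struct device *dev)
{
#ifdef CONFIG_ACPI
	struct acpi_device *adev = ACPI_COMPANION(dev);

	return adev ? acpi_device_uid(adev) : NULL;
#else
	return NULL;
#endif
}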