Add Unified Coherence Fabric (UCF) PMU support for the NVIDIA Tegra410 SoC.
Signed-off-by: Besar Wicaksono <bwicaksono@nvidia.com>
---
Documentation/admin-guide/perf/index.rst | 1 +
.../admin-guide/perf/nvidia-tegra410-pmu.rst | 106 ++++++++++++++++++
drivers/perf/arm_cspmu/nvidia_cspmu.c | 90 ++++++++++++++-
3 files changed, 196 insertions(+), 1 deletion(-)
create mode 100644 Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst
diff --git a/Documentation/admin-guide/perf/index.rst b/Documentation/admin-guide/perf/index.rst
index c407bb44b08e..aa12708ddb96 100644
--- a/Documentation/admin-guide/perf/index.rst
+++ b/Documentation/admin-guide/perf/index.rst
@@ -25,6 +25,7 @@ Performance monitor support
alibaba_pmu
dwc_pcie_pmu
nvidia-tegra241-pmu
+ nvidia-tegra410-pmu
meson-ddr-pmu
cxl
ampere_cspmu
diff --git a/Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst b/Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst
new file mode 100644
index 000000000000..7b7ba5700ca1
--- /dev/null
+++ b/Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst
@@ -0,0 +1,106 @@
+=====================================================================
+NVIDIA Tegra410 SoC Uncore Performance Monitoring Unit (PMU)
+=====================================================================
+
+The NVIDIA Tegra410 SoC includes various system PMUs to measure key performance
+metrics like memory bandwidth, latency, and utilization:
+
+* Unified Coherence Fabric (UCF)
+
+PMU Driver
+----------
+
+The PMU driver describes the available events and the configuration of each PMU
+in sysfs. Please see the sections below for the sysfs path of each PMU. Like
+other uncore PMU drivers, the driver provides a "cpumask" sysfs attribute to
+show the CPU ID used to handle the PMU events. There is also an
+"associated_cpus" sysfs attribute, which contains a list of the CPUs associated
+with the PMU instance.
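+
+For example, the CPU that handles events for the UCF PMU instance on socket 0
+(sysfs path as described in the UCF PMU section below) can be read with::
+
+  cat /sys/bus/event_source/devices/nvidia_ucf_pmu_0/cpumask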
+
+UCF PMU
+-------
+
+The Unified Coherence Fabric (UCF) in the NVIDIA Tegra410 SoC serves as a
+distributed last-level cache for CPU memory and CXL memory and as a cache
+coherent interconnect that provides hardware coherence across multiple
+coherently caching agents, including:
+
+ * CPU clusters
+ * GPU
+ * PCIe Ordering Controller Unit (OCU)
+ * Other IO-coherent requesters
+
+The events and configuration options of this PMU device are described in sysfs,
+see /sys/bus/event_source/devices/nvidia_ucf_pmu_<socket-id>.
+
+Some of the events available in this PMU can be used to measure bandwidth and
+utilization:
+
+ * slc_access_rd: count the number of read requests to SLC.
+ * slc_access_wr: count the number of write requests to SLC.
+ * slc_bytes_rd: count the number of bytes transferred by slc_access_rd.
+ * slc_bytes_wr: count the number of bytes transferred by slc_access_wr.
+ * mem_access_rd: count the number of read requests to local or remote memory.
+ * mem_access_wr: count the number of write requests to local or remote memory.
+ * mem_bytes_rd: count the number of bytes transferred by mem_access_rd.
+ * mem_bytes_wr: count the number of bytes transferred by mem_access_wr.
+ * cycles: count the UCF cycles.
+
+The average bandwidth is calculated as::
+
+ AVG_SLC_READ_BANDWIDTH_IN_GBPS = SLC_BYTES_RD / ELAPSED_TIME_IN_NS
+ AVG_SLC_WRITE_BANDWIDTH_IN_GBPS = SLC_BYTES_WR / ELAPSED_TIME_IN_NS
+ AVG_MEM_READ_BANDWIDTH_IN_GBPS = MEM_BYTES_RD / ELAPSED_TIME_IN_NS
+ AVG_MEM_WRITE_BANDWIDTH_IN_GBPS = MEM_BYTES_WR / ELAPSED_TIME_IN_NS
+
+The average request rate is calculated as::
+
+ AVG_SLC_READ_REQUEST_RATE = SLC_ACCESS_RD / CYCLES
+ AVG_SLC_WRITE_REQUEST_RATE = SLC_ACCESS_WR / CYCLES
+ AVG_MEM_READ_REQUEST_RATE = MEM_ACCESS_RD / CYCLES
+ AVG_MEM_WRITE_REQUEST_RATE = MEM_ACCESS_WR / CYCLES
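+
+As an example, the SLC read bandwidth and request rate on socket 0 can be
+estimated over a fixed time window (a hypothetical run; the counter values
+mentioned below are purely illustrative)::
+
+  perf stat -a -e nvidia_ucf_pmu_0/slc_bytes_rd/ \
+                -e nvidia_ucf_pmu_0/slc_access_rd/,nvidia_ucf_pmu_0/cycles/ sleep 1
+
+If slc_bytes_rd reports 2,000,000,000 bytes over the 1,000,000,000 ns window,
+the average SLC read bandwidth is about 2 GB/s. Dividing the slc_access_rd
+count by the cycles count from the same run gives the average SLC read request
+rate.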
+
+More details about the other available events can be found in the Tegra410 SoC
+Technical Reference Manual.
+
+The events can be filtered based on source or destination. The source filter
+indicates the traffic initiator to the SLC, e.g. local CPU, non-CPU device, or
+remote socket. The destination filter specifies the destination memory type,
+e.g. local system memory (CMEM), local GPU memory (GMEM), or remote memory. The
+local/remote classification of the destination filter is based on the home
+socket of the address, not where the data actually resides. The available
+filters are described in
+/sys/bus/event_source/devices/nvidia_ucf_pmu_<socket-id>/format/.
+
+The list of UCF PMU event filters:
+
+* Source filter:
+
+ * src_loc_cpu: if set, count events from the local CPU
+ * src_loc_noncpu: if set, count events from local non-CPU devices
+ * src_rem: if set, count events from CPU, GPU, and PCIe devices of the remote socket
+
+* Destination filter:
+
+ * dst_loc_cmem: if set, count events to local system memory (CMEM) addresses
+ * dst_loc_gmem: if set, count events to local GPU memory (GMEM) addresses
+ * dst_loc_other: if set, count events to local CXL memory addresses
+ * dst_rem: if set, count events to CPU, GPU, and CXL memory addresses of the remote socket
+
+If the source is not specified, the PMU will count events from all sources. If
+the destination is not specified, the PMU will count events to all destinations.
+
+Example usage:
+
+* Count event id 0x0 in socket 0 from all sources and to all destinations::
+
+ perf stat -a -e nvidia_ucf_pmu_0/event=0x0/
+
+* Count event id 0x0 in socket 0 with source filter = local CPU and destination
+ filter = local system memory (CMEM)::
+
+ perf stat -a -e nvidia_ucf_pmu_0/event=0x0,src_loc_cpu=0x1,dst_loc_cmem=0x1/
+
+* Count event id 0x0 in socket 1 with source filter = local non-CPU device and
+ destination filter = remote memory::
+
+ perf stat -a -e nvidia_ucf_pmu_1/event=0x0,src_loc_noncpu=0x1,dst_rem=0x1/
diff --git a/drivers/perf/arm_cspmu/nvidia_cspmu.c b/drivers/perf/arm_cspmu/nvidia_cspmu.c
index e06a06d3407b..c67667097a3c 100644
--- a/drivers/perf/arm_cspmu/nvidia_cspmu.c
+++ b/drivers/perf/arm_cspmu/nvidia_cspmu.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
*/
@@ -21,6 +21,13 @@
#define NV_CNVL_PORT_COUNT 4ULL
#define NV_CNVL_FILTER_ID_MASK GENMASK_ULL(NV_CNVL_PORT_COUNT - 1, 0)
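+/*
+ * UCF filter layout: bits [2:0] select the traffic sources and bits [11:8]
+ * select the memory destinations to be counted. Both groups are enabled by
+ * default, i.e. all sources and all destinations are monitored.
+ */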
+#define NV_UCF_SRC_COUNT 3ULL
+#define NV_UCF_DST_COUNT 4ULL
+#define NV_UCF_FILTER_ID_MASK GENMASK_ULL(11, 0)
+#define NV_UCF_FILTER_SRC GENMASK_ULL(2, 0)
+#define NV_UCF_FILTER_DST GENMASK_ULL(11, 8)
+#define NV_UCF_FILTER_DEFAULT (NV_UCF_FILTER_SRC | NV_UCF_FILTER_DST)
+
#define NV_GENERIC_FILTER_ID_MASK GENMASK_ULL(31, 0)
#define NV_PRODID_MASK (PMIIDR_PRODUCTID | PMIIDR_VARIANT | PMIIDR_REVISION)
@@ -124,6 +131,37 @@ static struct attribute *mcf_pmu_event_attrs[] = {
NULL,
};
+static struct attribute *ucf_pmu_event_attrs[] = {
+ ARM_CSPMU_EVENT_ATTR(bus_cycles, 0x1D),
+
+ ARM_CSPMU_EVENT_ATTR(slc_allocate, 0xF0),
+ ARM_CSPMU_EVENT_ATTR(slc_wb, 0xF3),
+ ARM_CSPMU_EVENT_ATTR(slc_refill_rd, 0x109),
+ ARM_CSPMU_EVENT_ATTR(slc_refill_wr, 0x10A),
+ ARM_CSPMU_EVENT_ATTR(slc_hit_rd, 0x119),
+
+ ARM_CSPMU_EVENT_ATTR(slc_access_dataless, 0x183),
+ ARM_CSPMU_EVENT_ATTR(slc_access_atomic, 0x184),
+
+ ARM_CSPMU_EVENT_ATTR(slc_access, 0xF2),
+ ARM_CSPMU_EVENT_ATTR(slc_access_rd, 0x111),
+ ARM_CSPMU_EVENT_ATTR(slc_access_wr, 0x112),
+ ARM_CSPMU_EVENT_ATTR(slc_bytes_rd, 0x113),
+ ARM_CSPMU_EVENT_ATTR(slc_bytes_wr, 0x114),
+
+ ARM_CSPMU_EVENT_ATTR(mem_access_rd, 0x121),
+ ARM_CSPMU_EVENT_ATTR(mem_access_wr, 0x122),
+ ARM_CSPMU_EVENT_ATTR(mem_bytes_rd, 0x123),
+ ARM_CSPMU_EVENT_ATTR(mem_bytes_wr, 0x124),
+
+ ARM_CSPMU_EVENT_ATTR(local_snoop, 0x180),
+ ARM_CSPMU_EVENT_ATTR(ext_snp_access, 0x181),
+ ARM_CSPMU_EVENT_ATTR(ext_snp_evict, 0x182),
+
+ ARM_CSPMU_EVENT_ATTR(cycles, ARM_CSPMU_EVT_CYCLES_DEFAULT),
+ NULL,
+};
+
static struct attribute *generic_pmu_event_attrs[] = {
ARM_CSPMU_EVENT_ATTR(cycles, ARM_CSPMU_EVT_CYCLES_DEFAULT),
NULL,
@@ -152,6 +190,18 @@ static struct attribute *cnvlink_pmu_format_attrs[] = {
NULL,
};
+static struct attribute *ucf_pmu_format_attrs[] = {
+ ARM_CSPMU_FORMAT_EVENT_ATTR,
+ ARM_CSPMU_FORMAT_ATTR(src_loc_noncpu, "config1:0"),
+ ARM_CSPMU_FORMAT_ATTR(src_loc_cpu, "config1:1"),
+ ARM_CSPMU_FORMAT_ATTR(src_rem, "config1:2"),
+ ARM_CSPMU_FORMAT_ATTR(dst_loc_cmem, "config1:8"),
+ ARM_CSPMU_FORMAT_ATTR(dst_loc_gmem, "config1:9"),
+ ARM_CSPMU_FORMAT_ATTR(dst_loc_other, "config1:10"),
+ ARM_CSPMU_FORMAT_ATTR(dst_rem, "config1:11"),
+ NULL,
+};
+
static struct attribute *generic_pmu_format_attrs[] = {
ARM_CSPMU_FORMAT_EVENT_ATTR,
ARM_CSPMU_FORMAT_FILTER_ATTR,
@@ -236,6 +286,27 @@ static void nv_cspmu_set_cc_filter(struct arm_cspmu *cspmu,
writel(filter, cspmu->base0 + PMCCFILTR);
}
+static u32 ucf_pmu_event_filter(const struct perf_event *event)
+{
+ u32 ret, filter, src, dst;
+
+ filter = nv_cspmu_event_filter(event);
+
+ /* Monitor all sources if none is selected. */
+ src = FIELD_GET(NV_UCF_FILTER_SRC, filter);
+ if (src == 0)
+ src = GENMASK_ULL(NV_UCF_SRC_COUNT - 1, 0);
+
+ /* Monitor all destinations if none is selected. */
+ dst = FIELD_GET(NV_UCF_FILTER_DST, filter);
+ if (dst == 0)
+ dst = GENMASK_ULL(NV_UCF_DST_COUNT - 1, 0);
+
+ ret = FIELD_PREP(NV_UCF_FILTER_SRC, src);
+ ret |= FIELD_PREP(NV_UCF_FILTER_DST, dst);
+
+ return ret;
+}
enum nv_cspmu_name_fmt {
NAME_FMT_GENERIC,
@@ -342,6 +413,23 @@ static const struct nv_cspmu_match nv_cspmu_match[] = {
.init_data = NULL
},
},
+ {
+ .prodid = 0x2CF20000,
+ .prodid_mask = NV_PRODID_MASK,
+ .name_pattern = "nvidia_ucf_pmu_%u",
+ .name_fmt = NAME_FMT_SOCKET,
+ .template_ctx = {
+ .event_attr = ucf_pmu_event_attrs,
+ .format_attr = ucf_pmu_format_attrs,
+ .filter_mask = NV_UCF_FILTER_ID_MASK,
+ .filter_default_val = NV_UCF_FILTER_DEFAULT,
+ .filter2_mask = 0x0,
+ .filter2_default_val = 0x0,
+ .get_filter = ucf_pmu_event_filter,
+ .get_filter2 = NULL,
+ .init_data = NULL
+ },
+ },
{
.prodid = 0,
.prodid_mask = 0,
--
2.43.0