This patch extends the DEV_QUERY ioctl to return information about the
performance counter setup for userspace, and introduces the new
ioctl DRM_PANTHOR_PERF_CONTROL in order to allow for the sampling of
performance counters.
The new design is inspired by the perf aux ringbuffer [0], with the
insert and extract indices being mapped to userspace, allowing
multiple samples to be exposed at any given time. To avoid pointer
chasing, the sample metadata and block metadata are inline with
the elements they describe.
Userspace is responsible for passing in resources for samples to be
exposed, including the event file descriptor for notification of new
sample availability, the ringbuffer BO to store samples, and the
control BO along with the offset for mapping the insert and extract
indices. Though these indices are only a total of 16 bytes, userspace
can then reuse the same physical page for tracking the state of
multiple buffers by giving different offsets from the BO start to
map them.
[0]: https://docs.kernel.org/userspace-api/perf_ring_buffer.html
Co-developed-by: Mihail Atanassov <mihail.atanassov@arm.com>
Signed-off-by: Mihail Atanassov <mihail.atanassov@arm.com>
Signed-off-by: Lukas Zapolskas <lukas.zapolskas@arm.com>
Reviewed-by: Adrián Larumbe <adrian.larumbe@collabora.com>
---
include/uapi/drm/panthor_drm.h | 565 +++++++++++++++++++++++++++++++++
1 file changed, 565 insertions(+)
diff --git a/include/uapi/drm/panthor_drm.h b/include/uapi/drm/panthor_drm.h
index e238c6264fa1..d1a92172e878 100644
--- a/include/uapi/drm/panthor_drm.h
+++ b/include/uapi/drm/panthor_drm.h
@@ -154,6 +154,9 @@ enum drm_panthor_ioctl_id {
* This is useful for imported BOs.
*/
DRM_PANTHOR_BO_QUERY_INFO,
+
+ /** @DRM_PANTHOR_PERF_CONTROL: Control a performance counter session. */
+ DRM_PANTHOR_PERF_CONTROL,
};
/**
@@ -253,6 +256,9 @@ enum drm_panthor_dev_query_type {
* @DRM_PANTHOR_DEV_QUERY_GROUP_PRIORITIES_INFO: Query allowed group priorities information.
*/
DRM_PANTHOR_DEV_QUERY_GROUP_PRIORITIES_INFO,
+
+ /** @DRM_PANTHOR_DEV_QUERY_PERF_INFO: Query performance counter interface information. */
+ DRM_PANTHOR_DEV_QUERY_PERF_INFO,
};
/**
@@ -445,6 +451,135 @@ struct drm_panthor_group_priorities_info {
__u8 pad[3];
};
+/**
+ * enum drm_panthor_perf_feat_flags - Performance counter configuration feature flags.
+ */
+enum drm_panthor_perf_feat_flags {
+ /** @DRM_PANTHOR_PERF_BLOCK_STATES_SUPPORT: Coarse-grained block states are supported. */
+ DRM_PANTHOR_PERF_BLOCK_STATES_SUPPORT = 1 << 0,
+};
+
+/**
+ * enum drm_panthor_perf_block_type - Performance counter supported block types.
+ */
+enum drm_panthor_perf_block_type {
+ /** @DRM_PANTHOR_PERF_BLOCK_METADATA: Internal use only. */
+ DRM_PANTHOR_PERF_BLOCK_METADATA = 0,
+
+ /** @DRM_PANTHOR_PERF_BLOCK_FW: The FW counter block. */
+ DRM_PANTHOR_PERF_BLOCK_FW,
+
+ /** @DRM_PANTHOR_PERF_BLOCK_CSHW: The CSHW counter block. */
+ DRM_PANTHOR_PERF_BLOCK_CSHW,
+
+ /** @DRM_PANTHOR_PERF_BLOCK_TILER: The tiler counter block. */
+ DRM_PANTHOR_PERF_BLOCK_TILER,
+
+ /** @DRM_PANTHOR_PERF_BLOCK_MEMSYS: A memsys counter block. */
+ DRM_PANTHOR_PERF_BLOCK_MEMSYS,
+
+ /** @DRM_PANTHOR_PERF_BLOCK_SHADER: A shader core counter block. */
+ DRM_PANTHOR_PERF_BLOCK_SHADER,
+
+ /** @DRM_PANTHOR_PERF_BLOCK_FIRST: Internal use only. */
+ DRM_PANTHOR_PERF_BLOCK_FIRST = DRM_PANTHOR_PERF_BLOCK_FW,
+
+ /** @DRM_PANTHOR_PERF_BLOCK_LAST: Internal use only. */
+ DRM_PANTHOR_PERF_BLOCK_LAST = DRM_PANTHOR_PERF_BLOCK_SHADER,
+
+ /** @DRM_PANTHOR_PERF_BLOCK_MAX: Internal use only. */
+ DRM_PANTHOR_PERF_BLOCK_MAX = DRM_PANTHOR_PERF_BLOCK_LAST + 1,
+};
+
+/**
+ * enum drm_panthor_perf_clock - Identifier of the clock used to produce the cycle count values
+ * in a given block.
+ *
+ * Since the integrator has the choice of using one or more clocks, there may be some confusion
+ * as to which blocks are counted by which clock values unless this information is explicitly
+ * provided as part of every block sample. Not every clock listed here is necessarily present: in
+ * the simplest case, all cycle counts will be associated with the top-level clock.
+ */
+enum drm_panthor_perf_clock {
+ /** @DRM_PANTHOR_PERF_CLOCK_TOPLEVEL: Top-level CSF clock. */
+ DRM_PANTHOR_PERF_CLOCK_TOPLEVEL,
+
+ /**
+ * @DRM_PANTHOR_PERF_CLOCK_COREGROUP: Core group clock, responsible for the MMU, L2
+ * caches and the tiler.
+ */
+ DRM_PANTHOR_PERF_CLOCK_COREGROUP,
+
+ /** @DRM_PANTHOR_PERF_CLOCK_SHADER: Clock for the shader cores. */
+ DRM_PANTHOR_PERF_CLOCK_SHADER,
+};
+
+/**
+ * struct drm_panthor_perf_info - Performance counter interface information
+ *
+ * Structure grouping all queryable information relating to the performance counter
+ * interfaces.
+ */
+struct drm_panthor_perf_info {
+ /**
+ * @counters_per_block: The number of 8-byte counters available in a block.
+ */
+ __u32 counters_per_block;
+
+ /**
+ * @sample_header_size: The size of the header struct available at the beginning
+ * of every sample.
+ */
+ __u32 sample_header_size;
+
+ /**
+ * @block_header_size: The size of the header struct inline with the counters for a
+ * single block.
+ */
+ __u32 block_header_size;
+
+ /**
+ * @sample_size: The size of a fully annotated sample, starting with a sample header
+ * of size @sample_header_size bytes, and all available blocks for the current
+ * configuration, each comprised of @counters_per_block 64-bit counters and
+ * a block header of @block_header_size bytes.
+ *
+ * The user must use this field to allocate size for the ring buffer. In
+ * the case of new blocks being added, an old userspace can always use
+ * this field and ignore any blocks it does not know about.
+ */
+ __u32 sample_size;
+
+ /** @flags: Combination of drm_panthor_perf_feat_flags flags. */
+ __u32 flags;
+
+ /**
+ * @supported_clocks: Bitmask of the clocks supported by the GPU.
+ *
+ * Each bit represents a variant of the enum drm_panthor_perf_clock.
+ *
+ * For the same GPU, different implementers may have different clocks for the same hardware
+ * block. At the moment, up to three clocks are supported, and any clocks that are present
+ * will be reported here.
+ */
+ __u32 supported_clocks;
+
+ /** @fw_blocks: Number of FW blocks available. */
+ __u32 fw_blocks;
+
+ /** @cshw_blocks: Number of CSHW blocks available. */
+ __u32 cshw_blocks;
+
+ /** @tiler_blocks: Number of tiler blocks available. */
+ __u32 tiler_blocks;
+
+ /** @memsys_blocks: Number of memsys blocks available. */
+ __u32 memsys_blocks;
+
+ /** @shader_blocks: Number of shader core blocks available. */
+ __u32 shader_blocks;
+};
+
/**
* struct drm_panthor_dev_query - Arguments passed to DRM_PANTHOR_IOCTL_DEV_QUERY
*/
@@ -1187,6 +1322,434 @@ struct drm_panthor_bo_query_info {
__u32 pad;
};
+/**
+ * DOC: Performance counter decoding in userspace.
+ *
+ * Each sample will be exposed to userspace in the following manner:
+ *
+ * +--------+--------+------------------------+--------+-------------------------+-----+
+ * | Sample | Block | Block | Block | Block | ... |
+ * | header | header | counters | header | counters | |
+ * +--------+--------+------------------------+--------+-------------------------+-----+
+ *
+ * Each sample will start with a sample header of type struct drm_panthor_perf_sample_header,
+ * providing sample-wide information like the start and end timestamps, the counter set currently
+ * configured, and any errors that may have occurred during sampling.
+ *
+ * After the fixed size header, the sample will consist of blocks of
+ * drm_panthor_perf_info::counters_per_block 64-bit counters, each prefaced with a
+ * header of its own, indicating source block type, as well as the cycle count needed to normalize
+ * cycle values within that block, and a clock source identifier.
+ */
+
+/**
+ * enum drm_panthor_perf_block_state - Bitmask of the power and execution states that an individual
+ * hardware block went through in a sampling period.
+ *
+ * Because the sampling period is controlled from userspace, the block may undergo multiple
+ * state transitions, so this must be interpreted as one or more such transitions occurring.
+ */
+enum drm_panthor_perf_block_state {
+ /**
+ * @DRM_PANTHOR_PERF_BLOCK_STATE_UNKNOWN: The state of this block was unknown during
+ * the sampling period.
+ */
+ DRM_PANTHOR_PERF_BLOCK_STATE_UNKNOWN = 0,
+
+ /**
+ * @DRM_PANTHOR_PERF_BLOCK_STATE_ON: This block was powered on for some or all of
+ * the sampling period.
+ */
+ DRM_PANTHOR_PERF_BLOCK_STATE_ON = 1 << 0,
+
+ /**
+ * @DRM_PANTHOR_PERF_BLOCK_STATE_OFF: This block was powered off for some or all of the
+ * sampling period.
+ */
+ DRM_PANTHOR_PERF_BLOCK_STATE_OFF = 1 << 1,
+
+ /**
+ * @DRM_PANTHOR_PERF_BLOCK_STATE_AVAILABLE: This block was available for execution for
+ * some or all of the sampling period.
+ */
+ DRM_PANTHOR_PERF_BLOCK_STATE_AVAILABLE = 1 << 2,
+ /**
+ * @DRM_PANTHOR_PERF_BLOCK_STATE_UNAVAILABLE: This block was unavailable for execution for
+ * some or all of the sampling period.
+ */
+ DRM_PANTHOR_PERF_BLOCK_STATE_UNAVAILABLE = 1 << 3,
+
+ /**
+ * @DRM_PANTHOR_PERF_BLOCK_STATE_NORMAL: This block was executing in normal mode
+ * for some or all of the sampling period.
+ */
+ DRM_PANTHOR_PERF_BLOCK_STATE_NORMAL = 1 << 4,
+
+ /**
+ * @DRM_PANTHOR_PERF_BLOCK_STATE_PROTECTED: This block was executing in protected mode
+ * for some or all of the sampling period.
+ */
+ DRM_PANTHOR_PERF_BLOCK_STATE_PROTECTED = 1 << 5,
+};
+
+/**
+ * struct drm_panthor_perf_block_header - Header present before every block in the
+ * sample ringbuffer.
+ */
+struct drm_panthor_perf_block_header {
+ /** @block_type: Type of the block. */
+ __u8 block_type;
+
+ /** @block_idx: Block index. */
+ __u8 block_idx;
+
+ /**
+ * @block_states: Coarse-grained block transitions, bitmask of enum
+ * drm_panthor_perf_block_state.
+ */
+ __u8 block_states;
+
+ /**
+ * @clock: Clock used to produce the cycle count for this block, taken from
+ * enum drm_panthor_perf_clock. The cycle counts are stored in the sample header.
+ */
+ __u8 clock;
+
+ /** @pad: MBZ. */
+ __u8 pad[4];
+
+ /** @enable_mask: Bitmask of counters requested during the session setup. */
+ __u64 enable_mask[2];
+};
+
+/**
+ * enum drm_panthor_perf_sample_flags - Sample-wide events that occurred over the sampling
+ * period.
+ */
+enum drm_panthor_perf_sample_flags {
+ /**
+ * @DRM_PANTHOR_PERF_SAMPLE_OVERFLOW: This sample contains overflows due to the duration
+ * of the sampling period.
+ */
+ DRM_PANTHOR_PERF_SAMPLE_OVERFLOW = 1 << 0,
+
+ /**
+ * @DRM_PANTHOR_PERF_SAMPLE_ERROR: This sample encountered an error condition during
+ * the sample duration.
+ */
+ DRM_PANTHOR_PERF_SAMPLE_ERROR = 1 << 1,
+};
+
+/**
+ * struct drm_panthor_perf_sample_header - Header present before every sample.
+ */
+struct drm_panthor_perf_sample_header {
+ /**
+ * @timestamp_start_ns: Earliest timestamp that values in this sample represent, in
+ * nanoseconds. Derived from CLOCK_MONOTONIC_RAW.
+ */
+ __u64 timestamp_start_ns;
+
+ /**
+ * @timestamp_end_ns: Latest timestamp that values in this sample represent, in
+ * nanoseconds. Derived from CLOCK_MONOTONIC_RAW.
+ */
+ __u64 timestamp_end_ns;
+
+ /** @block_set: Set of performance counter blocks. */
+ __u8 block_set;
+
+ /** @pad: MBZ. */
+ __u8 pad[3];
+
+ /** @flags: Current sample flags, combination of drm_panthor_perf_sample_flags. */
+ __u32 flags;
+
+ /**
+ * @user_data: User data provided as part of the command that triggered this sample.
+ *
+ * - Automatic samples (periodic ones or those around non-counting periods or power state
+ *   transitions) will be tagged with the user_data provided as part of the
+ *   DRM_PANTHOR_PERF_COMMAND_START call.
+ * - Manual samples will be tagged with the user_data provided with the
+ *   DRM_PANTHOR_PERF_COMMAND_SAMPLE call.
+ * - A session's final automatic sample will be tagged with the user_data provided with the
+ *   DRM_PANTHOR_PERF_COMMAND_STOP call.
+ */
+ __u64 user_data;
+
+ /**
+ * @toplevel_clock_cycles: The number of cycles elapsed between
+ * drm_panthor_perf_sample_header::timestamp_start_ns and
+ * drm_panthor_perf_sample_header::timestamp_end_ns on the top-level clock if the
+ * corresponding bit is set in drm_panthor_perf_info::supported_clocks.
+ */
+ __u64 toplevel_clock_cycles;
+
+ /**
+ * @coregroup_clock_cycles: The number of cycles elapsed between
+ * drm_panthor_perf_sample_header::timestamp_start_ns and
+ * drm_panthor_perf_sample_header::timestamp_end_ns on the coregroup clock if the
+ * corresponding bit is set in drm_panthor_perf_info::supported_clocks.
+ */
+ __u64 coregroup_clock_cycles;
+
+ /**
+ * @shader_clock_cycles: The number of cycles elapsed between
+ * drm_panthor_perf_sample_header::timestamp_start_ns and
+ * drm_panthor_perf_sample_header::timestamp_end_ns on the shader core clock if the
+ * corresponding bit is set in drm_panthor_perf_info::supported_clocks.
+ */
+ __u64 shader_clock_cycles;
+};
+
+/**
+ * enum drm_panthor_perf_command - Command type passed to the DRM_PANTHOR_PERF_CONTROL
+ * IOCTL.
+ */
+enum drm_panthor_perf_command {
+ /** @DRM_PANTHOR_PERF_COMMAND_SETUP: Create a new performance counter sampling context. */
+ DRM_PANTHOR_PERF_COMMAND_SETUP,
+
+ /** @DRM_PANTHOR_PERF_COMMAND_TEARDOWN: Teardown a performance counter sampling context. */
+ DRM_PANTHOR_PERF_COMMAND_TEARDOWN,
+
+ /** @DRM_PANTHOR_PERF_COMMAND_START: Start a sampling session on the indicated context. */
+ DRM_PANTHOR_PERF_COMMAND_START,
+
+ /** @DRM_PANTHOR_PERF_COMMAND_STOP: Stop the sampling session on the indicated context. */
+ DRM_PANTHOR_PERF_COMMAND_STOP,
+
+ /**
+ * @DRM_PANTHOR_PERF_COMMAND_SAMPLE: Request a manual sample on the indicated context.
+ *
+ * When the sampling session is configured with a non-zero sampling period
+ * (drm_panthor_perf_cmd_setup::sample_freq_ns), any DRM_PANTHOR_PERF_CONTROL call with
+ * this command will be rejected with -EINVAL.
+ */
+ DRM_PANTHOR_PERF_COMMAND_SAMPLE,
+};
+
+/**
+ * struct drm_panthor_perf_control - Arguments passed to DRM_PANTHOR_IOCTL_PERF_CONTROL.
+ */
+struct drm_panthor_perf_control {
+ /** @cmd: Command from enum drm_panthor_perf_command. */
+ __u32 cmd;
+
+ /**
+ * @handle: session handle.
+ *
+ * Returned by the DRM_PANTHOR_PERF_COMMAND_SETUP call.
+ * It must be used in subsequent commands for the same context.
+ */
+ __u32 handle;
+
+ /**
+ * @size: size of the command structure.
+ *
+ * If the pointer is NULL, the size is updated by the driver to provide the size of the
+ * output structure. If the pointer is not NULL, the driver will only copy min(size,
+ * struct_size) to the pointer and update the size accordingly.
+ */
+ __u64 size;
+
+ /**
+ * @pointer: user pointer to a command type struct, such as
+ * &struct drm_panthor_perf_cmd_start.
+ */
+ __u64 pointer;
+};
+
+/**
+ * enum drm_panthor_perf_counter_set - The counter set to be requested from the hardware.
+ *
+ * The hardware supports a single performance counter set at a time, so requesting any set other
+ * than the primary may fail if another process is sampling at the same time.
+ *
+ * If in doubt, the primary counter set has the most commonly used counters and requires no
+ * additional permissions to open.
+ */
+enum drm_panthor_perf_counter_set {
+ /**
+ * @DRM_PANTHOR_PERF_SET_PRIMARY: The default set configured on the hardware.
+ *
+ * This is the only set for which all counters in all blocks are defined.
+ */
+ DRM_PANTHOR_PERF_SET_PRIMARY,
+
+ /**
+ * @DRM_PANTHOR_PERF_SET_SECONDARY: The secondary performance counter set.
+ *
+ * Some blocks may not have any defined counters for this set, and the block will
+ * have the UNAVAILABLE block state permanently set in the block header.
+ *
+ * Accessing this set requires the calling process to have the CAP_PERFMON capability.
+ */
+ DRM_PANTHOR_PERF_SET_SECONDARY,
+
+ /**
+ * @DRM_PANTHOR_PERF_SET_TERTIARY: The tertiary performance counter set.
+ *
+ * Some blocks may not have any defined counters for this set, and the block will have
+ * the UNAVAILABLE block state permanently set in the block header. Note that the
+ * tertiary set has the fewest defined counter blocks.
+ *
+ * Accessing this set requires the calling process to have the CAP_PERFMON capability.
+ */
+ DRM_PANTHOR_PERF_SET_TERTIARY,
+};
+
+/**
+ * struct drm_panthor_perf_ringbuf_control - Struct used to map in the ring buffer control indices
+ * into memory shared between user and kernel.
+ *
+ */
+struct drm_panthor_perf_ringbuf_control {
+ /**
+ * @extract_idx: The index of the latest sample that was processed by userspace. Only
+ * modifiable by userspace.
+ */
+ __u64 extract_idx;
+
+ /**
+ * @insert_idx: The index of the latest sample emitted by the kernel. Only
+ * modifiable by the kernel.
+ */
+ __u64 insert_idx;
+};
+
+/**
+ * struct drm_panthor_perf_cmd_setup - Arguments passed to DRM_PANTHOR_IOCTL_PERF_CONTROL
+ * when the DRM_PANTHOR_PERF_COMMAND_SETUP command is specified.
+ */
+struct drm_panthor_perf_cmd_setup {
+ /**
+ * @block_set: Set of performance counter blocks, member of
+ * enum drm_panthor_perf_counter_set.
+ *
+ * This is a global configuration and only one set can be active at a time. If
+ * another client has already requested a counter set, any further requests
+ * for a different counter set will fail and return an -EBUSY.
+ *
+ * If the requested set does not exist, the request will fail and return an -EINVAL.
+ *
+ * Some sets have additional requirements to be enabled, and the setup request will
+ * fail with an -EACCES if these requirements are not satisfied.
+ */
+ __u8 block_set;
+
+ /** @pad: MBZ. */
+ __u8 pad[7];
+
+ /** @fd: eventfd for signalling the availability of a new sample. */
+ __u32 fd;
+
+ /** @ringbuf_handle: Handle to the BO to write perf counter sample to. */
+ __u32 ringbuf_handle;
+
+ /**
+ * @control_handle: Handle to the BO containing a contiguous 16 byte range, used for the
+ * insert and extract indices for the ringbuffer.
+ */
+ __u32 control_handle;
+
+ /**
+ * @sample_slots: The number of slots available in the userspace-provided BO. Must be
+ * a power of 2.
+ *
+ * If sample_slots * sample_size does not match the BO size, the setup request will fail.
+ */
+ __u32 sample_slots;
+
+ /**
+ * @control_offset: Offset into the control BO where the insert and extract indices are
+ * located.
+ */
+ __u64 control_offset;
+
+ /**
+ * @sample_freq_ns: Period between automatic counter sample collection in nanoseconds. Zero
+ * disables automatic collection and all collection must be done through explicit calls
+ * to DRM_PANTHOR_PERF_CONTROL with the DRM_PANTHOR_PERF_COMMAND_SAMPLE command. Non-zero
+ * values will disable manual counter sampling via the DRM_PANTHOR_PERF_COMMAND_SAMPLE command.
+ *
+ * A value of zero only disables software-triggered periodic sampling: hardware will still
+ * trigger automatic samples on certain events, including shader core power transitions, and
+ * entries to and exits from non-counting periods. The final stop command will also
+ * trigger a sample to ensure no data is lost.
+ */
+ __u64 sample_freq_ns;
+
+ /**
+ * @fw_enable_mask: Bitmask of counters to request from the FW counter block. Any bits
+ * past the first drm_panthor_perf_info.counters_per_block bits will be ignored. Bit 0
+ * corresponds to counter 0.
+ */
+ __u64 fw_enable_mask[2];
+
+ /**
+ * @cshw_enable_mask: Bitmask of counters to request from the CSHW counter block. Any bits
+ * past the first drm_panthor_perf_info.counters_per_block bits will be ignored. Bit 0
+ * corresponds to counter 0.
+ */
+ __u64 cshw_enable_mask[2];
+
+ /**
+ * @tiler_enable_mask: Bitmask of counters to request from the tiler counter block. Any
+ * bits past the first drm_panthor_perf_info.counters_per_block bits will be ignored. Bit
+ * 0 corresponds to counter 0.
+ */
+ __u64 tiler_enable_mask[2];
+
+ /**
+ * @memsys_enable_mask: Bitmask of counters to request from the memsys counter blocks. Any
+ * bits past the first drm_panthor_perf_info.counters_per_block bits will be ignored. Bit 0
+ * corresponds to counter 0.
+ */
+ __u64 memsys_enable_mask[2];
+
+ /**
+ * @shader_enable_mask: Bitmask of counters to request from the shader core counter blocks.
+ * Any bits past the first drm_panthor_perf_info.counters_per_block bits will be ignored.
+ * Bit 0 corresponds to counter 0.
+ */
+ __u64 shader_enable_mask[2];
+};
+
+/**
+ * struct drm_panthor_perf_cmd_start - Arguments passed to DRM_PANTHOR_IOCTL_PERF_CONTROL
+ * when the DRM_PANTHOR_PERF_COMMAND_START command is specified.
+ */
+struct drm_panthor_perf_cmd_start {
+ /**
+ * @user_data: User provided data that will be attached to automatic samples collected
+ * until the next DRM_PANTHOR_PERF_COMMAND_STOP.
+ */
+ __u64 user_data;
+};
+
+/**
+ * struct drm_panthor_perf_cmd_stop - Arguments passed to DRM_PANTHOR_IOCTL_PERF_CONTROL
+ * when the DRM_PANTHOR_PERF_COMMAND_STOP command is specified.
+ */
+struct drm_panthor_perf_cmd_stop {
+ /**
+ * @user_data: User provided data that will be attached to the automatic sample collected
+ * at the end of this sampling session.
+ */
+ __u64 user_data;
+};
+
+/**
+ * struct drm_panthor_perf_cmd_sample - Arguments passed to DRM_PANTHOR_IOCTL_PERF_CONTROL
+ * when the DRM_PANTHOR_PERF_COMMAND_SAMPLE command is specified.
+ */
+struct drm_panthor_perf_cmd_sample {
+ /** @user_data: User provided data that will be attached to the sample. */
+ __u64 user_data;
+};
+
/**
* DRM_IOCTL_PANTHOR() - Build a Panthor IOCTL number
* @__access: Access type. Must be R, W or RW.
@@ -1237,6 +1800,8 @@ enum {
DRM_IOCTL_PANTHOR(WR, BO_SYNC, bo_sync),
DRM_IOCTL_PANTHOR_BO_QUERY_INFO =
DRM_IOCTL_PANTHOR(WR, BO_QUERY_INFO, bo_query_info),
+ DRM_IOCTL_PANTHOR_PERF_CONTROL =
+ DRM_IOCTL_PANTHOR(WR, PERF_CONTROL, perf_control)
};
#if defined(__cplusplus)
--
2.33.0.dirty
On Mon, 15 Dec 2025 17:14:47 +0000
Lukas Zapolskas <lukas.zapolskas@arm.com> wrote:
> +/**
> + * enum drm_panthor_perf_block_type - Performance counter supported block types.
> + */
> +enum drm_panthor_perf_block_type {
> + /** @DRM_PANTHOR_PERF_BLOCK_METADATA: Internal use only. */
> + DRM_PANTHOR_PERF_BLOCK_METADATA = 0,
If it's internal-use-only, it shouldn't be exposed here. That's another
case of "I want an HW-agnostic uAPI so I can reconcile things up in
the KMD if things change on new HW, but I want it to match the current
HW mapping because it's simpler". If we really want to standardize the
block types that are exposed to the user (and after discussing it with
you, it seems you have valid reasons to want that), I'd rather have a
tightly packed enum with only the block types visible to the user. It
looks like METADATA is the only block we want to hide at the moment, so
it shouldn't be too hard to special-case this block.
> +
> + /** @DRM_PANTHOR_PERF_BLOCK_FW: The FW counter block. */
> + DRM_PANTHOR_PERF_BLOCK_FW,
Let's prefix those values with DRM_PANTHOR_PERF_BLOCK_TYPE_ to avoid
the blk vs blk-type confusion.
> +
> + /** @DRM_PANTHOR_PERF_BLOCK_CSHW: The CSHW counter block. */
> + DRM_PANTHOR_PERF_BLOCK_CSHW,
> +
> + /** @DRM_PANTHOR_PERF_BLOCK_TILER: The tiler counter block. */
> + DRM_PANTHOR_PERF_BLOCK_TILER,
> +
> + /** @DRM_PANTHOR_PERF_BLOCK_MEMSYS: A memsys counter block. */
> + DRM_PANTHOR_PERF_BLOCK_MEMSYS,
> +
> + /** @DRM_PANTHOR_PERF_BLOCK_SHADER: A shader core counter block. */
> + DRM_PANTHOR_PERF_BLOCK_SHADER,
> +
> + /** @DRM_PANTHOR_PERF_BLOCK_FIRST: Internal use only. */
> + DRM_PANTHOR_PERF_BLOCK_FIRST = DRM_PANTHOR_PERF_BLOCK_FW,
> +
> + /** @DRM_PANTHOR_PERF_BLOCK_LAST: Internal use only. */
> + DRM_PANTHOR_PERF_BLOCK_LAST = DRM_PANTHOR_PERF_BLOCK_SHADER,
> +
> + /** @DRM_PANTHOR_PERF_BLOCK_MAX: Internal use only. */
> + DRM_PANTHOR_PERF_BLOCK_MAX = DRM_PANTHOR_PERF_BLOCK_LAST + 1,
I think I'd just go for
DRM_PANTHOR_PERF_BLOCK_TYPE_COUNT,
and drop the DRM_PANTHOR_PERF_BLOCK_{FIRST,LAST,MAX} definitions, with a
comment stating that DRM_PANTHOR_PERF_BLOCK_TYPE_COUNT must stay last
in this enum.
> +};
Hi Lukas, kernel test robot noticed the following build warnings: [auto build test WARNING on drm-misc/drm-misc-next] [also build test WARNING on next-20251219] [cannot apply to linus/master v6.19-rc2] [If your patch is applied to the wrong git tree, kindly drop us a note. And when submitting patch, we suggest to use '--base' as documented in https://git-scm.com/docs/git-format-patch#_base_tree_information] url: https://github.com/intel-lab-lkp/linux/commits/Lukas-Zapolskas/drm-panthor-Add-performance-counter-uAPI/20251216-012117 base: https://gitlab.freedesktop.org/drm/misc/kernel.git drm-misc-next patch link: https://lore.kernel.org/r/20251215171453.2506348-2-lukas.zapolskas%40arm.com patch subject: [PATCH v6 1/7] drm/panthor: Add performance counter uAPI reproduce: (https://download.01.org/0day-ci/archive/20251222/202512221946.gSB6iMCE-lkp@intel.com/reproduce) If you fix the issue in a separate patch/commit (i.e. not just a new version of the same patch/commit), kindly add following tags | Reported-by: kernel test robot <lkp@intel.com> | Closes: https://lore.kernel.org/oe-kbuild-all/202512221946.gSB6iMCE-lkp@intel.com/ All warnings (new ones prefixed by >>): ERROR: Cannot find file ./include/linux/backlight.h ERROR: Cannot find file ./include/linux/backlight.h WARNING: No kernel-doc for file ./include/linux/backlight.h WARNING: ./include/uapi/drm/panthor_drm.h:381 struct member 'selected_coherency' not described in 'drm_panthor_gpu_info' WARNING: ./include/uapi/drm/panthor_drm.h:381 struct member 'selected_coherency' not described in 'drm_panthor_gpu_info' >> Documentation/gpu/driver-uapi:24: ./include/uapi/drm/panthor_drm.h:1473: WARNING: Bullet list ends without a blank line; unexpected unindent. [docutils] Documentation/gpu/drm-kms:360: ./drivers/gpu/drm/drm_fourcc.c:397: WARNING: Duplicate C declaration, also defined at gpu/drm-kms:35. Declaration is '.. c:function:: const struct drm_format_info * drm_format_info (u32 format)'. 
[duplicate_declaration.c] Documentation/gpu/drm-kms:491: ./drivers/gpu/drm/drm_modeset_lock.c:377: WARNING: Duplicate C declaration, also defined at gpu/drm-kms:48. Declaration is '.. c:function:: int drm_modeset_lock (struct drm_modeset_lock *lock, struct drm_modeset_acquire_ctx *ctx)'. [duplicate_declaration.c] ERROR: Cannot find file ./include/linux/hdmi.h -- 0-DAY CI Kernel Test Service https://github.com/intel/lkp-tests/wiki
Hi Lukas,
On Mon, 15 Dec 2025 17:14:47 +0000
Lukas Zapolskas <lukas.zapolskas@arm.com> wrote:
> This patch extends the DEV_QUERY ioctl to return information about the
> performance counter setup for userspace, and introduces the new
> ioctl DRM_PANTHOR_PERF_CONTROL in order to allow for the sampling of
> performance counters.
>
> The new design is inspired by the perf aux ringbuffer [0], with the
> insert and extract indices being mapped to userspace, allowing
> multiple samples to be exposed at any given time. To avoid pointer
> chasing, the sample metadata and block metadata are inline with
> the elements they describe.
>
> Userspace is responsible for passing in resources for samples to be
> exposed, including the event file descriptor for notification of new
> sample availability, the ringbuffer BO to store samples, and the
> control BO along with the offset for mapping the insert and extract
> indices. Though these indices are only a total of 8 bytes, userspace
> can then reuse the same physical page for tracking the state of
> multiple buffers by giving different offsets from the BO start to
> map them.
>
> [0]: https://docs.kernel.org/userspace-api/perf_ring_buffer.html
>
> Co-developed-by: Mihail Atanassov <mihail.atanassov@arm.com>
> Signed-off-by: Mihail Atanassov <mihail.atanassov@arm.com>
> Signed-off-by: Lukas Zapolskas <lukas.zapolskas@arm.com>
> Reviewed-by: Adrián Larumbe <adrian.larumbe@collabora.com>
> ---
> include/uapi/drm/panthor_drm.h | 565 +++++++++++++++++++++++++++++++++
> 1 file changed, 565 insertions(+)
>
> diff --git a/include/uapi/drm/panthor_drm.h b/include/uapi/drm/panthor_drm.h
> index e238c6264fa1..d1a92172e878 100644
> --- a/include/uapi/drm/panthor_drm.h
> +++ b/include/uapi/drm/panthor_drm.h
> @@ -154,6 +154,9 @@ enum drm_panthor_ioctl_id {
> * This is useful for imported BOs.
> */
> DRM_PANTHOR_BO_QUERY_INFO,
> +
> + /** @DRM_PANTHOR_PERF_CONTROL: Control a performance counter session. */
> + DRM_PANTHOR_PERF_CONTROL,
> };
>
> /**
> @@ -253,6 +256,9 @@ enum drm_panthor_dev_query_type {
> * @DRM_PANTHOR_DEV_QUERY_GROUP_PRIORITIES_INFO: Query allowed group priorities information.
> */
> DRM_PANTHOR_DEV_QUERY_GROUP_PRIORITIES_INFO,
> +
> + /** @DRM_PANTHOR_DEV_QUERY_PERF_INFO: Query performance counter interface information. */
> + DRM_PANTHOR_DEV_QUERY_PERF_INFO,
> };
>
> /**
> @@ -445,6 +451,135 @@ struct drm_panthor_group_priorities_info {
> __u8 pad[3];
> };
>
> +/**
> + * enum drm_panthor_perf_feat_flags - Performance counter configuration feature flags.
> + */
> +enum drm_panthor_perf_feat_flags {
> + /** @DRM_PANTHOR_PERF_BLOCK_STATES_SUPPORT: Coarse-grained block states are supported. */
> + DRM_PANTHOR_PERF_BLOCK_STATES_SUPPORT = 1 << 0,
> +};
> +
> +/**
> + * enum drm_panthor_perf_block_type - Performance counter supported block types.
> + */
> +enum drm_panthor_perf_block_type {
> + /** @DRM_PANTHOR_PERF_BLOCK_METADATA: Internal use only. */
> + DRM_PANTHOR_PERF_BLOCK_METADATA = 0,
> +
> + /** @DRM_PANTHOR_PERF_BLOCK_FW: The FW counter block. */
> + DRM_PANTHOR_PERF_BLOCK_FW,
> +
> + /** @DRM_PANTHOR_PERF_BLOCK_CSHW: The CSHW counter block. */
> + DRM_PANTHOR_PERF_BLOCK_CSHW,
> +
> + /** @DRM_PANTHOR_PERF_BLOCK_TILER: The tiler counter block. */
> + DRM_PANTHOR_PERF_BLOCK_TILER,
> +
> + /** @DRM_PANTHOR_PERF_BLOCK_MEMSYS: A memsys counter block. */
> + DRM_PANTHOR_PERF_BLOCK_MEMSYS,
> +
> + /** @DRM_PANTHOR_PERF_BLOCK_SHADER: A shader core counter block. */
> + DRM_PANTHOR_PERF_BLOCK_SHADER,
> +
> + /** @DRM_PANTHOR_PERF_BLOCK_FIRST: Internal use only. */
> + DRM_PANTHOR_PERF_BLOCK_FIRST = DRM_PANTHOR_PERF_BLOCK_FW,
> +
> + /** @DRM_PANTHOR_PERF_BLOCK_LAST: Internal use only. */
> + DRM_PANTHOR_PERF_BLOCK_LAST = DRM_PANTHOR_PERF_BLOCK_SHADER,
> +
> + /** @DRM_PANTHOR_PERF_BLOCK_MAX: Internal use only. */
> + DRM_PANTHOR_PERF_BLOCK_MAX = DRM_PANTHOR_PERF_BLOCK_LAST + 1,
> +};
I'd really prefer if we were not exposing block types as uAPI if those
are not truly needed for the UMD/KMD to agree on things. The counter
block knowledge exists in userspace (because it has to if we want to
attach meaning to counters), and I don't really see the need to
standardize it here. In my experience, any definition that's not
absolutely required might become a liability at some point. In that
case, I can already imagine new GPUs shuffling the block IDs, getting
rid of some, adding new ones, ... If we have to accommodate the enum
for those changes it will become a mess. On the other hand, if we make
the block ID an opaque u8, it just becomes HW knowledge that the
UMD/perfcnt lib has already (GPU_ID, plus other PERFCNT specific dev
queries if some things are implementation-defined).
> +
> +/**
> + * enum drm_panthor_perf_clock - Identifier of the clock used to produce the cycle count values
> + * in a given block.
> + *
> + * Since the integrator has the choice of using one or more clocks, there may be some confusion
> + * as to which blocks are counted by which clock values unless this information is explicitly
> + * provided as part of every block sample. Not every single clock here can be used: in the simplest
> + * case, all cycle counts will be associated with the top-level clock.
> + */
> +enum drm_panthor_perf_clock {
> + /** @DRM_PANTHOR_PERF_CLOCK_TOPLEVEL: Top-level CSF clock. */
> + DRM_PANTHOR_PERF_CLOCK_TOPLEVEL,
> +
> + /**
> + * @DRM_PANTHOR_PERF_CLOCK_COREGROUP: Core group clock, responsible for the MMU, L2
> + * caches and the tiler.
> + */
> + DRM_PANTHOR_PERF_CLOCK_COREGROUP,
> +
> + /** @DRM_PANTHOR_PERF_CLOCK_SHADER: Clock for the shader cores. */
> + DRM_PANTHOR_PERF_CLOCK_SHADER,
> +};
> +
> +/**
> + * struct drm_panthor_perf_info - Performance counter interface information
> + *
> + * Structure grouping all queryable information relating to the performance counter
> + * interfaces.
> + */
> +struct drm_panthor_perf_info {
> + /**
> + * @counters_per_block: The number of 8-byte counters available in a block.
> + */
> + __u32 counters_per_block;
> +
> + /**
> + * @sample_header_size: The size of the header struct available at the beginning
> + * of every sample.
> + */
> + __u32 sample_header_size;
> +
> + /**
> + * @block_header_size: The size of the header struct inline with the counters for a
> + * single block.
> + */
> + __u32 block_header_size;
Are those things not directly deducible from the arch major/minor? If
those things are implementation-defined, I guess that's fine to expose
them, but otherwise I'd rely on the knowledge that exists in the UMD.
> +
> + /**
> + * @sample_size: The size of a fully annotated sample, starting with a sample header
> + * of size @sample_header_size bytes, and all available blocks for the current
> + * configuration, each comprised of @counters_per_block 64-bit counters and
> + * a block header of @block_header_size bytes.
Let's keep the kernel doc formatting consistent and drop the alignment
on the field name (IIRC, keeping it also generates weird indentation in
the final htmldoc).
> + *
> + * The user must use this field to allocate size for the ring buffer. In
> + * the case of new blocks being added, an old userspace can always use
> + * this field and ignore any blocks it does not know about.
> + */
> + __u32 sample_size;
Same thing for the sample_size, it looks like something the UMD should
know already, given a specific config.
> +
> + /** @flags: Combination of drm_panthor_perf_feat_flags flags. */
> + __u32 flags;
> +
> + /**
> + * @supported_clocks: Bitmask of the clocks supported by the GPU.
> + *
> + * Each bit represents a variant of the enum drm_panthor_perf_clock.
> + *
> + * For the same GPU, different implementers may have different clocks for the same hardware
> + * block. At the moment, up to three clocks are supported, and any clocks that are present
> + * will be reported here.
> + */
> + __u32 supported_clocks;
> +
> + /** @fw_blocks: Number of FW blocks available. */
> + __u32 fw_blocks;
> +
> + /** @cshw_blocks: Number of CSHW blocks available. */
> + __u32 cshw_blocks;
> +
> + /** @tiler_blocks: Number of tiler blocks available. */
> + __u32 tiler_blocks;
> +
> + /** @memsys_blocks: Number of memsys blocks available. */
> + __u32 memsys_blocks;
> +
> + /** @shader_blocks: Number of shader core blocks available. */
> + __u32 shader_blocks;
Again, if it's fixed per arch <major,minor>, I'd refrain from exposing
that. And if the variants are truly implementation-defined, and we go
through opaque block IDs, I guess we'd need another DEV_QUERY, to get
the number of variants supported by a specific HW (might even have to
be a bitmask if there can be holes).
> +};
> +
> /**
> * struct drm_panthor_dev_query - Arguments passed to DRM_PANTHOR_IOCTL_DEV_QUERY
> */
> @@ -1187,6 +1322,434 @@ struct drm_panthor_bo_query_info {
> __u32 pad;
> };
>
> +/**
> + * DOC: Performance counter decoding in userspace.
> + *
> + * Each sample will be exposed to userspace in the following manner:
> + *
> + * +--------+--------+------------------------+--------+-------------------------+-----+
> + * | Sample | Block | Block | Block | Block | ... |
> + * | header | header | counters | header | counters | |
> + * +--------+--------+------------------------+--------+-------------------------+-----+
> + *
> + * Each sample will start with a sample header of type @struct drm_panthor_perf_sample header,
> + * providing sample-wide information like the start and end timestamps, the counter set currently
> + * configured, and any errors that may have occurred during sampling.
Okay, that part has to be uAPI, because it's purely SW-defined IIUC.
> + *
> + * After the fixed size header, the sample will consist of blocks of
> + * 64-bit @drm_panthor_dev_query_perf_info::counters_per_block counters, each prefaced with a
> + * header of its own, indicating source block type, as well as the cycle count needed to normalize
> + * cycle values within that block, and a clock source identifier.
The rest is HW-defined (or FW-defined), and can be deduced from arch
<major,minor>+queries for impl-defined stuff, so I would just treat
that as opaque data in the uAPI.
> + */
> +
> +/**
> + * enum drm_panthor_perf_block_state - Bitmask of the power and execution states that an individual
> + * hardware block went through in a sampling period.
> + *
> + * Because the sampling period is controlled from userspace, the block may undergo multiple
> + * state transitions, so this must be interpreted as one or more such transitions occurring.
> + */
> +enum drm_panthor_perf_block_state {
> + /**
> + * @DRM_PANTHOR_PERF_BLOCK_STATE_UNKNOWN: The state of this block was unknown during
> + * the sampling period.
> + */
> + DRM_PANTHOR_PERF_BLOCK_STATE_UNKNOWN = 0,
> +
> + /**
> + * @DRM_PANTHOR_PERF_BLOCK_STATE_ON: This block was powered on for some or all of
> + * the sampling period.
> + */
> + DRM_PANTHOR_PERF_BLOCK_STATE_ON = 1 << 0,
> +
> + /**
> + * @DRM_PANTHOR_PERF_BLOCK_STATE_OFF: This block was powered off for some or all of the
> + * sampling period.
> + */
> + DRM_PANTHOR_PERF_BLOCK_STATE_OFF = 1 << 1,
> +
> + /**
> + * @DRM_PANTHOR_PERF_BLOCK_STATE_AVAILABLE: This block was available for execution for
> + * some or all of the sampling period.
> + */
> + DRM_PANTHOR_PERF_BLOCK_STATE_AVAILABLE = 1 << 2,
> + /**
> + * @DRM_PANTHOR_PERF_BLOCK_STATE_UNAVAILABLE: This block was unavailable for execution for
> + * some or all of the sampling period.
> + */
> + DRM_PANTHOR_PERF_BLOCK_STATE_UNAVAILABLE = 1 << 3,
> +
> + /**
> + * @DRM_PANTHOR_PERF_BLOCK_STATE_NORMAL: This block was executing in normal mode
> + * for some or all of the sampling period.
> + */
> + DRM_PANTHOR_PERF_BLOCK_STATE_NORMAL = 1 << 4,
> +
> + /**
> + * @DRM_PANTHOR_PERF_BLOCK_STATE_PROTECTED: This block was executing in protected mode
> + * for some or all of the sampling period.
> + */
> + DRM_PANTHOR_PERF_BLOCK_STATE_PROTECTED = 1 << 5,
> +};
> +
> +/**
> + * struct drm_panthor_perf_block_header - Header present before every block in the
> + * sample ringbuffer.
> + */
> +struct drm_panthor_perf_block_header {
> + /** @block_type: Type of the block. */
> + __u8 block_type;
> +
> + /** @block_idx: Block index. */
> + __u8 block_idx;
I first thought this was the block_set, but it looks like it's
something else. I imagine it's here to work around the 128-counters
per-block limitation that exists because of the size of the
enable_mask field. If that's the case, this should probably be
documented.
> +
> + /**
> + * @block_states: Coarse-grained block transitions, bitmask of enum
> + * drm_panthor_perf_block_states.
> + */
> + __u8 block_states;
I'd rather make that state a __u32, so we're not blocked if other
events are added.
> +
> + /**
> + * @clock: Clock used to produce the cycle count for this block, taken from
> + * enum drm_panthor_perf_clock. The cycle counts are stored in the sample header.
> + */
> + __u8 clock;
> +
> + /** @pad: MBZ. */
> + __u8 pad[4];
> +
> + /** @enable_mask: Bitmask of counters requested during the session setup. */
> + __u64 enable_mask[2];
I'm a bit worried that this breaks if new GPUs expose more than 128
counters per-block. I'm also unsure why we need it here. Isn't the
ENABLE mask also part of the per-block HW sample header? If we could
pass that to the UMD directly, we wouldn't have to account for such
changes, because the per-HW layout knowledge exists on the libperf side
too.
> +};
> +
> +/**
> + * enum drm_panthor_perf_sample_flags - Sample-wide events that occurred over the sampling
> + * period.
> + */
> +enum drm_panthor_perf_sample_flags {
> + /**
> + * @DRM_PANTHOR_PERF_SAMPLE_OVERFLOW: This sample contains overflows due to the duration
> + * of the sampling period.
> + */
> + DRM_PANTHOR_PERF_SAMPLE_OVERFLOW = 1 << 0,
> +
> + /**
> + * @DRM_PANTHOR_PERF_SAMPLE_ERROR: This sample encountered an error condition during
> + * the sample duration.
> + */
> + DRM_PANTHOR_PERF_SAMPLE_ERROR = 1 << 1,
> +};
> +
> +/**
> + * struct drm_panthor_perf_sample_header - Header present before every sample.
> + */
> +struct drm_panthor_perf_sample_header {
> + /**
> + * @timestamp_start_ns: Earliest timestamp that values in this sample represent, in
> + * nanoseconds. Derived from CLOCK_MONOTONIC_RAW.
> + */
> + __u64 timestamp_start_ns;
> +
> + /**
> + * @timestamp_end_ns: Latest timestamp that values in this sample represent, in
> + * nanoseconds. Derived from CLOCK_MONOTONIC_RAW.
> + */
> + __u64 timestamp_end_ns;
> +
> + /** @block_set: Set of performance counter blocks. */
> + __u8 block_set;
Is this the same as drm_panthor_perf_block_header::block_idx? If it is,
we need to choose a name and stick to it, otherwise it's confusing.
> +
> + /** @pad: MBZ. */
> + __u8 pad[3];
> +
> + /** @flags: Current sample flags, combination of drm_panthor_perf_sample_flags. */
> + __u32 flags;
> +
> + /**
> + * @user_data: User data provided as part of the command that triggered this sample.
> + *
> + * - Automatic samples (periodic ones or those around non-counting periods or power state
> + * transitions) will be tagged with the user_data provided as part of the
> + * DRM_PANTHOR_PERF_COMMAND_START call.
> + * - Manual samples will be tagged with the user_data provided with the
> + * DRM_PANTHOR_PERF_COMMAND_SAMPLE call.
> + * - A session's final automatic sample will be tagged with the user_data provided with the
> + * DRM_PANTHOR_PERF_COMMAND_STOP call.
> + */
> + __u64 user_data;
> +
> + /**
> + * @toplevel_clock_cycles: The number of cycles elapsed between
> + * drm_panthor_perf_sample_header::timestamp_start_ns and
> + * drm_panthor_perf_sample_header::timestamp_end_ns on the top-level clock if the
> + * corresponding bit is set in drm_panthor_perf_info::supported_clocks.
> + */
> + __u64 toplevel_clock_cycles;
> +
> + /**
> + * @coregroup_clock_cycles: The number of cycles elapsed between
> + * drm_panthor_perf_sample_header::timestamp_start_ns and
> + * drm_panthor_perf_sample_header::timestamp_end_ns on the coregroup clock if the
> + * corresponding bit is set in drm_panthor_perf_info::supported_clocks.
> + */
> + __u64 coregroup_clock_cycles;
> +
> + /**
> + * @shader_clock_cycles: The number of cycles elapsed between
> + * drm_panthor_perf_sample_header::timestamp_start_ns and
> + * drm_panthor_perf_sample_header::timestamp_end_ns on the shader core clock if the
> + * corresponding bit is set in drm_panthor_perf_info::supported_clocks.
> + */
> + __u64 shader_clock_cycles;
> +};
> +
> +/**
> + * enum drm_panthor_perf_command - Command type passed to the DRM_PANTHOR_PERF_CONTROL
> + * IOCTL.
> + */
> +enum drm_panthor_perf_command {
> + /** @DRM_PANTHOR_PERF_COMMAND_SETUP: Create a new performance counter sampling context. */
> + DRM_PANTHOR_PERF_COMMAND_SETUP,
> +
> + /** @DRM_PANTHOR_PERF_COMMAND_TEARDOWN: Teardown a performance counter sampling context. */
> + DRM_PANTHOR_PERF_COMMAND_TEARDOWN,
> +
> + /** @DRM_PANTHOR_PERF_COMMAND_START: Start a sampling session on the indicated context. */
> + DRM_PANTHOR_PERF_COMMAND_START,
> +
> + /** @DRM_PANTHOR_PERF_COMMAND_STOP: Stop the sampling session on the indicated context. */
> + DRM_PANTHOR_PERF_COMMAND_STOP,
> +
> + /**
> + * @DRM_PANTHOR_PERF_COMMAND_SAMPLE: Request a manual sample on the indicated context.
> + *
> + * When the sampling session is configured with a non-zero sampling frequency, any
> + * DRM_PANTHOR_PERF_CONTROL calls with this command will be ignored and return an
> + * -EINVAL.
> + */
> + DRM_PANTHOR_PERF_COMMAND_SAMPLE,
> +};
> +
> +/**
> + * struct drm_panthor_perf_control - Arguments passed to DRM_PANTHOR_IOCTL_PERF_CONTROL.
> + */
> +struct drm_panthor_perf_control {
> + /** @cmd: Command from enum drm_panthor_perf_command. */
> + __u32 cmd;
> +
> + /**
> + * @handle: session handle.
> + *
> + * Returned by the DRM_PANTHOR_PERF_COMMAND_SETUP call.
> + * It must be used in subsequent commands for the same context.
> + */
> + __u32 handle;
I'll comment on the patch adding an implementation for that, but I'd
like to understand if there's a need for having more than one perf
session per FD. If not, we can probably drop this handle, attach the
perf-session directly to panthor_file, and have all commands target the
only perf session that exists on this FD-context.
Even if there's a need for multiple perf sessions per FD, I think we
should make this handle/ID per FD to simplify things.
> +
> + /**
> + * @size: size of the command structure.
> + *
> + * If the pointer is NULL, the size is updated by the driver to provide the size of the
> + * output structure. If the pointer is not NULL, the driver will only copy min(size,
> + * struct_size) to the pointer and update the size accordingly.
> + */
> + __u64 size;
I'm wondering if we wouldn't be better off adding multiple ioctl()
instead of doing the demux here. That's basically what we do for other
blocks (see VM_{CREATE,DESTROY,BIND}, GROUP_{CREATE,DESTROY,SUBMIT},
...). This would save an extra deref, and some additional complexity.
I think we have enough driver-specific ioctl()s to add five more
for the perfcnt stuff.
> +
> + /**
> + * @pointer: user pointer to a command type struct, such as
> + * @struct drm_panthor_perf_cmd_start.
Same alignment issue I mentioned above.
> + */
> + __u64 pointer;
> +};
> +
> +/**
> + * enum drm_panthor_perf_counter_set - The counter set to be requested from the hardware.
> + *
> + * The hardware supports a single performance counter set at a time, so requesting any set other
> + * than the primary may fail if another process is sampling at the same time.
> + *
> + * If in doubt, the primary counter set has the most commonly used counters and requires no
> + * additional permissions to open.
> + */
> +enum drm_panthor_perf_counter_set {
> + /**
> + * @DRM_PANTHOR_PERF_SET_PRIMARY: The default set configured on the hardware.
> + *
> + * This is the only set for which all counters in all blocks are defined.
> + */
> + DRM_PANTHOR_PERF_SET_PRIMARY,
> +
> + /**
> + * @DRM_PANTHOR_PERF_SET_SECONDARY: The secondary performance counter set.
> + *
> + * Some blocks may not have any defined counters for this set, and the block will
> + * have the UNAVAILABLE block state permanently set in the block header.
> + *
> + * Accessing this set requires the calling process to have the CAP_PERFMON capability.
> + */
> + DRM_PANTHOR_PERF_SET_SECONDARY,
> +
> + /**
> + * @DRM_PANTHOR_PERF_SET_TERTIARY: The tertiary performance counter set.
> + *
> + * Some blocks may not have any defined counters for this set, and the block will have
> + * the UNAVAILABLE block state permanently set in the block header. Note that the
> + * tertiary set has the fewest defined counter blocks.
> + *
> + * Accessing this set requires the calling process to have the CAP_PERFMON capability.
> + */
> + DRM_PANTHOR_PERF_SET_TERTIARY,
> +};
Same remark I made for block types, I believe this should be opaque,
since libperf knows about those sets already.
> +
> +/**
> + * struct drm_panthor_perf_ringbuf_control - Struct used to map in the ring buffer control indices
> + * into memory shared between user and kernel.
> + *
> + */
> +struct drm_panthor_perf_ringbuf_control {
> + /**
> + * @extract_idx: The index of the latest sample that was processed by userspace. Only
> + * modifiable by userspace.
Same formatting inconsistency.
> + */
> + __u64 extract_idx;
> +
> + /**
> + * @insert_idx: The index of the latest sample emitted by the kernel. Only modifiable by
> + * the kernel.
> + */
> + __u64 insert_idx;
> +};
I'll stop here for today, but I'll try to finish reviewing this patch
and patch 3 before the end of the week.
Regards,
Boris
Hello Boris,
On 17/12/2025 14:37, Boris Brezillon wrote:
> Hi Lukas,
>
> On Mon, 15 Dec 2025 17:14:47 +0000
> Lukas Zapolskas <lukas.zapolskas@arm.com> wrote:
>
>> This patch extends the DEV_QUERY ioctl to return information about the
>> performance counter setup for userspace, and introduces the new
>> ioctl DRM_PANTHOR_PERF_CONTROL in order to allow for the sampling of
>> performance counters.
>>
>> The new design is inspired by the perf aux ringbuffer [0], with the
>> insert and extract indices being mapped to userspace, allowing
>> multiple samples to be exposed at any given time. To avoid pointer
>> chasing, the sample metadata and block metadata are inline with
>> the elements they describe.
>>
>> Userspace is responsible for passing in resources for samples to be
>> exposed, including the event file descriptor for notification of new
>> sample availability, the ringbuffer BO to store samples, and the
>> control BO along with the offset for mapping the insert and extract
>> indices. Though these indices are only a total of 8 bytes, userspace
>> can then reuse the same physical page for tracking the state of
>> multiple buffers by giving different offsets from the BO start to
>> map them.
>>
>> [0]: https://docs.kernel.org/userspace-api/perf_ring_buffer.html
>>
>> Co-developed-by: Mihail Atanassov <mihail.atanassov@arm.com>
>> Signed-off-by: Mihail Atanassov <mihail.atanassov@arm.com>
>> Signed-off-by: Lukas Zapolskas <lukas.zapolskas@arm.com>
>> Reviewed-by: Adrián Larumbe <adrian.larumbe@collabora.com>
>> ---
>> include/uapi/drm/panthor_drm.h | 565 +++++++++++++++++++++++++++++++++
>> 1 file changed, 565 insertions(+)
>>
>> diff --git a/include/uapi/drm/panthor_drm.h b/include/uapi/drm/panthor_drm.h
>> index e238c6264fa1..d1a92172e878 100644
>> --- a/include/uapi/drm/panthor_drm.h
>> +++ b/include/uapi/drm/panthor_drm.h
>> @@ -154,6 +154,9 @@ enum drm_panthor_ioctl_id {
>> * This is useful for imported BOs.
>> */
>> DRM_PANTHOR_BO_QUERY_INFO,
>> +
>> + /** @DRM_PANTHOR_PERF_CONTROL: Control a performance counter session. */
>> + DRM_PANTHOR_PERF_CONTROL,
>> };
>>
>> /**
>> @@ -253,6 +256,9 @@ enum drm_panthor_dev_query_type {
>> * @DRM_PANTHOR_DEV_QUERY_GROUP_PRIORITIES_INFO: Query allowed group priorities information.
>> */
>> DRM_PANTHOR_DEV_QUERY_GROUP_PRIORITIES_INFO,
>> +
>> + /** @DRM_PANTHOR_DEV_QUERY_PERF_INFO: Query performance counter interface information. */
>> + DRM_PANTHOR_DEV_QUERY_PERF_INFO,
>> };
>>
>> /**
>> @@ -445,6 +451,135 @@ struct drm_panthor_group_priorities_info {
>> __u8 pad[3];
>> };
>>
>> +/**
>> + * enum drm_panthor_perf_feat_flags - Performance counter configuration feature flags.
>> + */
>> +enum drm_panthor_perf_feat_flags {
>> + /** @DRM_PANTHOR_PERF_BLOCK_STATES_SUPPORT: Coarse-grained block states are supported. */
>> + DRM_PANTHOR_PERF_BLOCK_STATES_SUPPORT = 1 << 0,
>> +};
>> +
>> +/**
>> + * enum drm_panthor_perf_block_type - Performance counter supported block types.
>> + */
>> +enum drm_panthor_perf_block_type {
>> + /** @DRM_PANTHOR_PERF_BLOCK_METADATA: Internal use only. */
>> + DRM_PANTHOR_PERF_BLOCK_METADATA = 0,
>> +
>> + /** @DRM_PANTHOR_PERF_BLOCK_FW: The FW counter block. */
>> + DRM_PANTHOR_PERF_BLOCK_FW,
>> +
>> + /** @DRM_PANTHOR_PERF_BLOCK_CSHW: The CSHW counter block. */
>> + DRM_PANTHOR_PERF_BLOCK_CSHW,
>> +
>> + /** @DRM_PANTHOR_PERF_BLOCK_TILER: The tiler counter block. */
>> + DRM_PANTHOR_PERF_BLOCK_TILER,
>> +
>> + /** @DRM_PANTHOR_PERF_BLOCK_MEMSYS: A memsys counter block. */
>> + DRM_PANTHOR_PERF_BLOCK_MEMSYS,
>> +
>> + /** @DRM_PANTHOR_PERF_BLOCK_SHADER: A shader core counter block. */
>> + DRM_PANTHOR_PERF_BLOCK_SHADER,
>> +
>> + /** @DRM_PANTHOR_PERF_BLOCK_FIRST: Internal use only. */
>> + DRM_PANTHOR_PERF_BLOCK_FIRST = DRM_PANTHOR_PERF_BLOCK_FW,
>> +
>> + /** @DRM_PANTHOR_PERF_BLOCK_LAST: Internal use only. */
>> + DRM_PANTHOR_PERF_BLOCK_LAST = DRM_PANTHOR_PERF_BLOCK_SHADER,
>> +
>> + /** @DRM_PANTHOR_PERF_BLOCK_MAX: Internal use only. */
>> + DRM_PANTHOR_PERF_BLOCK_MAX = DRM_PANTHOR_PERF_BLOCK_LAST + 1,
>> +};
>
> I'd really prefer if we were not exposing block types as uAPI if those
> are not truly needed for the UMD/KMD to agree on things. The counter
> block knowledge exists in userspace (because it has to if we want to
> attach meaning to counters), and I don't really see the need to
> standardize it here. In my experience, any definition that's not
> absolutely required might become a liability at some point. In that
> case, I can already imagine new GPUs shuffling the block IDs, getting
> rid of some, adding new ones, ... If we have to accommodate the enum
> for those changes it will become a mess. On the other hand, if we make
> the block ID an opaque u8, it just becomes HW knowledge that the
> UMD/perfcnt lib has already (GPU_ID, plus other PERFCNT specific dev
> queries if some stuff are implementation-defined).
>
These IDs are not being provided from the HW, but rather attached to the segment
in the kernel. Identifying the blocks in userspace was much easier in JM, since
they were more or less fixed in the layout. In CSF, on the other hand, the layout
can be a lot more dynamic, and it's not always obvious from the buffer alone
whether a particular block type is available or not. It would require exposing
more of the FW values directly to the user.
>> +
>> +/**
>> + * enum drm_panthor_perf_clock - Identifier of the clock used to produce the cycle count values
>> + * in a given block.
>> + *
>> + * Since the integrator has the choice of using one or more clocks, there may be some confusion
>> + * as to which blocks are counted by which clock values unless this information is explicitly
>> + * provided as part of every block sample. Not every single clock here can be used: in the simplest
>> + * case, all cycle counts will be associated with the top-level clock.
>> + */
>> +enum drm_panthor_perf_clock {
>> + /** @DRM_PANTHOR_PERF_CLOCK_TOPLEVEL: Top-level CSF clock. */
>> + DRM_PANTHOR_PERF_CLOCK_TOPLEVEL,
>> +
>> + /**
>> + * @DRM_PANTHOR_PERF_CLOCK_COREGROUP: Core group clock, responsible for the MMU, L2
>> + * caches and the tiler.
>> + */
>> + DRM_PANTHOR_PERF_CLOCK_COREGROUP,
>> +
>> + /** @DRM_PANTHOR_PERF_CLOCK_SHADER: Clock for the shader cores. */
>> + DRM_PANTHOR_PERF_CLOCK_SHADER,
>> +};
>> +
>> +/**
>> + * struct drm_panthor_perf_info - Performance counter interface information
>> + *
>> + * Structure grouping all queryable information relating to the performance counter
>> + * interfaces.
>> + */
>> +struct drm_panthor_perf_info {
>> + /**
>> + * @counters_per_block: The number of 8-byte counters available in a block.
>> + */
>> + __u32 counters_per_block;
>> +
>> + /**
>> + * @sample_header_size: The size of the header struct available at the beginning
>> + * of every sample.
>> + */
>> + __u32 sample_header_size;
>> +
>> + /**
>> + * @block_header_size: The size of the header struct inline with the counters for a
>> + * single block.
>> + */
>> + __u32 block_header_size;
>
> Are those things not directly deducible from the arch major/minor? If
> those things are implementation-defined, I guess that's fine to expose
> them, but otherwise I'd rely on the knowledge that exists in the UMD.
>
They are implementation-defined, so the sizes may be the same for several different arch major/minors
and then change for all of them.
>> +
>> + /**
>> + * @sample_size: The size of a fully annotated sample, starting with a sample header
>> + * of size @sample_header_size bytes, and all available blocks for the current
>> + * configuration, each comprised of @counters_per_block 64-bit counters and
>> + * a block header of @block_header_size bytes.
>
> Let's keep the kernel doc formatting consistent and drop the alignment
> on the field name (IIRC, it also generate weird indentation in the
> final htmldoc if we do that.
>
Will do!
>> + *
>> + * The user must use this field to allocate size for the ring buffer. In
>> + * the case of new blocks being added, an old userspace can always use
>> + * this field and ignore any blocks it does not know about.
>> + */
>> + __u32 sample_size;
>
> Same thing for the sample_size, it looks like something the UMD should
> know already, given a specific config.
>
Not necessarily. One of the use-cases we have is libGPUCounters[0], which gets embedded in applications
at a particular version with the expectation of it functioning in a forwards compatible fashion, i.e.,
running the application with an old version of the library against a new KMD. In that case, the UMD
cannot infer the size of the sample purely from the fields that were previously exposed to the UMD.
>> +
>> + /** @flags: Combination of drm_panthor_perf_feat_flags flags. */
>> + __u32 flags;
>> +
>> + /**
>> + * @supported_clocks: Bitmask of the clocks supported by the GPU.
>> + *
>> + * Each bit represents a variant of the enum drm_panthor_perf_clock.
>> + *
>> + * For the same GPU, different implementers may have different clocks for the same hardware
>> + * block. At the moment, up to three clocks are supported, and any clocks that are present
>> + * will be reported here.
>> + */
>> + __u32 supported_clocks;
>> +
>> + /** @fw_blocks: Number of FW blocks available. */
>> + __u32 fw_blocks;
>> +
>> + /** @cshw_blocks: Number of CSHW blocks available. */
>> + __u32 cshw_blocks;
>> +
>> + /** @tiler_blocks: Number of tiler blocks available. */
>> + __u32 tiler_blocks;
>> +
>> + /** @memsys_blocks: Number of memsys blocks available. */
>> + __u32 memsys_blocks;
>> +
>> + /** @shader_blocks: Number of shader core blocks available. */
>> + __u32 shader_blocks;
>
> Again, if it's fixed per arch <major,minor>, I'd refrain from exposing
> that. And if the variants are truly implementation-defined, and we go
> through opaque block IDs, I guess we'd need another DEV_QUERY, to get
> the number of variants supported by a specific HW (might even have to
> be a bitmask if there can be holes).
The reasoning behind exposing these here is the same as it was for the sample size:
an old UMD may not be able to interpret new fields from the other DEV_QUERIES to get this
information.
>> +};
>> +
>> /**
>> * struct drm_panthor_dev_query - Arguments passed to DRM_PANTHOR_IOCTL_DEV_QUERY
>> */
>> @@ -1187,6 +1322,434 @@ struct drm_panthor_bo_query_info {
>> __u32 pad;
>> };
>>
>> +/**
>> + * DOC: Performance counter decoding in userspace.
>> + *
>> + * Each sample will be exposed to userspace in the following manner:
>> + *
>> + * +--------+--------+------------------------+--------+-------------------------+-----+
>> + * | Sample | Block | Block | Block | Block | ... |
>> + * | header | header | counters | header | counters | |
>> + * +--------+--------+------------------------+--------+-------------------------+-----+
>> + *
>> + * Each sample will start with a sample header of type @struct drm_panthor_perf_sample header,
>> + * providing sample-wide information like the start and end timestamps, the counter set currently
>> + * configured, and any errors that may have occurred during sampling.
>
> Okay, that part has to be uAPI, because it's purely SW-defined IIUC.
>
That's right.
>> + *
>> + * After the fixed size header, the sample will consist of blocks of
>> + * 64-bit @drm_panthor_dev_query_perf_info::counters_per_block counters, each prefaced with a
>> + * header of its own, indicating source block type, as well as the cycle count needed to normalize
>> + * cycle values within that block, and a clock source identifier.
>
> The rest is HW-defined (or FW-defined), and can be deduced from arch
> <major,minor>+queries for impl-defined stuff, so I would just treat
> that at opaque data in the uAPI.
>
>> + */
>> +
>> +/**
>> + * enum drm_panthor_perf_block_state - Bitmask of the power and execution states that an individual
>> + * hardware block went through in a sampling period.
>> + *
>> + * Because the sampling period is controlled from userspace, the block may undergo multiple
>> + * state transitions, so this must be interpreted as one or more such transitions occurring.
>> + */
>> +enum drm_panthor_perf_block_state {
>> + /**
>> + * @DRM_PANTHOR_PERF_BLOCK_STATE_UNKNOWN: The state of this block was unknown during
>> + * the sampling period.
>> + */
>> + DRM_PANTHOR_PERF_BLOCK_STATE_UNKNOWN = 0,
>> +
>> + /**
>> + * @DRM_PANTHOR_PERF_BLOCK_STATE_ON: This block was powered on for some or all of
>> + * the sampling period.
>> + */
>> + DRM_PANTHOR_PERF_BLOCK_STATE_ON = 1 << 0,
>> +
>> + /**
>> + * @DRM_PANTHOR_PERF_BLOCK_STATE_OFF: This block was powered off for some or all of the
>> + * sampling period.
>> + */
>> + DRM_PANTHOR_PERF_BLOCK_STATE_OFF = 1 << 1,
>> +
>> + /**
>> + * @DRM_PANTHOR_PERF_BLOCK_STATE_AVAILABLE: This block was available for execution for
>> + * some or all of the sampling period.
>> + */
>> + DRM_PANTHOR_PERF_BLOCK_STATE_AVAILABLE = 1 << 2,
>> + /**
>> + * @DRM_PANTHOR_PERF_BLOCK_STATE_UNAVAILABLE: This block was unavailable for execution for
>> + * some or all of the sampling period.
>> + */
>> + DRM_PANTHOR_PERF_BLOCK_STATE_UNAVAILABLE = 1 << 3,
>> +
>> + /**
>> + * @DRM_PANTHOR_PERF_BLOCK_STATE_NORMAL: This block was executing in normal mode
>> + * for some or all of the sampling period.
>> + */
>> + DRM_PANTHOR_PERF_BLOCK_STATE_NORMAL = 1 << 4,
>> +
>> + /**
>> + * @DRM_PANTHOR_PERF_BLOCK_STATE_PROTECTED: This block was executing in protected mode
>> + * for some or all of the sampling period.
>> + */
>> + DRM_PANTHOR_PERF_BLOCK_STATE_PROTECTED = 1 << 5,
>> +};
>> +
>> +/**
>> + * struct drm_panthor_perf_block_header - Header present before every block in the
>> + * sample ringbuffer.
>> + */
>> +struct drm_panthor_perf_block_header {
>> + /** @block_type: Type of the block. */
>> + __u8 block_type;
>> +
>> + /** @block_idx: Block index. */
>> + __u8 block_idx;
>
> I first thought this was the block_set, but it looks like it's
> something else. I imagine it's here to workaround the 128-counters
> per-block limitation that exists because of the size of the
> enable_mask field. If that's the case, this should probably be
> documented.
>
The idea is to have a stable identifier for each block of the same type. For instance,
if you have a sparse mask of four shader cores, the populated counter blocks will always be
numbered sequentially 0-3 while skipping the blocks corresponding to the non-existent
shader cores.
>> +
>> + /**
>> + * @block_states: Coarse-grained block transitions, bitmask of enum
>> + * drm_panthor_perf_block_states.
>> + */
>> + __u8 block_states;
>
> I'd rather make that state a __u32, so we're not blocked if other
> events are added.
>
Sure, can expand this.
>> +
>> + /**
>> + * @clock: Clock used to produce the cycle count for this block, taken from
>> + * enum drm_panthor_perf_clock. The cycle counts are stored in the sample header.
>> + */
>> + __u8 clock;
>> +
>> + /** @pad: MBZ. */
>> + __u8 pad[4];
>> +
>> + /** @enable_mask: Bitmask of counters requested during the session setup. */
>> + __u64 enable_mask[2];
>
> I'm a worried that this breaks if new GPUs expose more than 128
> counters per-block. I'm also unsure why we need it here. Isn't the
> ENABLE mask also part of the per-block HW sample header. If we could
> pass that to the UMD directly, we wouldn't have to account for such
> changes, because the per-HW layout knowledge exists on the libperf side
> too.
Checked, and it looks like we're not actually using this anywhere, so it can be dropped.
>
>> +};
>> +
>> +/**
>> + * enum drm_panthor_perf_sample_flags - Sample-wide events that occurred over the sampling
>> + * period.
>> + */
>> +enum drm_panthor_perf_sample_flags {
>> + /**
>> + * @DRM_PANTHOR_PERF_SAMPLE_OVERFLOW: This sample contains overflows due to the duration
>> + * of the sampling period.
>> + */
>> + DRM_PANTHOR_PERF_SAMPLE_OVERFLOW = 1 << 0,
>> +
>> + /**
>> + * @DRM_PANTHOR_PERF_SAMPLE_ERROR: This sample encountered an error condition during
>> + * the sample duration.
>> + */
>> + DRM_PANTHOR_PERF_SAMPLE_ERROR = 1 << 1,
>> +};
>> +
>> +/**
>> + * struct drm_panthor_perf_sample_header - Header present before every sample.
>> + */
>> +struct drm_panthor_perf_sample_header {
>> + /**
>> + * @timestamp_start_ns: Earliest timestamp that values in this sample represent, in
>> + * nanoseconds. Derived from CLOCK_MONOTONIC_RAW.
>> + */
>> + __u64 timestamp_start_ns;
>> +
>> + /**
>> + * @timestamp_end_ns: Latest timestamp that values in this sample represent, in
>> + * nanoseconds. Derived from CLOCK_MONOTONIC_RAW.
>> + */
>> + __u64 timestamp_end_ns;
>> +
>> + /** @block_set: Set of performance counter blocks. */
>> + __u8 block_set;
>
> Is this the same as drm_panthor_perf_block_header::block_idx? If it is,
> we need to choose a name and stick to it, otherwise it's confusing.
>
This is different. The block set corresponds to the PRFCNT_SET that's globally configured.
>> +
>> + /** @pad: MBZ. */
>> + __u8 pad[3];
>> +
>> + /** @flags: Current sample flags, combination of drm_panthor_perf_sample_flags. */
>> + __u32 flags;
>> +
>> + /**
>> + * @user_data: User data provided as part of the command that triggered this sample.
>> + *
>> + * - Automatic samples (periodic ones or those around non-counting periods or power state
>> + * transitions) will be tagged with the user_data provided as part of the
>> + * DRM_PANTHOR_PERF_COMMAND_START call.
>> + * - Manual samples will be tagged with the user_data provided with the
>> + * DRM_PANTHOR_PERF_COMMAND_SAMPLE call.
>> + * - A session's final automatic sample will be tagged with the user_data provided with the
>> + * DRM_PANTHOR_PERF_COMMAND_STOP call.
>> + */
>> + __u64 user_data;
>> +
>> + /**
>> + * @toplevel_clock_cycles: The number of cycles elapsed between
>> + * drm_panthor_perf_sample_header::timestamp_start_ns and
>> + * drm_panthor_perf_sample_header::timestamp_end_ns on the top-level clock if the
>> + * corresponding bit is set in drm_panthor_perf_info::supported_clocks.
>> + */
>> + __u64 toplevel_clock_cycles;
>> +
>> + /**
>> + * @coregroup_clock_cycles: The number of cycles elapsed between
>> + * drm_panthor_perf_sample_header::timestamp_start_ns and
>> + * drm_panthor_perf_sample_header::timestamp_end_ns on the coregroup clock if the
>> + * corresponding bit is set in drm_panthor_perf_info::supported_clocks.
>> + */
>> + __u64 coregroup_clock_cycles;
>> +
>> + /**
>> + * @shader_clock_cycles: The number of cycles elapsed between
>> + * drm_panthor_perf_sample_header::timestamp_start_ns and
>> + * drm_panthor_perf_sample_header::timestamp_end_ns on the shader core clock if the
>> + * corresponding bit is set in drm_panthor_perf_info::supported_clocks.
>> + */
>> + __u64 shader_clock_cycles;
>> +};
>> +
>> +/**
>> + * enum drm_panthor_perf_command - Command type passed to the DRM_PANTHOR_PERF_CONTROL
>> + * IOCTL.
>> + */
>> +enum drm_panthor_perf_command {
>> + /** @DRM_PANTHOR_PERF_COMMAND_SETUP: Create a new performance counter sampling context. */
>> + DRM_PANTHOR_PERF_COMMAND_SETUP,
>> +
>> + /** @DRM_PANTHOR_PERF_COMMAND_TEARDOWN: Teardown a performance counter sampling context. */
>> + DRM_PANTHOR_PERF_COMMAND_TEARDOWN,
>> +
>> + /** @DRM_PANTHOR_PERF_COMMAND_START: Start a sampling session on the indicated context. */
>> + DRM_PANTHOR_PERF_COMMAND_START,
>> +
>> + /** @DRM_PANTHOR_PERF_COMMAND_STOP: Stop the sampling session on the indicated context. */
>> + DRM_PANTHOR_PERF_COMMAND_STOP,
>> +
>> + /**
>> + * @DRM_PANTHOR_PERF_COMMAND_SAMPLE: Request a manual sample on the indicated context.
>> + *
>> + * When the sampling session is configured with a non-zero sampling frequency, any
>> + * DRM_PANTHOR_PERF_CONTROL calls with this command will be ignored and return an
>> + * -EINVAL.
>> + */
>> + DRM_PANTHOR_PERF_COMMAND_SAMPLE,
>> +};
>> +
>> +/**
>> + * struct drm_panthor_perf_control - Arguments passed to DRM_PANTHOR_IOCTL_PERF_CONTROL.
>> + */
>> +struct drm_panthor_perf_control {
>> + /** @cmd: Command from enum drm_panthor_perf_command. */
>> + __u32 cmd;
>> +
>> + /**
>> + * @handle: session handle.
>> + *
>> + * Returned by the DRM_PANTHOR_PERF_COMMAND_SETUP call.
>> + * It must be used in subsequent commands for the same context.
>> + */
>> + __u32 handle;
>
> I'll comment on the patch adding an implementation for that, but I'd
> like to understand if there's a need for having more than one perf
> session per FD. If not, we can probably drop this handle, attach the
> perf-session directly to panthor_file, and have all commands target the
> only perf session that exists on this FD-context.
>
> Even if there's a need for multiple perf sessions per FD, I think we
> should make this handle/ID per FD to simplify things.
>
The use case is to have both a periodic and a manual sampler on the same process,
looking at different events. More broadly, since the performance counters may be
sampled via an external library, this prevents multiple tools from clashing and
preventing the others from working.
We can definitely make the IDs per-FD, that would be a lot simpler.
>> +
>> + /**
>> + * @size: size of the command structure.
>> + *
>> + * If the pointer is NULL, the size is updated by the driver to provide the size of the
>> + * output structure. If the pointer is not NULL, the driver will only copy min(size,
>> + * struct_size) to the pointer and update the size accordingly.
>> + */
>> + __u64 size;
>
> I'm wondering if we wouldn't be better off adding multiple ioctl()
> instead of doing the demux here. That's basically what we do for other
> blocks (see VM_{CREATE,DESTROY,BIND}, GROUP_{CREATE,DESTROY,SUBMIT},
> ...). This would save an extra deref, and some additional complexity.
>
> I think we have enough driver-specific ioctl()s to add five more
> for the perfcnt stuff.
>
The initial worry was that it'd be too much to add five new ioctls, but if you're
fine with it, I can separate them out.
>> +
>> + /**
>> + * @pointer: user pointer to a command type struct, such as
>> + * @struct drm_panthor_perf_cmd_start.
>
> Same alignment issue I mentioned above.
>
Ack.
>> + */
>> + __u64 pointer;
>> +};
>> +
>> +/**
>> + * enum drm_panthor_perf_counter_set - The counter set to be requested from the hardware.
>> + *
>> + * The hardware supports a single performance counter set at a time, so requesting any set other
>> + * than the primary may fail if another process is sampling at the same time.
>> + *
>> + * If in doubt, the primary counter set has the most commonly used counters and requires no
>> + * additional permissions to open.
>> + */
>> +enum drm_panthor_perf_counter_set {
>> + /**
>> + * @DRM_PANTHOR_PERF_SET_PRIMARY: The default set configured on the hardware.
>> + *
>> + * This is the only set for which all counters in all blocks are defined.
>> + */
>> + DRM_PANTHOR_PERF_SET_PRIMARY,
>> +
>> + /**
>> + * @DRM_PANTHOR_PERF_SET_SECONDARY: The secondary performance counter set.
>> + *
>> + * Some blocks may not have any defined counters for this set, and the block will
>> + * have the UNAVAILABLE block state permanently set in the block header.
>> + *
>> + * Accessing this set requires the calling process to have the CAP_PERFMON capability.
>> + */
>> + DRM_PANTHOR_PERF_SET_SECONDARY,
>> +
>> + /**
>> + * @DRM_PANTHOR_PERF_SET_TERTIARY: The tertiary performance counter set.
>> + *
>> + * Some blocks may not have any defined counters for this set, and the block will have
>> + * the UNAVAILABLE block state permanently set in the block header. Note that the
>> + * tertiary set has the fewest defined counter blocks.
>> + *
>> + * Accessing this set requires the calling process to have the CAP_PERFMON capability.
>> + */
>> + DRM_PANTHOR_PERF_SET_TERTIARY,
>> +};
>
> Same remark I made for block types, I believe this should be opaque,
> since libperf knows about those sets already.
>
I've erred on the side of being explicit, specifically to avoid breaking old userspace applications running
against new KMDs. Since the HW registers are not necessarily backwards compatible, having the SW translation
ensures that this continues to work.
>> +
>> +/**
>> + * struct drm_panthor_perf_ringbuf_control - Struct used to map in the ring buffer control indices
>> + * into memory shared between user and kernel.
>> + *
>> + */
>> +struct drm_panthor_perf_ringbuf_control {
>> + /**
>> + * @extract_idx: The index of the latest sample that was processed by userspace. Only
>> + * modifiable by userspace.
>
> Same formatting inconsistency.
>
>> + */
>> + __u64 extract_idx;
>> +
>> + /**
>> + * @insert_idx: The index of the latest sample emitted by the kernel. Only
>> + * modifiable by the kernel.
>> + */
>> + __u64 insert_idx;
>> +};
>
> I'll stop here for today, but I'll try to finish reviewing this patch
> and patch 3 before the end of the week.
>
> Regards,
>
> Boris
Thanks for taking a look!
Kind regards,
Lukas
[0]: https://github.com/ARM-software/libGPUCounters
Hi Lukas,
On Wed, 7 Jan 2026 15:13:38 +0000
Lukas Zapolskas <lukas.zapolskas@arm.com> wrote:
> Hello Boris,
>
> On 17/12/2025 14:37, Boris Brezillon wrote:
> > Hi Lukas,
> >
> > On Mon, 15 Dec 2025 17:14:47 +0000
> > Lukas Zapolskas <lukas.zapolskas@arm.com> wrote:
> >
> >> This patch extends the DEV_QUERY ioctl to return information about the
> >> performance counter setup for userspace, and introduces the new
> >> ioctl DRM_PANTHOR_PERF_CONTROL in order to allow for the sampling of
> >> performance counters.
> >>
> >> The new design is inspired by the perf aux ringbuffer [0], with the
> >> insert and extract indices being mapped to userspace, allowing
> >> multiple samples to be exposed at any given time. To avoid pointer
> >> chasing, the sample metadata and block metadata are inline with
> >> the elements they describe.
> >>
> >> Userspace is responsible for passing in resources for samples to be
> >> exposed, including the event file descriptor for notification of new
> >> sample availability, the ringbuffer BO to store samples, and the
> >> control BO along with the offset for mapping the insert and extract
> >> indices. Though these indices are only a total of 8 bytes, userspace
> >> can then reuse the same physical page for tracking the state of
> >> multiple buffers by giving different offsets from the BO start to
> >> map them.
> >>
> >> [0]: https://docs.kernel.org/userspace-api/perf_ring_buffer.html
> >>
> >> Co-developed-by: Mihail Atanassov <mihail.atanassov@arm.com>
> >> Signed-off-by: Mihail Atanassov <mihail.atanassov@arm.com>
> >> Signed-off-by: Lukas Zapolskas <lukas.zapolskas@arm.com>
> >> Reviewed-by: Adrián Larumbe <adrian.larumbe@collabora.com>
> >> ---
> >> include/uapi/drm/panthor_drm.h | 565 +++++++++++++++++++++++++++++++++
> >> 1 file changed, 565 insertions(+)
> >>
> >> diff --git a/include/uapi/drm/panthor_drm.h b/include/uapi/drm/panthor_drm.h
> >> index e238c6264fa1..d1a92172e878 100644
> >> --- a/include/uapi/drm/panthor_drm.h
> >> +++ b/include/uapi/drm/panthor_drm.h
> >> @@ -154,6 +154,9 @@ enum drm_panthor_ioctl_id {
> >> * This is useful for imported BOs.
> >> */
> >> DRM_PANTHOR_BO_QUERY_INFO,
> >> +
> >> + /** @DRM_PANTHOR_PERF_CONTROL: Control a performance counter session. */
> >> + DRM_PANTHOR_PERF_CONTROL,
> >> };
> >>
> >> /**
> >> @@ -253,6 +256,9 @@ enum drm_panthor_dev_query_type {
> >> * @DRM_PANTHOR_DEV_QUERY_GROUP_PRIORITIES_INFO: Query allowed group priorities information.
> >> */
> >> DRM_PANTHOR_DEV_QUERY_GROUP_PRIORITIES_INFO,
> >> +
> >> + /** @DRM_PANTHOR_DEV_QUERY_PERF_INFO: Query performance counter interface information. */
> >> + DRM_PANTHOR_DEV_QUERY_PERF_INFO,
> >> };
> >>
> >> /**
> >> @@ -445,6 +451,135 @@ struct drm_panthor_group_priorities_info {
> >> __u8 pad[3];
> >> };
> >>
> >> +/**
> >> + * enum drm_panthor_perf_feat_flags - Performance counter configuration feature flags.
> >> + */
> >> +enum drm_panthor_perf_feat_flags {
> >> + /** @DRM_PANTHOR_PERF_BLOCK_STATES_SUPPORT: Coarse-grained block states are supported. */
> >> + DRM_PANTHOR_PERF_BLOCK_STATES_SUPPORT = 1 << 0,
> >> +};
> >> +
> >> +/**
> >> + * enum drm_panthor_perf_block_type - Performance counter supported block types.
> >> + */
> >> +enum drm_panthor_perf_block_type {
> >> + /** @DRM_PANTHOR_PERF_BLOCK_METADATA: Internal use only. */
> >> + DRM_PANTHOR_PERF_BLOCK_METADATA = 0,
> >> +
> >> + /** @DRM_PANTHOR_PERF_BLOCK_FW: The FW counter block. */
> >> + DRM_PANTHOR_PERF_BLOCK_FW,
> >> +
> >> + /** @DRM_PANTHOR_PERF_BLOCK_CSHW: The CSHW counter block. */
> >> + DRM_PANTHOR_PERF_BLOCK_CSHW,
> >> +
> >> + /** @DRM_PANTHOR_PERF_BLOCK_TILER: The tiler counter block. */
> >> + DRM_PANTHOR_PERF_BLOCK_TILER,
> >> +
> >> + /** @DRM_PANTHOR_PERF_BLOCK_MEMSYS: A memsys counter block. */
> >> + DRM_PANTHOR_PERF_BLOCK_MEMSYS,
> >> +
> >> + /** @DRM_PANTHOR_PERF_BLOCK_SHADER: A shader core counter block. */
> >> + DRM_PANTHOR_PERF_BLOCK_SHADER,
> >> +
> >> + /** @DRM_PANTHOR_PERF_BLOCK_FIRST: Internal use only. */
> >> + DRM_PANTHOR_PERF_BLOCK_FIRST = DRM_PANTHOR_PERF_BLOCK_FW,
> >> +
> >> + /** @DRM_PANTHOR_PERF_BLOCK_LAST: Internal use only. */
> >> + DRM_PANTHOR_PERF_BLOCK_LAST = DRM_PANTHOR_PERF_BLOCK_SHADER,
> >> +
> >> + /** @DRM_PANTHOR_PERF_BLOCK_MAX: Internal use only. */
> >> + DRM_PANTHOR_PERF_BLOCK_MAX = DRM_PANTHOR_PERF_BLOCK_LAST + 1,
> >> +};
> >
> > I'd really prefer if we were not exposing block types as uAPI if those
> > are not truly needed for the UMD/KMD to agree on things. The counter
> > block knowledge exists in userspace (because it has to if we want to
> > attach meaning to counters), and I don't really see the need to
> > standardize it here. In my experience, any definition that's not
> > absolutely required might become a liability at some point. In that
> > case, I can already imagine new GPUs shuffling the block IDs, getting
> > rid of some, adding new ones, ... If we have to accommodate the enum
> > for those changes it will become a mess. On the other hand, if we make
> > the block ID an opaque u8, it just becomes HW knowledge that the
> > UMD/perfcnt lib has already (GPU_ID, plus other PERFCNT specific dev
> > queries if some things are implementation-defined).
> >
>
> These IDs are not being provided from the HW, but rather attached to the segment
> in the kernel. Identifying the blocks in userspace was much easier in JM, since
> they were more or less fixed in the layout. In CSF, on the other hand, the layout
> can be a lot more dynamic, and it's not always obvious from the buffer alone
> whether a particular block type is available or not. It would require exposing
> more of the FW values directly to the user.
Can you expand a bit here? What kind of FW values? I was assuming
counters would always be HW counters and depend on the GPU arch
major/minor plus maybe some optional features. Is the FW inserting some
SW-based counters? If that's the case, I'd still prefer to expose FW
interface versions and let the userside lib figure out where each block
is.
>
> >> +
> >> +/**
> >> + * enum drm_panthor_perf_clock - Identifier of the clock used to produce the cycle count values
> >> + * in a given block.
> >> + *
> >> + * Since the integrator has the choice of using one or more clocks, there may be some confusion
> >> + * as to which blocks are counted by which clock values unless this information is explicitly
> >> + * provided as part of every block sample. Not every single clock here can be used: in the simplest
> >> + * case, all cycle counts will be associated with the top-level clock.
> >> + */
> >> +enum drm_panthor_perf_clock {
> >> + /** @DRM_PANTHOR_PERF_CLOCK_TOPLEVEL: Top-level CSF clock. */
> >> + DRM_PANTHOR_PERF_CLOCK_TOPLEVEL,
> >> +
> >> + /**
> >> + * @DRM_PANTHOR_PERF_CLOCK_COREGROUP: Core group clock, responsible for the MMU, L2
> >> + * caches and the tiler.
> >> + */
> >> + DRM_PANTHOR_PERF_CLOCK_COREGROUP,
> >> +
> >> + /** @DRM_PANTHOR_PERF_CLOCK_SHADER: Clock for the shader cores. */
> >> + DRM_PANTHOR_PERF_CLOCK_SHADER,
> >> +};
> >> +
> >> +/**
> >> + * struct drm_panthor_perf_info - Performance counter interface information
> >> + *
> >> + * Structure grouping all queryable information relating to the performance counter
> >> + * interfaces.
> >> + */
> >> +struct drm_panthor_perf_info {
> >> + /**
> >> + * @counters_per_block: The number of 8-byte counters available in a block.
> >> + */
> >> + __u32 counters_per_block;
> >> +
> >> + /**
> >> + * @sample_header_size: The size of the header struct available at the beginning
> >> + * of every sample.
> >> + */
> >> + __u32 sample_header_size;
> >> +
> >> + /**
> >> + * @block_header_size: The size of the header struct inline with the counters for a
> >> + * single block.
> >> + */
> >> + __u32 block_header_size;
> >
> > Are those things not directly deducible from the arch major/minor? If
> > those things are implementation-defined, I guess that's fine to expose
> > them, but otherwise I'd rely on the knowledge that exists in the UMD.
> >
>
> They are implementation-defined, so the sizes may be the same for several different arch major/minors
> and then change for all of them.
Can you be more specific? Which implementation are we talking about?
The FW implementation, or some fixed HW function that might or might
not be present? Actually, let's take a step back, can you explain where
those counters come from? I was assuming those were HW counters that
were simply forwarded by the FW. Are you saying the FW is more than
just a dummy proxy?
>
> >> +
> >> + /**
> >> + * @sample_size: The size of a fully annotated sample, starting with a sample header
> >> + * of size @sample_header_size bytes, and all available blocks for the current
> >> + * configuration, each comprised of @counters_per_block 64-bit counters and
> >> + * a block header of @block_header_size bytes.
> >
> > Let's keep the kernel doc formatting consistent and drop the alignment
> > on the field name (IIRC, it also generates weird indentation in the
> > final htmldoc if we do that).
> >
>
> Will do!
>
> >> + *
> >> + * The user must use this field to allocate size for the ring buffer. In
> >> + * the case of new blocks being added, an old userspace can always use
> >> + * this field and ignore any blocks it does not know about.
> >> + */
> >> + __u32 sample_size;
> >
> > Same thing for the sample_size, it looks like something the UMD should
> > know already, given a specific config.
> >
>
> Not necessarily. One of the use-cases we have is libGPUCounters[0], which gets embedded in applications
> at a particular version with the expectation of it functioning in a forwards compatible fashion, i.e.,
> running the application with an old version of the library against a new KMD. In that case, the UMD
> cannot infer the size of the sample purely from the fields that were previously exposed to the UMD.
That's where I'm lost. Why would the HW counter layout change based on
a KMD version? Feels like you're treating HW counters as a SW concept,
which is somewhat confusing to me. Maybe the answer to my previous
question will answer that.
Regards,
Boris
On Mon, 15 Dec 2025 17:14:47 +0000
Lukas Zapolskas <lukas.zapolskas@arm.com> wrote:
> +/**
> + * enum drm_panthor_perf_block_state - Bitmask of the power and execution states that an individual
> + * hardware block went through in a sampling period.
> + *
> + * Because the sampling period is controlled from userspace, the block may undergo multiple
> + * state transitions, so this must be interpreted as one or more such transitions occurring.
> + */
> +enum drm_panthor_perf_block_state {
> + /**
> + * @DRM_PANTHOR_PERF_BLOCK_STATE_UNKNOWN: The state of this block was unknown during
> + * the sampling period.
> + */
> + DRM_PANTHOR_PERF_BLOCK_STATE_UNKNOWN = 0,
> +
> + /**
> + * @DRM_PANTHOR_PERF_BLOCK_STATE_ON: This block was powered on for some or all of
> + * the sampling period.
> + */
> + DRM_PANTHOR_PERF_BLOCK_STATE_ON = 1 << 0,
> +
> + /**
> + * @DRM_PANTHOR_PERF_BLOCK_STATE_OFF: This block was powered off for some or all of the
> + * sampling period.
> + */
> + DRM_PANTHOR_PERF_BLOCK_STATE_OFF = 1 << 1,
> +
> + /**
> + * @DRM_PANTHOR_PERF_BLOCK_STATE_AVAILABLE: This block was available for execution for
> + * some or all of the sampling period.
> + */
> + DRM_PANTHOR_PERF_BLOCK_STATE_AVAILABLE = 1 << 2,
Missing blank line.
> + /**
> + * @DRM_PANTHOR_PERF_BLOCK_STATE_UNAVAILABLE: This block was unavailable for execution for
> + * some or all of the sampling period.
> + */
> + DRM_PANTHOR_PERF_BLOCK_STATE_UNAVAILABLE = 1 << 3,
> +
> + /**
> + * @DRM_PANTHOR_PERF_BLOCK_STATE_NORMAL: This block was executing in normal mode
> + * for some or all of the sampling period.
> + */
> + DRM_PANTHOR_PERF_BLOCK_STATE_NORMAL = 1 << 4,
> +
> + /**
> + * @DRM_PANTHOR_PERF_BLOCK_STATE_PROTECTED: This block was executing in protected mode
> + * for some or all of the sampling period.
> + */
> + DRM_PANTHOR_PERF_BLOCK_STATE_PROTECTED = 1 << 5,
> +};
> +
> +/**
> + * struct drm_panthor_perf_block_header - Header present before every block in the
> + * sample ringbuffer.
> + */
> +struct drm_panthor_perf_block_header {
> + /** @block_type: Type of the block. */
> + __u8 block_type;
> +
> + /** @block_idx: Block index. */
> + __u8 block_idx;
> +
> + /**
> + * @block_states: Coarse-grained block transitions, bitmask of enum
> + * drm_panthor_perf_block_state.
> + */
> + __u8 block_states;
> +
> + /**
> + * @clock: Clock used to produce the cycle count for this block, taken from
> + * enum drm_panthor_perf_clock. The cycle counts are stored in the sample header.
> + */
> + __u8 clock;
> +
> + /** @pad: MBZ. */
> + __u8 pad[4];
Why not make it a __u32?
> +
> + /** @enable_mask: Bitmask of counters requested during the session setup. */
> + __u64 enable_mask[2];
> +};
> +
> +/**
> + * enum drm_panthor_perf_sample_flags - Sample-wide events that occurred over the sampling
> + * period.
> + */
> +enum drm_panthor_perf_sample_flags {
> + /**
> + * @DRM_PANTHOR_PERF_SAMPLE_OVERFLOW: This sample contains overflows due to the duration
> + * of the sampling period.
> + */
> + DRM_PANTHOR_PERF_SAMPLE_OVERFLOW = 1 << 0,
> +
> + /**
> + * @DRM_PANTHOR_PERF_SAMPLE_ERROR: This sample encountered an error condition during
> + * the sample duration.
> + */
> + DRM_PANTHOR_PERF_SAMPLE_ERROR = 1 << 1,
> +};
> +
> +/**
> + * struct drm_panthor_perf_sample_header - Header present before every sample.
> + */
> +struct drm_panthor_perf_sample_header {
> + /**
> + * @timestamp_start_ns: Earliest timestamp that values in this sample represent, in
> + * nanoseconds. Derived from CLOCK_MONOTONIC_RAW.
> + */
> + __u64 timestamp_start_ns;
> +
> + /**
> + * @timestamp_end_ns: Latest timestamp that values in this sample represent, in
> + * nanoseconds. Derived from CLOCK_MONOTONIC_RAW.
> + */
> + __u64 timestamp_end_ns;
> +
> + /** @block_set: Set of performance counter blocks. */
> + __u8 block_set;
> +
> + /** @pad: MBZ. */
> + __u8 pad[3];
> +
> + /** @flags: Current sample flags, combination of drm_panthor_perf_sample_flags. */
> + __u32 flags;
> +
> + /**
> + * @user_data: User data provided as part of the command that triggered this sample.
> + *
> + * - Automatic samples (periodic ones or those around non-counting periods or power state
> + * transitions) will be tagged with the user_data provided as part of the
> + * DRM_PANTHOR_PERF_COMMAND_START call.
> + * - Manual samples will be tagged with the user_data provided with the
> + * DRM_PANTHOR_PERF_COMMAND_SAMPLE call.
> + * - A session's final automatic sample will be tagged with the user_data provided with the
> + * DRM_PANTHOR_PERF_COMMAND_STOP call.
> + */
> + __u64 user_data;
> +
> + /**
> + * @toplevel_clock_cycles: The number of cycles elapsed between
> + * drm_panthor_perf_sample_header::timestamp_start_ns and
> + * drm_panthor_perf_sample_header::timestamp_end_ns on the top-level clock if the
> + * corresponding bit is set in drm_panthor_perf_info::supported_clocks.
> + */
> + __u64 toplevel_clock_cycles;
> +
> + /**
> + * @coregroup_clock_cycles: The number of cycles elapsed between
> + * drm_panthor_perf_sample_header::timestamp_start_ns and
> + * drm_panthor_perf_sample_header::timestamp_end_ns on the coregroup clock if the
> + * corresponding bit is set in drm_panthor_perf_info::supported_clocks.
> + */
> + __u64 coregroup_clock_cycles;
> +
> + /**
> + * @shader_clock_cycles: The number of cycles elapsed between
> + * drm_panthor_perf_sample_header::timestamp_start_ns and
> + * drm_panthor_perf_sample_header::timestamp_end_ns on the shader core clock if the
> + * corresponding bit is set in drm_panthor_perf_info::supported_clocks.
> + */
> + __u64 shader_clock_cycles;
> +};
> +
> +/**
> + * enum drm_panthor_perf_command - Command type passed to the DRM_PANTHOR_PERF_CONTROL
> + * IOCTL.
> + */
> +enum drm_panthor_perf_command {
> + /** @DRM_PANTHOR_PERF_COMMAND_SETUP: Create a new performance counter sampling context. */
> + DRM_PANTHOR_PERF_COMMAND_SETUP,
> +
> + /** @DRM_PANTHOR_PERF_COMMAND_TEARDOWN: Teardown a performance counter sampling context. */
> + DRM_PANTHOR_PERF_COMMAND_TEARDOWN,
> +
> + /** @DRM_PANTHOR_PERF_COMMAND_START: Start a sampling session on the indicated context. */
> + DRM_PANTHOR_PERF_COMMAND_START,
> +
> + /** @DRM_PANTHOR_PERF_COMMAND_STOP: Stop the sampling session on the indicated context. */
> + DRM_PANTHOR_PERF_COMMAND_STOP,
> +
> + /**
> + * @DRM_PANTHOR_PERF_COMMAND_SAMPLE: Request a manual sample on the indicated context.
> + *
> + * When the sampling session is configured with a non-zero sampling frequency, any
> + * DRM_PANTHOR_PERF_CONTROL calls with this command will be ignored and return an
> + * -EINVAL.
> + */
> + DRM_PANTHOR_PERF_COMMAND_SAMPLE,
> +};
> +
> +/**
> + * struct drm_panthor_perf_control - Arguments passed to DRM_PANTHOR_IOCTL_PERF_CONTROL.
> + */
> +struct drm_panthor_perf_control {
> + /** @cmd: Command from enum drm_panthor_perf_command. */
> + __u32 cmd;
> +
> + /**
> + * @handle: session handle.
> + *
> + * Returned by the DRM_PANTHOR_PERF_COMMAND_SETUP call.
> + * It must be used in subsequent commands for the same context.
> + */
> + __u32 handle;
> +
> + /**
> + * @size: size of the command structure.
> + *
> + * If the pointer is NULL, the size is updated by the driver to provide the size of the
> + * output structure. If the pointer is not NULL, the driver will only copy min(size,
> + * struct_size) to the pointer and update the size accordingly.
> + */
> + __u64 size;
> +
> + /**
> + * @pointer: user pointer to a command type struct, such as
> + * @struct drm_panthor_perf_cmd_start.
> + */
> + __u64 pointer;
> +};
> +
> +/**
> + * enum drm_panthor_perf_counter_set - The counter set to be requested from the hardware.
> + *
> + * The hardware supports a single performance counter set at a time, so requesting any set other
> + * than the primary may fail if another process is sampling at the same time.
> + *
> + * If in doubt, the primary counter set has the most commonly used counters and requires no
> + * additional permissions to open.
> + */
> +enum drm_panthor_perf_counter_set {
> + /**
> + * @DRM_PANTHOR_PERF_SET_PRIMARY: The default set configured on the hardware.
> + *
> + * This is the only set for which all counters in all blocks are defined.
> + */
> + DRM_PANTHOR_PERF_SET_PRIMARY,
> +
> + /**
> + * @DRM_PANTHOR_PERF_SET_SECONDARY: The secondary performance counter set.
> + *
> + * Some blocks may not have any defined counters for this set, and the block will
> + * have the UNAVAILABLE block state permanently set in the block header.
> + *
> + * Accessing this set requires the calling process to have the CAP_PERFMON capability.
> + */
> + DRM_PANTHOR_PERF_SET_SECONDARY,
> +
> + /**
> + * @DRM_PANTHOR_PERF_SET_TERTIARY: The tertiary performance counter set.
> + *
> + * Some blocks may not have any defined counters for this set, and the block will have
> + * the UNAVAILABLE block state permanently set in the block header. Note that the
> + * tertiary set has the fewest defined counter blocks.
> + *
> + * Accessing this set requires the calling process to have the CAP_PERFMON capability.
> + */
> + DRM_PANTHOR_PERF_SET_TERTIARY,
> +};
> +
> +/**
> + * struct drm_panthor_perf_ringbuf_control - Struct used to map in the ring buffer control indices
> + * into memory shared between user and kernel.
> + *
> + */
> +struct drm_panthor_perf_ringbuf_control {
> + /**
> + * @extract_idx: The index of the latest sample that was processed by userspace. Only
> + * modifiable by userspace.
> + */
> + __u64 extract_idx;
> +
> + /**
> + * @insert_idx: The index of the latest sample emitted by the kernel. Only
> + * modifiable by the kernel.
> + */
> + __u64 insert_idx;
> +};
> +
> +/**
> + * struct drm_panthor_perf_cmd_setup - Arguments passed to DRM_PANTHOR_IOCTL_PERF_CONTROL
> + * when the DRM_PANTHOR_PERF_COMMAND_SETUP command is specified.
> + */
> +struct drm_panthor_perf_cmd_setup {
> + /**
> + * @block_set: Set of performance counter blocks, member of
> + * enum drm_panthor_perf_counter_set.
> + *
> + * This is a global configuration and only one set can be active at a time. If
> + * another client has already requested a counter set, any further requests
> + * for a different counter set will fail and return an -EBUSY.
> + *
> + * If the requested set does not exist, the request will fail and return an -EINVAL.
> + *
> + * Some sets have additional requirements to be enabled, and the setup request will
> + * fail with an -EACCES if these requirements are not satisfied.
> + */
> + __u8 block_set;
> +
> + /** @pad: MBZ. */
> + __u8 pad[7];
> +
> + /** @fd: eventfd for signalling the availability of a new sample. */
> + __u32 fd;
> +
> + /** @ringbuf_handle: Handle to the BO to write perf counter sample to. */
> + __u32 ringbuf_handle;
> +
> + /**
> + * @control_handle: Handle to the BO containing a contiguous 16 byte range, used for the
> + * insert and extract indices for the ringbuffer.
> + */
> + __u32 control_handle;
> +
> + /**
> + * @sample_slots: The number of slots available in the userspace-provided BO. Must be
> + * a power of 2.
> + *
> + * If sample_slots * sample_size does not match the BO size, the setup request will fail.
> + */
> + __u32 sample_slots;
> +
> + /**
> + * @control_offset: Offset into the control BO where the insert and extract indices are
> + * located.
> + */
> + __u64 control_offset;
> +
> + /**
> + * @sample_freq_ns: Period between automatic counter sample collection in nanoseconds. Zero
> + * disables automatic collection and all collection must be done through explicit calls
> + * to DRM_PANTHOR_PERF_CONTROL.SAMPLE. Non-zero values will disable manual counter sampling
> + * via the DRM_PANTHOR_PERF_COMMAND_SAMPLE command.
> + *
> + * This disables software-triggered periodic sampling, but hardware will still trigger
> + * automatic samples on certain events, including shader core power transitions, and
> + * entries to and exits from non-counting periods. The final stop command will also
> + * trigger a sample to ensure no data is lost.
> + */
> + __u64 sample_freq_ns;
> +
> + /**
> + * @fw_enable_mask: Bitmask of counters to request from the FW counter block. Any bits
> + * past the first drm_panthor_perf_info.counters_per_block bits will be ignored. Bit 0
> + * corresponds to counter 0.
> + */
> + __u64 fw_enable_mask[2];
> +
> + /**
> + * @cshw_enable_mask: Bitmask of counters to request from the CSHW counter block. Any bits
> + * past the first drm_panthor_perf_info.counters_per_block bits will be ignored. Bit 0
> + * corresponds to counter 0.
> + */
> + __u64 cshw_enable_mask[2];
> +
> + /**
> + * @tiler_enable_mask: Bitmask of counters to request from the tiler counter block. Any
> + * bits past the first drm_panthor_perf_info.counters_per_block bits will be ignored. Bit
> + * 0 corresponds to counter 0.
> + */
> + __u64 tiler_enable_mask[2];
> +
> + /**
> + * @memsys_enable_mask: Bitmask of counters to request from the memsys counter blocks. Any
> + * bits past the first drm_panthor_perf_info.counters_per_block bits will be ignored. Bit 0
> + * corresponds to counter 0.
> + */
> + __u64 memsys_enable_mask[2];
> +
> + /**
> + * @shader_enable_mask: Bitmask of counters to request from the shader core counter blocks.
> + * Any bits past the first drm_panthor_perf_info.counters_per_block bits will be ignored.
> + * Bit 0 corresponds to counter 0.
> + */
> + __u64 shader_enable_mask[2];
> +};
> +
> +/**
> + * struct drm_panthor_perf_cmd_start - Arguments passed to DRM_PANTHOR_IOCTL_PERF_CONTROL
> + * when the DRM_PANTHOR_PERF_COMMAND_START command is specified.
> + */
> +struct drm_panthor_perf_cmd_start {
> + /**
> + * @user_data: User provided data that will be attached to automatic samples collected
> + * until the next DRM_PANTHOR_PERF_COMMAND_STOP.
> + */
> + __u64 user_data;
> +};
> +
> +/**
> + * struct drm_panthor_perf_cmd_stop - Arguments passed to DRM_PANTHOR_IOCTL_PERF_CONTROL
> + * when the DRM_PANTHOR_PERF_COMMAND_STOP command is specified.
> + */
> +struct drm_panthor_perf_cmd_stop {
> + /**
> + * @user_data: User provided data that will be attached to the automatic sample collected
> + * at the end of this sampling session.
> + */
> + __u64 user_data;
> +};
> +
> +/**
> + * struct drm_panthor_perf_cmd_sample - Arguments passed to DRM_PANTHOR_IOCTL_PERF_CONTROL
> + * when the DRM_PANTHOR_PERF_COMMAND_SAMPLE command is specified.
> + */
> +struct drm_panthor_perf_cmd_sample {
> + /** @user_data: User provided data that will be attached to the sample.*/
> + __u64 user_data;
> +};
> +
> /**
> * DRM_IOCTL_PANTHOR() - Build a Panthor IOCTL number
> * @__access: Access type. Must be R, W or RW.
> @@ -1237,6 +1800,8 @@ enum {
> DRM_IOCTL_PANTHOR(WR, BO_SYNC, bo_sync),
> DRM_IOCTL_PANTHOR_BO_QUERY_INFO =
> DRM_IOCTL_PANTHOR(WR, BO_QUERY_INFO, bo_query_info),
> + DRM_IOCTL_PANTHOR_PERF_CONTROL =
> + DRM_IOCTL_PANTHOR(WR, PERF_CONTROL, perf_control)
> };
>
> #if defined(__cplusplus)
On 16/12/2025 17:30, Boris Brezillon wrote:
> On Mon, 15 Dec 2025 17:14:47 +0000
> Lukas Zapolskas <lukas.zapolskas@arm.com> wrote:
>
>> +/**
>> + * enum drm_panthor_perf_block_state - Bitmask of the power and execution states that an individual
>> + * hardware block went through in a sampling period.
>> + *
>> + * Because the sampling period is controlled from userspace, the block may undergo multiple
>> + * state transitions, so this must be interpreted as one or more such transitions occurring.
>> + */
>> +enum drm_panthor_perf_block_state {
>> + /**
>> + * @DRM_PANTHOR_PERF_BLOCK_STATE_UNKNOWN: The state of this block was unknown during
>> + * the sampling period.
>> + */
>> + DRM_PANTHOR_PERF_BLOCK_STATE_UNKNOWN = 0,
>> +
>> + /**
>> + * @DRM_PANTHOR_PERF_BLOCK_STATE_ON: This block was powered on for some or all of
>> + * the sampling period.
>> + */
>> + DRM_PANTHOR_PERF_BLOCK_STATE_ON = 1 << 0,
>> +
>> + /**
>> + * @DRM_PANTHOR_PERF_BLOCK_STATE_OFF: This block was powered off for some or all of the
>> + * sampling period.
>> + */
>> + DRM_PANTHOR_PERF_BLOCK_STATE_OFF = 1 << 1,
>> +
>> + /**
>> + * @DRM_PANTHOR_PERF_BLOCK_STATE_AVAILABLE: This block was available for execution for
>> + * some or all of the sampling period.
>> + */
>> + DRM_PANTHOR_PERF_BLOCK_STATE_AVAILABLE = 1 << 2,
>
> Missing blank line.
>
Ack.
>> + /**
>> + * @DRM_PANTHOR_PERF_BLOCK_STATE_UNAVAILABLE: This block was unavailable for execution for
>> + * some or all of the sampling period.
>> + */
>> + DRM_PANTHOR_PERF_BLOCK_STATE_UNAVAILABLE = 1 << 3,
>> +
>> + /**
>> + * @DRM_PANTHOR_PERF_BLOCK_STATE_NORMAL: This block was executing in normal mode
>> + * for some or all of the sampling period.
>> + */
>> + DRM_PANTHOR_PERF_BLOCK_STATE_NORMAL = 1 << 4,
>> +
>> + /**
>> + * @DRM_PANTHOR_PERF_BLOCK_STATE_PROTECTED: This block was executing in protected mode
>> + * for some or all of the sampling period.
>> + */
>> + DRM_PANTHOR_PERF_BLOCK_STATE_PROTECTED = 1 << 5,
>> +};
>> +
>> +/**
>> + * struct drm_panthor_perf_block_header - Header present before every block in the
>> + * sample ringbuffer.
>> + */
>> +struct drm_panthor_perf_block_header {
>> + /** @block_type: Type of the block. */
>> + __u8 block_type;
>> +
>> + /** @block_idx: Block index. */
>> + __u8 block_idx;
>> +
>> + /**
>> + * @block_states: Coarse-grained block transitions, bitmask of enum
>> + * drm_panthor_perf_block_state.
>> + */
>> + __u8 block_states;
>> +
>> + /**
>> + * @clock: Clock used to produce the cycle count for this block, taken from
>> + * enum drm_panthor_perf_clock. The cycle counts are stored in the sample header.
>> + */
>> + __u8 clock;
>> +
>> + /** @pad: MBZ. */
>> + __u8 pad[4];
>
> Why not make it a __u32?
>
I don't think there's any good reason not to, will change it in the next series.
>> +
>> + /** @enable_mask: Bitmask of counters requested during the session setup. */
>> + __u64 enable_mask[2];
>> +};
>> +
>> +/**
>> + * enum drm_panthor_perf_sample_flags - Sample-wide events that occurred over the sampling
>> + * period.
>> + */
>> +enum drm_panthor_perf_sample_flags {
>> + /**
>> + * @DRM_PANTHOR_PERF_SAMPLE_OVERFLOW: This sample contains overflows due to the duration
>> + * of the sampling period.
>> + */
>> + DRM_PANTHOR_PERF_SAMPLE_OVERFLOW = 1 << 0,
>> +
>> + /**
>> + * @DRM_PANTHOR_PERF_SAMPLE_ERROR: This sample encountered an error condition during
>> + * the sample duration.
>> + */
>> + DRM_PANTHOR_PERF_SAMPLE_ERROR = 1 << 1,
>> +};
>> +
>> +/**
>> + * struct drm_panthor_perf_sample_header - Header present before every sample.
>> + */
>> +struct drm_panthor_perf_sample_header {
>> + /**
>> + * @timestamp_start_ns: Earliest timestamp that values in this sample represent, in
>> + * nanoseconds. Derived from CLOCK_MONOTONIC_RAW.
>> + */
>> + __u64 timestamp_start_ns;
>> +
>> + /**
>> + * @timestamp_end_ns: Latest timestamp that values in this sample represent, in
>> + * nanoseconds. Derived from CLOCK_MONOTONIC_RAW.
>> + */
>> + __u64 timestamp_end_ns;
>> +
>> + /** @block_set: Set of performance counter blocks. */
>> + __u8 block_set;
>> +
>> + /** @pad: MBZ. */
>> + __u8 pad[3];
>> +
>> + /** @flags: Current sample flags, combination of drm_panthor_perf_sample_flags. */
>> + __u32 flags;
>> +
>> + /**
>> + * @user_data: User data provided as part of the command that triggered this sample.
>> + *
>> + * - Automatic samples (periodic ones or those around non-counting periods or power state
>> + * transitions) will be tagged with the user_data provided as part of the
>> + * DRM_PANTHOR_PERF_COMMAND_START call.
>> + * - Manual samples will be tagged with the user_data provided with the
>> + * DRM_PANTHOR_PERF_COMMAND_SAMPLE call.
>> + * - A session's final automatic sample will be tagged with the user_data provided with the
>> + * DRM_PANTHOR_PERF_COMMAND_STOP call.
>> + */
>> + __u64 user_data;
>> +
>> + /**
>> + * @toplevel_clock_cycles: The number of cycles elapsed between
>> + * drm_panthor_perf_sample_header::timestamp_start_ns and
>> + * drm_panthor_perf_sample_header::timestamp_end_ns on the top-level clock if the
>> + * corresponding bit is set in drm_panthor_perf_info::supported_clocks.
>> + */
>> + __u64 toplevel_clock_cycles;
>> +
>> + /**
>> + * @coregroup_clock_cycles: The number of cycles elapsed between
>> + * drm_panthor_perf_sample_header::timestamp_start_ns and
>> + * drm_panthor_perf_sample_header::timestamp_end_ns on the coregroup clock if the
>> + * corresponding bit is set in drm_panthor_perf_info::supported_clocks.
>> + */
>> + __u64 coregroup_clock_cycles;
>> +
>> + /**
>> + * @shader_clock_cycles: The number of cycles elapsed between
>> + * drm_panthor_perf_sample_header::timestamp_start_ns and
>> + * drm_panthor_perf_sample_header::timestamp_end_ns on the shader core clock if the
>> + * corresponding bit is set in drm_panthor_perf_info::supported_clocks.
>> + */
>> + __u64 shader_clock_cycles;
>> +};
>> +
>> +/**
>> + * enum drm_panthor_perf_command - Command type passed to the DRM_PANTHOR_PERF_CONTROL
>> + * IOCTL.
>> + */
>> +enum drm_panthor_perf_command {
>> + /** @DRM_PANTHOR_PERF_COMMAND_SETUP: Create a new performance counter sampling context. */
>> + DRM_PANTHOR_PERF_COMMAND_SETUP,
>> +
>> + /** @DRM_PANTHOR_PERF_COMMAND_TEARDOWN: Teardown a performance counter sampling context. */
>> + DRM_PANTHOR_PERF_COMMAND_TEARDOWN,
>> +
>> + /** @DRM_PANTHOR_PERF_COMMAND_START: Start a sampling session on the indicated context. */
>> + DRM_PANTHOR_PERF_COMMAND_START,
>> +
>> + /** @DRM_PANTHOR_PERF_COMMAND_STOP: Stop the sampling session on the indicated context. */
>> + DRM_PANTHOR_PERF_COMMAND_STOP,
>> +
>> + /**
>> + * @DRM_PANTHOR_PERF_COMMAND_SAMPLE: Request a manual sample on the indicated context.
>> + *
>> + * When the sampling session is configured with a non-zero sampling frequency, any
>> + * DRM_PANTHOR_PERF_CONTROL calls with this command will be ignored and return an
>> + * -EINVAL.
>> + */
>> + DRM_PANTHOR_PERF_COMMAND_SAMPLE,
>> +};
>> +
>> +/**
>> + * struct drm_panthor_perf_control - Arguments passed to DRM_PANTHOR_IOCTL_PERF_CONTROL.
>> + */
>> +struct drm_panthor_perf_control {
>> + /** @cmd: Command from enum drm_panthor_perf_command. */
>> + __u32 cmd;
>> +
>> + /**
>> + * @handle: session handle.
>> + *
>> + * Returned by the DRM_PANTHOR_PERF_COMMAND_SETUP call.
>> + * It must be used in subsequent commands for the same context.
>> + */
>> + __u32 handle;
>> +
>> + /**
>> + * @size: size of the command structure.
>> + *
>> + * If the pointer is NULL, the size is updated by the driver to provide the size of the
>> + * output structure. If the pointer is not NULL, the driver will only copy min(size,
>> + * struct_size) to the pointer and update the size accordingly.
>> + */
>> + __u64 size;
>> +
>> + /**
>> + * @pointer: user pointer to a command type struct, such as
>> + * &struct drm_panthor_perf_cmd_start.
>> + */
>> + __u64 pointer;
>> +};
>> +
>> +/**
>> + * enum drm_panthor_perf_counter_set - The counter set to be requested from the hardware.
>> + *
>> + * The hardware supports a single performance counter set at a time, so requesting any set other
>> + * than the primary may fail if another process is sampling at the same time.
>> + *
>> + * If in doubt, the primary counter set has the most commonly used counters and requires no
>> + * additional permissions to open.
>> + */
>> +enum drm_panthor_perf_counter_set {
>> + /**
>> + * @DRM_PANTHOR_PERF_SET_PRIMARY: The default set configured on the hardware.
>> + *
>> + * This is the only set for which all counters in all blocks are defined.
>> + */
>> + DRM_PANTHOR_PERF_SET_PRIMARY,
>> +
>> + /**
>> + * @DRM_PANTHOR_PERF_SET_SECONDARY: The secondary performance counter set.
>> + *
>> + * Some blocks may not have any defined counters for this set, and the block will
>> + * have the UNAVAILABLE block state permanently set in the block header.
>> + *
>> + * Accessing this set requires the calling process to have the CAP_PERFMON capability.
>> + */
>> + DRM_PANTHOR_PERF_SET_SECONDARY,
>> +
>> + /**
>> + * @DRM_PANTHOR_PERF_SET_TERTIARY: The tertiary performance counter set.
>> + *
>> + * Some blocks may not have any defined counters for this set, and the block will have
>> + * the UNAVAILABLE block state permanently set in the block header. Note that the
>> + * tertiary set has the fewest defined counter blocks.
>> + *
>> + * Accessing this set requires the calling process to have the CAP_PERFMON capability.
>> + */
>> + DRM_PANTHOR_PERF_SET_TERTIARY,
>> +};
>> +
>> +/**
>> + * struct drm_panthor_perf_ringbuf_control - Struct used to map in the ring buffer control indices
>> + * into memory shared between user and kernel.
>> + *
>> + */
>> +struct drm_panthor_perf_ringbuf_control {
>> + /**
>> + * @extract_idx: The index of the latest sample that was processed by userspace. Only
>> + * modifiable by userspace.
>> + */
>> + __u64 extract_idx;
>> +
>> + /**
>> + * @insert_idx: The index of the latest sample emitted by the kernel. Only modifiable by
>> + * the kernel.
>> + */
>> + __u64 insert_idx;
>> +};
>> +
>> +/**
>> + * struct drm_panthor_perf_cmd_setup - Arguments passed to DRM_PANTHOR_IOCTL_PERF_CONTROL
>> + * when the DRM_PANTHOR_PERF_COMMAND_SETUP command is specified.
>> + */
>> +struct drm_panthor_perf_cmd_setup {
>> + /**
>> + * @block_set: Set of performance counter blocks, member of
>> + * enum drm_panthor_perf_counter_set.
>> + *
>> + * This is a global configuration and only one set can be active at a time. If
>> + * another client has already requested a counter set, any further requests
>> + * for a different counter set will fail and return an -EBUSY.
>> + *
>> + * If the requested set does not exist, the request will fail and return an -EINVAL.
>> + *
>> + * Some sets have additional requirements to be enabled, and the setup request will
>> + * fail with an -EACCES if these requirements are not satisfied.
>> + */
>> + __u8 block_set;
>> +
>> + /** @pad: MBZ. */
>> + __u8 pad[7];
>> +
>> + /** @fd: eventfd for signalling the availability of a new sample. */
>> + __u32 fd;
>> +
>> + /** @ringbuf_handle: Handle to the BO to write perf counter sample to. */
>> + __u32 ringbuf_handle;
>> +
>> + /**
>> + * @control_handle: Handle to the BO containing a contiguous 16 byte range, used for the
>> + * insert and extract indices for the ringbuffer.
>> + */
>> + __u32 control_handle;
>> +
>> + /**
>> + * @sample_slots: The number of slots available in the userspace-provided BO. Must be
>> + * a power of 2.
>> + *
>> + * If sample_slots * sample_size does not match the BO size, the setup request will fail.
>> + */
>> + __u32 sample_slots;
>> +
>> + /**
>> + * @control_offset: Offset into the control BO where the insert and extract indices are
>> + * located.
>> + */
>> + __u64 control_offset;
>> +
>> + /**
>> + * @sample_freq_ns: Period between automatic counter sample collection in nanoseconds. Zero
>> + * disables automatic collection and all collection must be done through explicit calls
>> + * to DRM_PANTHOR_PERF_CONTROL.SAMPLE. Non-zero values will disable manual counter sampling
>> + * via the DRM_PANTHOR_PERF_COMMAND_SAMPLE command.
>> + *
>> + * A value of zero disables software-triggered periodic sampling, but hardware will still trigger
>> + * automatic samples on certain events, including shader core power transitions, and
>> + * entries to and exits from non-counting periods. The final stop command will also
>> + * trigger a sample to ensure no data is lost.
>> + */
>> + __u64 sample_freq_ns;
>> +
>> + /**
>> + * @fw_enable_mask: Bitmask of counters to request from the FW counter block. Any bits
>> + * past the first drm_panthor_perf_info.counters_per_block bits will be ignored. Bit 0
>> + * corresponds to counter 0.
>> + */
>> + __u64 fw_enable_mask[2];
>> +
>> + /**
>> + * @cshw_enable_mask: Bitmask of counters to request from the CSHW counter block. Any bits
>> + * past the first drm_panthor_perf_info.counters_per_block bits will be ignored. Bit 0
>> + * corresponds to counter 0.
>> + */
>> + __u64 cshw_enable_mask[2];
>> +
>> + /**
>> + * @tiler_enable_mask: Bitmask of counters to request from the tiler counter block. Any
>> + * bits past the first drm_panthor_perf_info.counters_per_block bits will be ignored. Bit
>> + * 0 corresponds to counter 0.
>> + */
>> + __u64 tiler_enable_mask[2];
>> +
>> + /**
>> + * @memsys_enable_mask: Bitmask of counters to request from the memsys counter blocks. Any
>> + * bits past the first drm_panthor_perf_info.counters_per_block bits will be ignored. Bit 0
>> + * corresponds to counter 0.
>> + */
>> + __u64 memsys_enable_mask[2];
>> +
>> + /**
>> + * @shader_enable_mask: Bitmask of counters to request from the shader core counter blocks.
>> + * Any bits past the first drm_panthor_perf_info.counters_per_block bits will be ignored.
>> + * Bit 0 corresponds to counter 0.
>> + */
>> + __u64 shader_enable_mask[2];
>> +};
>> +
>> +/**
>> + * struct drm_panthor_perf_cmd_start - Arguments passed to DRM_PANTHOR_IOCTL_PERF_CONTROL
>> + * when the DRM_PANTHOR_PERF_COMMAND_START command is specified.
>> + */
>> +struct drm_panthor_perf_cmd_start {
>> + /**
>> + * @user_data: User provided data that will be attached to automatic samples collected
>> + * until the next DRM_PANTHOR_PERF_COMMAND_STOP.
>> + */
>> + __u64 user_data;
>> +};
>> +
>> +/**
>> + * struct drm_panthor_perf_cmd_stop - Arguments passed to DRM_PANTHOR_IOCTL_PERF_CONTROL
>> + * when the DRM_PANTHOR_PERF_COMMAND_STOP command is specified.
>> + */
>> +struct drm_panthor_perf_cmd_stop {
>> + /**
>> + * @user_data: User provided data that will be attached to the automatic sample collected
>> + * at the end of this sampling session.
>> + */
>> + __u64 user_data;
>> +};
>> +
>> +/**
>> + * struct drm_panthor_perf_cmd_sample - Arguments passed to DRM_PANTHOR_IOCTL_PERF_CONTROL
>> + * when the DRM_PANTHOR_PERF_COMMAND_SAMPLE command is specified.
>> + */
>> +struct drm_panthor_perf_cmd_sample {
>> + /** @user_data: User provided data that will be attached to the sample.*/
>> + __u64 user_data;
>> +};
>> +
>> /**
>> * DRM_IOCTL_PANTHOR() - Build a Panthor IOCTL number
>> * @__access: Access type. Must be R, W or RW.
>> @@ -1237,6 +1800,8 @@ enum {
>> DRM_IOCTL_PANTHOR(WR, BO_SYNC, bo_sync),
>> DRM_IOCTL_PANTHOR_BO_QUERY_INFO =
>> DRM_IOCTL_PANTHOR(WR, BO_QUERY_INFO, bo_query_info),
>> + DRM_IOCTL_PANTHOR_PERF_CONTROL =
>> + DRM_IOCTL_PANTHOR(WR, PERF_CONTROL, perf_control)
>> };
>>
>> #if defined(__cplusplus)
>
On Mon, 15 Dec 2025 17:14:47 +0000
Lukas Zapolskas <lukas.zapolskas@arm.com> wrote:
> This patch extends the DEV_QUERY ioctl to return information about the
> performance counter setup for userspace, and introduces the new
> ioctl DRM_PANTHOR_PERF_CONTROL in order to allow for the sampling of
> performance counters.
>
> The new design is inspired by the perf aux ringbuffer [0], with the
> insert and extract indices being mapped to userspace, allowing
> multiple samples to be exposed at any given time. To avoid pointer
> chasing, the sample metadata and block metadata are inline with
> the elements they describe.
>
> Userspace is responsible for passing in resources for samples to be
> exposed, including the event file descriptor for notification of new
> sample availability, the ringbuffer BO to store samples, and the
> control BO along with the offset for mapping the insert and extract
> indices. Though these indices are only a total of 16 bytes, userspace
> can then reuse the same physical page for tracking the state of
> multiple buffers by giving different offsets from the BO start to
> map them.
>
> [0]: https://docs.kernel.org/userspace-api/perf_ring_buffer.html
>
> Co-developed-by: Mihail Atanassov <mihail.atanassov@arm.com>
> Signed-off-by: Mihail Atanassov <mihail.atanassov@arm.com>
> Signed-off-by: Lukas Zapolskas <lukas.zapolskas@arm.com>
> Reviewed-by: Adrián Larumbe <adrian.larumbe@collabora.com>
A couple things pointed out by Adrian have not been fixed, I think (see
below).
> ---
> include/uapi/drm/panthor_drm.h | 565 +++++++++++++++++++++++++++++++++
> 1 file changed, 565 insertions(+)
>
> diff --git a/include/uapi/drm/panthor_drm.h b/include/uapi/drm/panthor_drm.h
> index e238c6264fa1..d1a92172e878 100644
> --- a/include/uapi/drm/panthor_drm.h
> +++ b/include/uapi/drm/panthor_drm.h
[...]
> +/**
> + * struct drm_panthor_perf_info - Performance counter interface information
> + *
> + * Structure grouping all queryable information relating to the performance counter
> + * interfaces.
> + */
> +struct drm_panthor_perf_info {
> + /**
> + * @counters_per_block: The number of 8-byte counters available in a block.
> + */
> + __u32 counters_per_block;
> +
> + /**
> + * @sample_header_size: The size of the header struct available at the beginning
> + * of every sample.
> + */
> + __u32 sample_header_size;
> +
> + /**
> + * @block_header_size: The size of the header struct inline with the counters for a
> + * single block.
> + */
> + __u32 block_header_size;
> +
> + /**
> + * @sample_size: The size of a fully annotated sample, starting with a sample header
> + * of size @sample_header_size bytes, and all available blocks for the current
> + * configuration, each comprised of @counters_per_block 64-bit counters and
> + * a block header of @block_header_size bytes.
> + *
> + * The user must use this field to allocate size for the ring buffer. In
> + * the case of new blocks being added, an old userspace can always use
> + * this field and ignore any blocks it does not know about.
> + */
> + __u32 sample_size;
> +
> + /** @flags: Combination of drm_panthor_perf_feat_flags flags. */
> + __u32 flags;
> +
> + /**
> + * @supported_clocks: Bitmask of the clocks supported by the GPU.
> + *
> + * Each bit represents a variant of the enum drm_panthor_perf_clock.
> + *
> + * For the same GPU, different implementers may have different clocks for the same hardware
> + * block. At the moment, up to three clocks are supported, and any clocks that are present
> + * will be reported here.
> + */
> + __u32 supported_clocks;
> +
> + /** @fw_blocks: Number of FW blocks available. */
> + __u32 fw_blocks;
> +
> + /** @cshw_blocks: Number of CSHW blocks available. */
> + __u32 cshw_blocks;
> +
> + /** @tiler_blocks: Number of tiler blocks available. */
> + __u32 tiler_blocks;
> +
> + /** @memsys_blocks: Number of memsys blocks available. */
> + __u32 memsys_blocks;
> +
> + /** @shader_blocks: Number of shader core blocks available. */
> + __u32 shader_blocks;
The struct has eleven __u32 fields (44 bytes), so you need an extra
__u32 pad;
at the end to keep its size aligned on 8 bytes.
> +};
> +
[...]
> +
> +/**
> + * struct drm_panthor_perf_ringbuf_control - Struct used to map in the ring buffer control indices
> + * into memory shared between user and kernel.
> + *
> + */
> +struct drm_panthor_perf_ringbuf_control {
> + /**
> + * @extract_idx: The index of the latest sample that was processed by userspace. Only
> + * modifiable by userspace.
> + */
> + __u64 extract_idx;
> +
> + /**
> + * @insert_idx: The index of the latest sample emitted by the kernel. Only modifiable by
> + * modifiable by the kernel.
"modifiable by" is repeated twice; this should read "Only modifiable by the kernel."
> + */
> + __u64 insert_idx;
> +};
© 2016 - 2026 Red Hat, Inc.