[PATCH v1 1/5] drm/panthor: Implement CS_FAULT propagation to userspace

Lukas Zapolskas posted 5 patches 11 hours ago
[PATCH v1 1/5] drm/panthor: Implement CS_FAULT propagation to userspace
Posted by Lukas Zapolskas 11 hours ago
From: Paul Toadere <paul.toadere@arm.com>

Though faulted queues do not prevent further submission, the
recoverable faults may have further consequences which are
worth recording and providing to the user.

Signed-off-by: Paul Toadere <paul.toadere@arm.com>
Co-developed-by: Lukas Zapolskas <lukas.zapolskas@arm.com>
Signed-off-by: Lukas Zapolskas <lukas.zapolskas@arm.com>
---
 drivers/gpu/drm/panthor/panthor_sched.c | 18 +++++++++++++++---
 include/uapi/drm/panthor_drm.h          | 11 +++++++++--
 2 files changed, 24 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
index a17b067a0439..eb8841beba39 100644
--- a/drivers/gpu/drm/panthor/panthor_sched.c
+++ b/drivers/gpu/drm/panthor/panthor_sched.c
@@ -569,6 +569,14 @@ struct panthor_group {
 	/** @fatal_queues: Bitmask reflecting the queues that hit a fatal exception. */
 	u32 fatal_queues;
 
+	/**
+	 * @fault_queues: Bitmask reflecting the queues that hit a recoverable exception.
+	 *
+	 * This field is reset when the GROUP_GET_STATE ioctl is used to collect the fault
+	 * information.
+	 */
+	u32 fault_queues;
+
 	/** @tiler_oom: Mask of queues that have a tiler OOM event to process. */
 	atomic_t tiler_oom;
 
@@ -1553,6 +1561,8 @@ cs_slot_process_fault_event_locked(struct panthor_device *ptdev,
 	if (group) {
 		drm_warn(&ptdev->base, "CS_FAULT: pid=%d, comm=%s\n",
 			 group->task_info.pid, group->task_info.comm);
+
+		group->fault_queues |= BIT(cs_id);
 	}
 
 	drm_warn(&ptdev->base,
@@ -3807,9 +3817,6 @@ int panthor_group_get_state(struct panthor_file *pfile,
 	struct panthor_scheduler *sched = ptdev->scheduler;
 	struct panthor_group *group;
 
-	if (get_state->pad)
-		return -EINVAL;
-
 	group = group_from_handle(gpool, get_state->group_handle);
 	if (!group)
 		return -EINVAL;
@@ -3825,6 +3832,11 @@ int panthor_group_get_state(struct panthor_file *pfile,
 	}
 	if (group->innocent)
 		get_state->state |= DRM_PANTHOR_GROUP_STATE_INNOCENT;
+	if (group->fault_queues) {
+		get_state->state |= DRM_PANTHOR_GROUP_STATE_QUEUE_FAULT;
+		get_state->fault_queues = group->fault_queues;
+		group->fault_queues = 0;
+	}
 	mutex_unlock(&sched->lock);
 
 	group_put(group);
diff --git a/include/uapi/drm/panthor_drm.h b/include/uapi/drm/panthor_drm.h
index e238c6264fa1..77262d2b9672 100644
--- a/include/uapi/drm/panthor_drm.h
+++ b/include/uapi/drm/panthor_drm.h
@@ -965,6 +965,13 @@ enum drm_panthor_group_state_flags {
 	 * DRM_PANTHOR_GROUP_STATE_FATAL_FAULT is not.
 	 */
 	DRM_PANTHOR_GROUP_STATE_INNOCENT = 1 << 2,
+
+	/**
+	 * @DRM_PANTHOR_GROUP_STATE_QUEUE_FAULT: Group had recoverable faults.
+	 *
+	 * When a group ends up with this flag set, jobs can still be submitted to its queues.
+	 */
+	DRM_PANTHOR_GROUP_STATE_QUEUE_FAULT = 1 << 3,
 };
 
 /**
@@ -986,8 +993,8 @@ struct drm_panthor_group_get_state {
 	/** @fatal_queues: Bitmask of queues that faced fatal faults. */
 	__u32 fatal_queues;
 
-	/** @pad: MBZ */
-	__u32 pad;
+	/** @fault_queues: Bitmask of queues that faced recoverable faults. */
+	__u32 fault_queues;
 };
 
 /**
-- 
2.33.0.dirty
Re: [PATCH v1 1/5] drm/panthor: Implement CS_FAULT propagation to userspace
Posted by Boris Brezillon 11 hours ago
On Mon, 15 Dec 2025 11:54:53 +0000
Lukas Zapolskas <lukas.zapolskas@arm.com> wrote:

> From: Paul Toadere <paul.toadere@arm.com>
> 
> Though faulted queues do not prevent further submission, the
> recoverable faults may have further consequences which are
> worth recording and providing to the user.
> 
> Signed-off-by: Paul Toadere <paul.toadere@arm.com>
> Co-developed-by: Lukas Zapolskas <lukas.zapolskas@arm.com>
> Signed-off-by: Lukas Zapolskas <lukas.zapolskas@arm.com>
> ---
>  drivers/gpu/drm/panthor/panthor_sched.c | 18 +++++++++++++++---
>  include/uapi/drm/panthor_drm.h          | 11 +++++++++--
>  2 files changed, 24 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
> index a17b067a0439..eb8841beba39 100644
> --- a/drivers/gpu/drm/panthor/panthor_sched.c
> +++ b/drivers/gpu/drm/panthor/panthor_sched.c
> @@ -569,6 +569,14 @@ struct panthor_group {
>  	/** @fatal_queues: Bitmask reflecting the queues that hit a fatal exception. */
>  	u32 fatal_queues;
>  
> +	/**
> +	 * @fault_queues: Bitmask reflecting the queues that hit a recoverable exception.
> +	 *
> +	 * This field is reset when the GROUP_GET_STATE ioctl is used to collect the fault
> +	 * information.
> +	 */
> +	u32 fault_queues;

s/fault_queues/faulty_queues/ ?

> +
>  	/** @tiler_oom: Mask of queues that have a tiler OOM event to process. */
>  	atomic_t tiler_oom;
>  
> @@ -1553,6 +1561,8 @@ cs_slot_process_fault_event_locked(struct panthor_device *ptdev,
>  	if (group) {
>  		drm_warn(&ptdev->base, "CS_FAULT: pid=%d, comm=%s\n",
>  			 group->task_info.pid, group->task_info.comm);
> +
> +		group->fault_queues |= BIT(cs_id);
>  	}
>  
>  	drm_warn(&ptdev->base,
> @@ -3807,9 +3817,6 @@ int panthor_group_get_state(struct panthor_file *pfile,
>  	struct panthor_scheduler *sched = ptdev->scheduler;
>  	struct panthor_group *group;
>  
> -	if (get_state->pad)
> -		return -EINVAL;
> -
>  	group = group_from_handle(gpool, get_state->group_handle);
>  	if (!group)
>  		return -EINVAL;
> @@ -3825,6 +3832,11 @@ int panthor_group_get_state(struct panthor_file *pfile,
>  	}
>  	if (group->innocent)
>  		get_state->state |= DRM_PANTHOR_GROUP_STATE_INNOCENT;
> +	if (group->fault_queues) {
> +		get_state->state |= DRM_PANTHOR_GROUP_STATE_QUEUE_FAULT;
> +		get_state->fault_queues = group->fault_queues;
> +		group->fault_queues = 0;
> +	}
>  	mutex_unlock(&sched->lock);
>  
>  	group_put(group);
> diff --git a/include/uapi/drm/panthor_drm.h b/include/uapi/drm/panthor_drm.h
> index e238c6264fa1..77262d2b9672 100644
> --- a/include/uapi/drm/panthor_drm.h
> +++ b/include/uapi/drm/panthor_drm.h
> @@ -965,6 +965,13 @@ enum drm_panthor_group_state_flags {
>  	 * DRM_PANTHOR_GROUP_STATE_FATAL_FAULT is not.
>  	 */
>  	DRM_PANTHOR_GROUP_STATE_INNOCENT = 1 << 2,
> +
> +	/**
> +	 * @DRM_PANTHOR_GROUP_STATE_QUEUE_FAULT: Group had recoverable faults.
> +	 *
> +	 * When a group ends up with this flag set, jobs can still be submitted to its queues.
> +	 */
> +	DRM_PANTHOR_GROUP_STATE_QUEUE_FAULT = 1 << 3,
>  };
>  
>  /**
> @@ -986,8 +993,8 @@ struct drm_panthor_group_get_state {
>  	/** @fatal_queues: Bitmask of queues that faced fatal faults. */
>  	__u32 fatal_queues;
>  
> -	/** @pad: MBZ */
> -	__u32 pad;
> +	/** @fatal_queues: Bitmask of queues that faced fatal faults. */

s/fatal/recoverable/

> +	__u32 fault_queues;
>  };
>  
>  /**