[v2] panthor: print task pid and comm on gpu errors

[PATCH v2 3/3] panthor: dump task pid and comm on gpu errors

Posted by Chia-I Wu 7 months ago

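It is useful to know which tasks cause GPU errors. Print the pid and comm
saved in the group's task_info when handling CS_FATAL, CS_FAULT, and CSG
progress timer events, and when a job times out. The job timeout message
also includes the seqno of the job's done fence.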

Signed-off-by: Chia-I Wu <olvaffe@gmail.com>
---
 drivers/gpu/drm/panthor/panthor_sched.c | 24 +++++++++++++++++++-----
 1 file changed, 19 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
index 823b0fe678ba6..47912b06ec9d3 100644
--- a/drivers/gpu/drm/panthor/panthor_sched.c
+++ b/drivers/gpu/drm/panthor/panthor_sched.c
@@ -1364,8 +1364,12 @@ cs_slot_process_fatal_event_locked(struct panthor_device *ptdev,
 	fatal = cs_iface->output->fatal;
 	info = cs_iface->output->fatal_info;
 
-	if (group)
+	if (group) {
+		drm_warn(&ptdev->base, "CS_FATAL: pid=%d, comm=%s\n",
+			 group->task_info.pid, group->task_info.comm);
+
 		group->fatal_queues |= BIT(cs_id);
+	}
 
 	if (CS_EXCEPTION_TYPE(fatal) == DRM_PANTHOR_EXCEPTION_CS_UNRECOVERABLE) {
 		/* If this exception is unrecoverable, queue a reset, and make
@@ -1425,6 +1429,11 @@ cs_slot_process_fault_event_locked(struct panthor_device *ptdev,
 		spin_unlock(&queue->fence_ctx.lock);
 	}
 
+	if (group) {
+		drm_warn(&ptdev->base, "CS_FAULT: pid=%d, comm=%s\n",
+			 group->task_info.pid, group->task_info.comm);
+	}
+
 	drm_warn(&ptdev->base,
 		 "CSG slot %d CS slot: %d\n"
 		 "CS_FAULT.EXCEPTION_TYPE: 0x%x (%s)\n"
@@ -1641,11 +1650,15 @@ csg_slot_process_progress_timer_event_locked(struct panthor_device *ptdev, u32 c
 
 	lockdep_assert_held(&sched->lock);
 
-	drm_warn(&ptdev->base, "CSG slot %d progress timeout\n", csg_id);
-
 	group = csg_slot->group;
-	if (!drm_WARN_ON(&ptdev->base, !group))
+	if (!drm_WARN_ON(&ptdev->base, !group)) {
+		drm_warn(&ptdev->base, "CSG_PROGRESS_TIMER_EVENT: pid=%d, comm=%s\n",
+			 group->task_info.pid, group->task_info.comm);
+
 		group->timedout = true;
+	}
+
+	drm_warn(&ptdev->base, "CSG slot %d progress timeout\n", csg_id);
 
 	sched_queue_delayed_work(sched, tick, 0);
 }
@@ -3227,7 +3240,8 @@ queue_timedout_job(struct drm_sched_job *sched_job)
 	struct panthor_scheduler *sched = ptdev->scheduler;
 	struct panthor_queue *queue = group->queues[job->queue_idx];
 
-	drm_warn(&ptdev->base, "job timeout\n");
+	drm_warn(&ptdev->base, "job timeout: pid=%d, comm=%s, seqno=%llu\n",
+		 group->task_info.pid, group->task_info.comm, job->done_fence->seqno);
 
 	drm_WARN_ON(&ptdev->base, atomic_read(&sched->reset.in_progress));
 
-- 
2.50.0.727.gbf7dc18ff4-goog

Re: [PATCH v2 3/3] panthor: dump task pid and comm on gpu errors

Posted by Steven Price 6 months, 3 weeks ago

On 13/07/2025 04:08, Chia-I Wu wrote:
> It is useful to know which tasks cause gpu errors.
> 
> Signed-off-by: Chia-I Wu <olvaffe@gmail.com>

Reviewed-by: Steven Price <steven.price@arm.com>

> ---
>  drivers/gpu/drm/panthor/panthor_sched.c | 24 +++++++++++++++++++-----
>  1 file changed, 19 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
> index 823b0fe678ba6..47912b06ec9d3 100644
> --- a/drivers/gpu/drm/panthor/panthor_sched.c
> +++ b/drivers/gpu/drm/panthor/panthor_sched.c
> @@ -1364,8 +1364,12 @@ cs_slot_process_fatal_event_locked(struct panthor_device *ptdev,
>  	fatal = cs_iface->output->fatal;
>  	info = cs_iface->output->fatal_info;
>  
> -	if (group)
> +	if (group) {
> +		drm_warn(&ptdev->base, "CS_FATAL: pid=%d, comm=%s\n",
> +			 group->task_info.pid, group->task_info.comm);
> +
>  		group->fatal_queues |= BIT(cs_id);
> +	}
>  
>  	if (CS_EXCEPTION_TYPE(fatal) == DRM_PANTHOR_EXCEPTION_CS_UNRECOVERABLE) {
>  		/* If this exception is unrecoverable, queue a reset, and make
> @@ -1425,6 +1429,11 @@ cs_slot_process_fault_event_locked(struct panthor_device *ptdev,
>  		spin_unlock(&queue->fence_ctx.lock);
>  	}
>  
> +	if (group) {
> +		drm_warn(&ptdev->base, "CS_FAULT: pid=%d, comm=%s\n",
> +			 group->task_info.pid, group->task_info.comm);
> +	}
> +
>  	drm_warn(&ptdev->base,
>  		 "CSG slot %d CS slot: %d\n"
>  		 "CS_FAULT.EXCEPTION_TYPE: 0x%x (%s)\n"
> @@ -1641,11 +1650,15 @@ csg_slot_process_progress_timer_event_locked(struct panthor_device *ptdev, u32 c
>  
>  	lockdep_assert_held(&sched->lock);
>  
> -	drm_warn(&ptdev->base, "CSG slot %d progress timeout\n", csg_id);
> -
>  	group = csg_slot->group;
> -	if (!drm_WARN_ON(&ptdev->base, !group))
> +	if (!drm_WARN_ON(&ptdev->base, !group)) {
> +		drm_warn(&ptdev->base, "CSG_PROGRESS_TIMER_EVENT: pid=%d, comm=%s\n",
> +			 group->task_info.pid, group->task_info.comm);
> +
>  		group->timedout = true;
> +	}
> +
> +	drm_warn(&ptdev->base, "CSG slot %d progress timeout\n", csg_id);
>  
>  	sched_queue_delayed_work(sched, tick, 0);
>  }
> @@ -3227,7 +3240,8 @@ queue_timedout_job(struct drm_sched_job *sched_job)
>  	struct panthor_scheduler *sched = ptdev->scheduler;
>  	struct panthor_queue *queue = group->queues[job->queue_idx];
>  
> -	drm_warn(&ptdev->base, "job timeout\n");
> +	drm_warn(&ptdev->base, "job timeout: pid=%d, comm=%s, seqno=%llu\n",
> +		 group->task_info.pid, group->task_info.comm, job->done_fence->seqno);
>  
>  	drm_WARN_ON(&ptdev->base, atomic_read(&sched->reset.in_progress));
>

[PATCH v2 1/3] panthor: set owner field for driver fops
[PATCH v2 2/3] panthor: save task pid and comm in panthor_group
[PATCH v2 3/3] panthor: dump task pid and comm on gpu errors