It is useful to know which tasks cause GPU errors.
Signed-off-by: Chia-I Wu <olvaffe@gmail.com>
---
drivers/gpu/drm/panthor/panthor_sched.c | 24 +++++++++++++++++++-----
1 file changed, 19 insertions(+), 5 deletions(-)
diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
index 823b0fe678ba6..47912b06ec9d3 100644
--- a/drivers/gpu/drm/panthor/panthor_sched.c
+++ b/drivers/gpu/drm/panthor/panthor_sched.c
@@ -1364,8 +1364,12 @@ cs_slot_process_fatal_event_locked(struct panthor_device *ptdev,
fatal = cs_iface->output->fatal;
info = cs_iface->output->fatal_info;
- if (group)
+ if (group) {
+ drm_warn(&ptdev->base, "CS_FATAL: pid=%d, comm=%s\n",
+ group->task_info.pid, group->task_info.comm);
+
group->fatal_queues |= BIT(cs_id);
+ }
if (CS_EXCEPTION_TYPE(fatal) == DRM_PANTHOR_EXCEPTION_CS_UNRECOVERABLE) {
/* If this exception is unrecoverable, queue a reset, and make
@@ -1425,6 +1429,11 @@ cs_slot_process_fault_event_locked(struct panthor_device *ptdev,
spin_unlock(&queue->fence_ctx.lock);
}
+ if (group) {
+ drm_warn(&ptdev->base, "CS_FAULT: pid=%d, comm=%s\n",
+ group->task_info.pid, group->task_info.comm);
+ }
+
drm_warn(&ptdev->base,
"CSG slot %d CS slot: %d\n"
"CS_FAULT.EXCEPTION_TYPE: 0x%x (%s)\n"
@@ -1641,11 +1650,15 @@ csg_slot_process_progress_timer_event_locked(struct panthor_device *ptdev, u32 c
lockdep_assert_held(&sched->lock);
- drm_warn(&ptdev->base, "CSG slot %d progress timeout\n", csg_id);
-
group = csg_slot->group;
- if (!drm_WARN_ON(&ptdev->base, !group))
+ if (!drm_WARN_ON(&ptdev->base, !group)) {
+ drm_warn(&ptdev->base, "CSG_PROGRESS_TIMER_EVENT: pid=%d, comm=%s\n",
+ group->task_info.pid, group->task_info.comm);
+
group->timedout = true;
+ }
+
+ drm_warn(&ptdev->base, "CSG slot %d progress timeout\n", csg_id);
sched_queue_delayed_work(sched, tick, 0);
}
@@ -3227,7 +3240,8 @@ queue_timedout_job(struct drm_sched_job *sched_job)
struct panthor_scheduler *sched = ptdev->scheduler;
struct panthor_queue *queue = group->queues[job->queue_idx];
- drm_warn(&ptdev->base, "job timeout\n");
+ drm_warn(&ptdev->base, "job timeout: pid=%d, comm=%s, seqno=%llu\n",
+ group->task_info.pid, group->task_info.comm, job->done_fence->seqno);
drm_WARN_ON(&ptdev->base, atomic_read(&sched->reset.in_progress));
--
2.50.0.727.gbf7dc18ff4-goog
On 13/07/2025 04:08, Chia-I Wu wrote:
> It is useful to know which tasks cause gpu errors.
>
> Signed-off-by: Chia-I Wu <olvaffe@gmail.com>

Reviewed-by: Steven Price <steven.price@arm.com>

> ---
> drivers/gpu/drm/panthor/panthor_sched.c | 24 +++++++++++++++++++-----
> 1 file changed, 19 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
> index 823b0fe678ba6..47912b06ec9d3 100644
> --- a/drivers/gpu/drm/panthor/panthor_sched.c
> +++ b/drivers/gpu/drm/panthor/panthor_sched.c
> @@ -1364,8 +1364,12 @@ cs_slot_process_fatal_event_locked(struct panthor_device *ptdev,
> fatal = cs_iface->output->fatal;
> info = cs_iface->output->fatal_info;
>
> - if (group)
> + if (group) {
> + drm_warn(&ptdev->base, "CS_FATAL: pid=%d, comm=%s\n",
> + group->task_info.pid, group->task_info.comm);
> +
> group->fatal_queues |= BIT(cs_id);
> + }
>
> if (CS_EXCEPTION_TYPE(fatal) == DRM_PANTHOR_EXCEPTION_CS_UNRECOVERABLE) {
> /* If this exception is unrecoverable, queue a reset, and make
> @@ -1425,6 +1429,11 @@ cs_slot_process_fault_event_locked(struct panthor_device *ptdev,
> spin_unlock(&queue->fence_ctx.lock);
> }
>
> + if (group) {
> + drm_warn(&ptdev->base, "CS_FAULT: pid=%d, comm=%s\n",
> + group->task_info.pid, group->task_info.comm);
> + }
> +
> drm_warn(&ptdev->base,
> "CSG slot %d CS slot: %d\n"
> "CS_FAULT.EXCEPTION_TYPE: 0x%x (%s)\n"
> @@ -1641,11 +1650,15 @@ csg_slot_process_progress_timer_event_locked(struct panthor_device *ptdev, u32 c
>
> lockdep_assert_held(&sched->lock);
>
> - drm_warn(&ptdev->base, "CSG slot %d progress timeout\n", csg_id);
> -
> group = csg_slot->group;
> - if (!drm_WARN_ON(&ptdev->base, !group))
> + if (!drm_WARN_ON(&ptdev->base, !group)) {
> + drm_warn(&ptdev->base, "CSG_PROGRESS_TIMER_EVENT: pid=%d, comm=%s\n",
> + group->task_info.pid, group->task_info.comm);
> +
> group->timedout = true;
> + }
> +
> + drm_warn(&ptdev->base, "CSG slot %d progress timeout\n", csg_id);
>
> sched_queue_delayed_work(sched, tick, 0);
> }
> @@ -3227,7 +3240,8 @@ queue_timedout_job(struct drm_sched_job *sched_job)
> struct panthor_scheduler *sched = ptdev->scheduler;
> struct panthor_queue *queue = group->queues[job->queue_idx];
>
> - drm_warn(&ptdev->base, "job timeout\n");
> + drm_warn(&ptdev->base, "job timeout: pid=%d, comm=%s, seqno=%llu\n",
> + group->task_info.pid, group->task_info.comm, job->done_fence->seqno);
>
> drm_WARN_ON(&ptdev->base, atomic_read(&sched->reset.in_progress));
>
© 2016 - 2025 Red Hat, Inc.