To notify userspace about which task (if any) made the device get into a
wedged state, make use of the drm_wedge_task_info parameter, filling it with
the task PID and name.
Signed-off-by: André Almeida <andrealmeid@igalia.com>
---
v8:
- Drop check before calling amdgpu_vm_put_task_info()
- Drop local variable `info`
v7:
- Remove struct cast, now we can use `info = &ti->task`
- Fix struct lifetime, move amdgpu_vm_put_task_info() after
drm_dev_wedged_event() call
---
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 13 +++++++++++--
drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 7 +++++--
2 files changed, 16 insertions(+), 4 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 8a0f36f33f13..a59f194e3360 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -6363,8 +6363,17 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
atomic_set(&adev->reset_domain->reset_res, r);
- if (!r)
- drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE, NULL);
+ if (!r) {
+ struct amdgpu_task_info *ti = NULL;
+
+ if (job)
+ ti = amdgpu_vm_get_task_info_pasid(adev, job->pasid);
+
+ drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE,
+ ti ? &ti->task : NULL);
+
+ amdgpu_vm_put_task_info(ti);
+ }
return r;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index 0c1381b527fe..1e24590ae144 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -89,6 +89,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
{
struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched);
struct amdgpu_job *job = to_amdgpu_job(s_job);
+ struct drm_wedge_task_info *info = NULL;
struct amdgpu_task_info *ti;
struct amdgpu_device *adev = ring->adev;
int idx;
@@ -125,7 +126,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
ti = amdgpu_vm_get_task_info_pasid(ring->adev, job->pasid);
if (ti) {
amdgpu_vm_print_task_info(adev, ti);
- amdgpu_vm_put_task_info(ti);
+ info = &ti->task;
}
/* attempt a per ring reset */
@@ -164,13 +165,15 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
if (amdgpu_ring_sched_ready(ring))
drm_sched_start(&ring->sched, 0);
dev_err(adev->dev, "Ring %s reset succeeded\n", ring->sched.name);
- drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE, NULL);
+ drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE, info);
goto exit;
}
dev_err(adev->dev, "Ring %s reset failure\n", ring->sched.name);
}
dma_fence_set_error(&s_job->s_fence->finished, -ETIME);
+ amdgpu_vm_put_task_info(ti);
+
if (amdgpu_device_should_recover_gpu(ring->adev)) {
struct amdgpu_reset_context reset_context;
memset(&reset_context, 0, sizeof(reset_context));
--
2.49.0
On 6/17/25 14:49, André Almeida wrote: > To notify userspace about which task (if any) made the device get in a > wedge state, make use of drm_wedge_task_info parameter, filling it with > the task PID and name. > > Signed-off-by: André Almeida <andrealmeid@igalia.com> Reviewed-by: Christian König <christian.koenig@amd.com> Do you have commit right for drm-misc-next? Regards, Christian. > --- > v8: > - Drop check before calling amdgpu_vm_put_task_info() > - Drop local variable `info` > v7: > - Remove struct cast, now we can use `info = &ti->task` > - Fix struct lifetime, move amdgpu_vm_put_task_info() after > drm_dev_wedged_event() call > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 13 +++++++++++-- > drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 7 +++++-- > 2 files changed, 16 insertions(+), 4 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > index 8a0f36f33f13..a59f194e3360 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > @@ -6363,8 +6363,17 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, > > atomic_set(&adev->reset_domain->reset_res, r); > > - if (!r) > - drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE, NULL); > + if (!r) { > + struct amdgpu_task_info *ti = NULL; > + > + if (job) > + ti = amdgpu_vm_get_task_info_pasid(adev, job->pasid); > + > + drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE, > + ti ? 
&ti->task : NULL); > + > + amdgpu_vm_put_task_info(ti); > + } > > return r; > } > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > index 0c1381b527fe..1e24590ae144 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > @@ -89,6 +89,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job) > { > struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched); > struct amdgpu_job *job = to_amdgpu_job(s_job); > + struct drm_wedge_task_info *info = NULL; > struct amdgpu_task_info *ti; > struct amdgpu_device *adev = ring->adev; > int idx; > @@ -125,7 +126,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job) > ti = amdgpu_vm_get_task_info_pasid(ring->adev, job->pasid); > if (ti) { > amdgpu_vm_print_task_info(adev, ti); > - amdgpu_vm_put_task_info(ti); > + info = &ti->task; > } > > /* attempt a per ring reset */ > @@ -164,13 +165,15 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job) > if (amdgpu_ring_sched_ready(ring)) > drm_sched_start(&ring->sched, 0); > dev_err(adev->dev, "Ring %s reset succeeded\n", ring->sched.name); > - drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE, NULL); > + drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE, info); > goto exit; > } > dev_err(adev->dev, "Ring %s reset failure\n", ring->sched.name); > } > dma_fence_set_error(&s_job->s_fence->finished, -ETIME); > > + amdgpu_vm_put_task_info(ti); > + > if (amdgpu_device_should_recover_gpu(ring->adev)) { > struct amdgpu_reset_context reset_context; > memset(&reset_context, 0, sizeof(reset_context));
Em 17/06/2025 10:07, Christian König escreveu: > On 6/17/25 14:49, André Almeida wrote: >> To notify userspace about which task (if any) made the device get in a >> wedge state, make use of drm_wedge_task_info parameter, filling it with >> the task PID and name. >> >> Signed-off-by: André Almeida <andrealmeid@igalia.com> > > Reviewed-by: Christian König <christian.koenig@amd.com> > > Do you have commit right for drm-misc-next? > I've merged the series into drm-misc-next. Thanks! André
Em 17/06/2025 10:07, Christian König escreveu: > On 6/17/25 14:49, André Almeida wrote: >> To notify userspace about which task (if any) made the device get in a >> wedge state, make use of drm_wedge_task_info parameter, filling it with >> the task PID and name. >> >> Signed-off-by: André Almeida <andrealmeid@igalia.com> > > Reviewed-by: Christian König <christian.koenig@amd.com> > > Do you have commit right for drm-misc-next? > Thanks for the reviews! I do have access, but if you don't mind, can you push this one? > Regards, > Christian. > >> --- >> v8: >> - Drop check before calling amdgpu_vm_put_task_info() >> - Drop local variable `info` >> v7: >> - Remove struct cast, now we can use `info = &ti->task` >> - Fix struct lifetime, move amdgpu_vm_put_task_info() after >> drm_dev_wedged_event() call >> --- >> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 13 +++++++++++-- >> drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 7 +++++-- >> 2 files changed, 16 insertions(+), 4 deletions(-) >> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> index 8a0f36f33f13..a59f194e3360 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> @@ -6363,8 +6363,17 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, >> >> atomic_set(&adev->reset_domain->reset_res, r); >> >> - if (!r) >> - drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE, NULL); >> + if (!r) { >> + struct amdgpu_task_info *ti = NULL; >> + >> + if (job) >> + ti = amdgpu_vm_get_task_info_pasid(adev, job->pasid); >> + >> + drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE, >> + ti ? 
&ti->task : NULL); >> + >> + amdgpu_vm_put_task_info(ti); >> + } >> >> return r; >> } >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c >> index 0c1381b527fe..1e24590ae144 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c >> @@ -89,6 +89,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job) >> { >> struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched); >> struct amdgpu_job *job = to_amdgpu_job(s_job); >> + struct drm_wedge_task_info *info = NULL; >> struct amdgpu_task_info *ti; >> struct amdgpu_device *adev = ring->adev; >> int idx; >> @@ -125,7 +126,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job) >> ti = amdgpu_vm_get_task_info_pasid(ring->adev, job->pasid); >> if (ti) { >> amdgpu_vm_print_task_info(adev, ti); >> - amdgpu_vm_put_task_info(ti); >> + info = &ti->task; >> } >> >> /* attempt a per ring reset */ >> @@ -164,13 +165,15 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job) >> if (amdgpu_ring_sched_ready(ring)) >> drm_sched_start(&ring->sched, 0); >> dev_err(adev->dev, "Ring %s reset succeeded\n", ring->sched.name); >> - drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE, NULL); >> + drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE, info); >> goto exit; >> } >> dev_err(adev->dev, "Ring %s reset failure\n", ring->sched.name); >> } >> dma_fence_set_error(&s_job->s_fence->finished, -ETIME); >> >> + amdgpu_vm_put_task_info(ti); >> + >> if (amdgpu_device_should_recover_gpu(ring->adev)) { >> struct amdgpu_reset_context reset_context; >> memset(&reset_context, 0, sizeof(reset_context)); >
On 6/17/25 15:22, André Almeida wrote: > Em 17/06/2025 10:07, Christian König escreveu: >> On 6/17/25 14:49, André Almeida wrote: >>> To notify userspace about which task (if any) made the device get in a >>> wedge state, make use of drm_wedge_task_info parameter, filling it with >>> the task PID and name. >>> >>> Signed-off-by: André Almeida <andrealmeid@igalia.com> >> >> Reviewed-by: Christian König <christian.koenig@amd.com> >> >> Do you have commit right for drm-misc-next? >> > > Thanks for the reviews! > > I do have access, but if you don't mind, can you push this one? Sure, but give me till the end of today. (And maybe ping me next week should I forget about it). Regards, Christian. > >> Regards, >> Christian. >> >>> --- >>> v8: >>> - Drop check before calling amdgpu_vm_put_task_info() >>> - Drop local variable `info` >>> v7: >>> - Remove struct cast, now we can use `info = &ti->task` >>> - Fix struct lifetime, move amdgpu_vm_put_task_info() after >>> drm_dev_wedged_event() call >>> --- >>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 13 +++++++++++-- >>> drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 7 +++++-- >>> 2 files changed, 16 insertions(+), 4 deletions(-) >>> >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>> index 8a0f36f33f13..a59f194e3360 100644 >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>> @@ -6363,8 +6363,17 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, >>> atomic_set(&adev->reset_domain->reset_res, r); >>> - if (!r) >>> - drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE, NULL); >>> + if (!r) { >>> + struct amdgpu_task_info *ti = NULL; >>> + >>> + if (job) >>> + ti = amdgpu_vm_get_task_info_pasid(adev, job->pasid); >>> + >>> + drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE, >>> + ti ? 
&ti->task : NULL); >>> + >>> + amdgpu_vm_put_task_info(ti); >>> + } >>> return r; >>> } >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c >>> index 0c1381b527fe..1e24590ae144 100644 >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c >>> @@ -89,6 +89,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job) >>> { >>> struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched); >>> struct amdgpu_job *job = to_amdgpu_job(s_job); >>> + struct drm_wedge_task_info *info = NULL; >>> struct amdgpu_task_info *ti; >>> struct amdgpu_device *adev = ring->adev; >>> int idx; >>> @@ -125,7 +126,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job) >>> ti = amdgpu_vm_get_task_info_pasid(ring->adev, job->pasid); >>> if (ti) { >>> amdgpu_vm_print_task_info(adev, ti); >>> - amdgpu_vm_put_task_info(ti); >>> + info = &ti->task; >>> } >>> /* attempt a per ring reset */ >>> @@ -164,13 +165,15 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job) >>> if (amdgpu_ring_sched_ready(ring)) >>> drm_sched_start(&ring->sched, 0); >>> dev_err(adev->dev, "Ring %s reset succeeded\n", ring->sched.name); >>> - drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE, NULL); >>> + drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE, info); >>> goto exit; >>> } >>> dev_err(adev->dev, "Ring %s reset failure\n", ring->sched.name); >>> } >>> dma_fence_set_error(&s_job->s_fence->finished, -ETIME); >>> + amdgpu_vm_put_task_info(ti); >>> + >>> if (amdgpu_device_should_recover_gpu(ring->adev)) { >>> struct amdgpu_reset_context reset_context; >>> memset(&reset_context, 0, sizeof(reset_context)); >> >
Hi Christian, Em 18/06/2025 04:29, Christian König escreveu: > On 6/17/25 15:22, André Almeida wrote: >> Em 17/06/2025 10:07, Christian König escreveu: >>> On 6/17/25 14:49, André Almeida wrote: >>>> To notify userspace about which task (if any) made the device get in a >>>> wedge state, make use of drm_wedge_task_info parameter, filling it with >>>> the task PID and name. >>>> >>>> Signed-off-by: André Almeida <andrealmeid@igalia.com> >>> >>> Reviewed-by: Christian König <christian.koenig@amd.com> >>> >>> Do you have commit right for drm-misc-next? >>> >> >> Thanks for the reviews! >> >> I do have access, but if you don't mind, can you push this one? > > Sure, but give me till the end of today. > It was already merged, no worries!
Em 17/06/2025 10:22, André Almeida escreveu: > Em 17/06/2025 10:07, Christian König escreveu: >> On 6/17/25 14:49, André Almeida wrote: >>> To notify userspace about which task (if any) made the device get in a >>> wedge state, make use of drm_wedge_task_info parameter, filling it with >>> the task PID and name. >>> >>> Signed-off-by: André Almeida <andrealmeid@igalia.com> >> >> Reviewed-by: Christian König <christian.koenig@amd.com> >> >> Do you have commit right for drm-misc-next? >> > > Thanks for the reviews! > > I do have access, but if you don't mind, can you push this one? > Never mind, I can push this one myself :)
© 2016 - 2025 Red Hat, Inc.