drivers/gpu/drm/scheduler/sched_entity.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-)
The drm_sched_job_unschedulable trace point can access
entity->dependency after it was cleared by the callback
installed in drm_sched_entity_add_dependency_cb, causing:
BUG: kernel NULL pointer dereference, address: 0000000000000020
[...]
Workqueue: comp_1.1.0 drm_sched_run_job_work [gpu_sched]
RIP: 0010:trace_event_raw_event_drm_sched_job_unschedulable+0x70/0xd0 [gpu_sched]
To fix this we either need to keep a reference to the fence before
setting up the callbacks, or move the trace_drm_sched_job_unschedulable
calls into drm_sched_entity_add_dependency_cb where they can be
done earlier.
Fixes: 76d97c870f29 ("drm/sched: Trace dependencies for GPU jobs")
Signed-off-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com>
---
drivers/gpu/drm/scheduler/sched_entity.c | 11 +++++++----
1 file changed, 7 insertions(+), 4 deletions(-)
diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c
index 8867b95ab089..3d06f72531ba 100644
--- a/drivers/gpu/drm/scheduler/sched_entity.c
+++ b/drivers/gpu/drm/scheduler/sched_entity.c
@@ -391,7 +391,8 @@ EXPORT_SYMBOL(drm_sched_entity_set_priority);
* Add a callback to the current dependency of the entity to wake up the
* scheduler when the entity becomes available.
*/
-static bool drm_sched_entity_add_dependency_cb(struct drm_sched_entity *entity)
+static bool drm_sched_entity_add_dependency_cb(struct drm_sched_entity *entity,
+ struct drm_sched_job *sched_job)
{
struct drm_gpu_scheduler *sched = entity->rq->sched;
struct dma_fence *fence = entity->dependency;
@@ -421,6 +422,10 @@ static bool drm_sched_entity_add_dependency_cb(struct drm_sched_entity *entity)
entity->dependency = fence;
}
+ if (trace_drm_sched_job_unschedulable_enabled() &&
+ !test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &entity->dependency->flags))
+ trace_drm_sched_job_unschedulable(sched_job, entity->dependency);
+
if (!dma_fence_add_callback(entity->dependency, &entity->cb,
drm_sched_entity_wakeup))
return true;
@@ -461,10 +466,8 @@ struct drm_sched_job *drm_sched_entity_pop_job(struct drm_sched_entity *entity)
while ((entity->dependency =
drm_sched_job_dependency(sched_job, entity))) {
- if (drm_sched_entity_add_dependency_cb(entity)) {
- trace_drm_sched_job_unschedulable(sched_job, entity->dependency);
+ if (drm_sched_entity_add_dependency_cb(entity, sched_job))
return NULL;
- }
}
/* skip jobs from entity that marked guilty */
--
2.43.0
On Mon, 2025-09-01 at 14:40 +0200, Pierre-Eric Pelloux-Prayer wrote: > The drm_sched_job_unschedulable trace point can access > entity->dependency after it was cleared by the callback > installed in drm_sched_entity_add_dependency_cb, causing: > > BUG: kernel NULL pointer dereference, address: 0000000000000020 > [...] > Workqueue: comp_1.1.0 drm_sched_run_job_work [gpu_sched] > RIP: 0010:trace_event_raw_event_drm_sched_job_unschedulable+0x70/0xd0 [gpu_sched] > > To fix this we either need to keep a reference to the fence before > setting up the callbacks, or move the trace_drm_sched_job_unschedulable > calls into drm_sched_entity_add_dependency_cb where they can be > done earlier. > > Fixes: 76d97c870f29 ("drm/sched: Trace dependencies for GPU jobs") > > Signed-off-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> > Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com> Applied to drm-misc-next Thx P. > --- > drivers/gpu/drm/scheduler/sched_entity.c | 11 +++++++---- > 1 file changed, 7 insertions(+), 4 deletions(-) > > diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c > index 8867b95ab089..3d06f72531ba 100644 > --- a/drivers/gpu/drm/scheduler/sched_entity.c > +++ b/drivers/gpu/drm/scheduler/sched_entity.c > @@ -391,7 +391,8 @@ EXPORT_SYMBOL(drm_sched_entity_set_priority); > * Add a callback to the current dependency of the entity to wake up the > * scheduler when the entity becomes available. > */ > -static bool drm_sched_entity_add_dependency_cb(struct drm_sched_entity *entity) > +static bool drm_sched_entity_add_dependency_cb(struct drm_sched_entity *entity, > + struct drm_sched_job *sched_job) > { > struct drm_gpu_scheduler *sched = entity->rq->sched; > struct dma_fence *fence = entity->dependency; > @@ -421,6 +422,10 @@ static bool drm_sched_entity_add_dependency_cb(struct drm_sched_entity *entity) > entity->dependency = fence; > } > > + if (trace_drm_sched_job_unschedulable_enabled() && > + !test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &entity->dependency->flags)) > + trace_drm_sched_job_unschedulable(sched_job, entity->dependency); > + > if (!dma_fence_add_callback(entity->dependency, &entity->cb, > drm_sched_entity_wakeup)) > return true; > @@ -461,10 +466,8 @@ struct drm_sched_job *drm_sched_entity_pop_job(struct drm_sched_entity *entity) > > while ((entity->dependency = > drm_sched_job_dependency(sched_job, entity))) { > - if (drm_sched_entity_add_dependency_cb(entity)) { > - trace_drm_sched_job_unschedulable(sched_job, entity->dependency); > + if (drm_sched_entity_add_dependency_cb(entity, sched_job)) > return NULL; > - } > } > > /* skip jobs from entity that marked guilty */
On 02/09/2025 08:27, Philipp Stanner wrote: > On Mon, 2025-09-01 at 14:40 +0200, Pierre-Eric Pelloux-Prayer wrote: >> The drm_sched_job_unschedulable trace point can access >> entity->dependency after it was cleared by the callback >> installed in drm_sched_entity_add_dependency_cb, causing: >> >> BUG: kernel NULL pointer dereference, address: 0000000000000020 >> [...] >> Workqueue: comp_1.1.0 drm_sched_run_job_work [gpu_sched] >> RIP: 0010:trace_event_raw_event_drm_sched_job_unschedulable+0x70/0xd0 [gpu_sched] >> >> To fix this we either need to keep a reference to the fence before >> setting up the callbacks, or move the trace_drm_sched_job_unschedulable >> calls into drm_sched_entity_add_dependency_cb where they can be >> done earlier. >> >> Fixes: 76d97c870f29 ("drm/sched: Trace dependencies for GPU jobs") >> >> Signed-off-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> >> Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com> > > Applied to drm-misc-next Shouldn't it have been drm-misc-fixes? Regards, Tvrtko >> --- >> drivers/gpu/drm/scheduler/sched_entity.c | 11 +++++++---- >> 1 file changed, 7 insertions(+), 4 deletions(-) >> >> diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c >> index 8867b95ab089..3d06f72531ba 100644 >> --- a/drivers/gpu/drm/scheduler/sched_entity.c >> +++ b/drivers/gpu/drm/scheduler/sched_entity.c >> @@ -391,7 +391,8 @@ EXPORT_SYMBOL(drm_sched_entity_set_priority); >> * Add a callback to the current dependency of the entity to wake up the >> * scheduler when the entity becomes available. >> */ >> -static bool drm_sched_entity_add_dependency_cb(struct drm_sched_entity *entity) >> +static bool drm_sched_entity_add_dependency_cb(struct drm_sched_entity *entity, >> + struct drm_sched_job *sched_job) >> { >> struct drm_gpu_scheduler *sched = entity->rq->sched; >> struct dma_fence *fence = entity->dependency; >> @@ -421,6 +422,10 @@ static bool drm_sched_entity_add_dependency_cb(struct drm_sched_entity *entity) >> entity->dependency = fence; >> } >> >> + if (trace_drm_sched_job_unschedulable_enabled() && >> + !test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &entity->dependency->flags)) >> + trace_drm_sched_job_unschedulable(sched_job, entity->dependency); >> + >> if (!dma_fence_add_callback(entity->dependency, &entity->cb, >> drm_sched_entity_wakeup)) >> return true; >> @@ -461,10 +466,8 @@ struct drm_sched_job *drm_sched_entity_pop_job(struct drm_sched_entity *entity) >> >> while ((entity->dependency = >> drm_sched_job_dependency(sched_job, entity))) { >> - if (drm_sched_entity_add_dependency_cb(entity)) { >> - trace_drm_sched_job_unschedulable(sched_job, entity->dependency); >> + if (drm_sched_entity_add_dependency_cb(entity, sched_job)) >> return NULL; >> - } >> } >> >> /* skip jobs from entity that marked guilty */ >
On Tue, 2025-09-02 at 08:59 +0100, Tvrtko Ursulin wrote: > > On 02/09/2025 08:27, Philipp Stanner wrote: > > On Mon, 2025-09-01 at 14:40 +0200, Pierre-Eric Pelloux-Prayer wrote: > > > The drm_sched_job_unschedulable trace point can access > > > entity->dependency after it was cleared by the callback > > > installed in drm_sched_entity_add_dependency_cb, causing: > > > > > > BUG: kernel NULL pointer dereference, address: 0000000000000020 > > > [...] > > > Workqueue: comp_1.1.0 drm_sched_run_job_work [gpu_sched] > > > RIP: 0010:trace_event_raw_event_drm_sched_job_unschedulable+0x70/0xd0 [gpu_sched] > > > > > > To fix this we either need to keep a reference to the fence before > > > setting up the callbacks, or move the trace_drm_sched_job_unschedulable > > > calls into drm_sched_entity_add_dependency_cb where they can be > > > done earlier. > > > > > > Fixes: 76d97c870f29 ("drm/sched: Trace dependencies for GPU jobs") > > > > > > Signed-off-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> > > > Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com> > > > > Applied to drm-misc-next > > Shouldn't it have been drm-misc-fixes? I considered that, but thought not: the fixed commit is only in this rc (v6.17), and in this case the committer guidelines say it should go to misc-next: https://drm.pages.freedesktop.org/maintainer-tools/committer/committer-drm-misc.html Same reason we don't need to +Cc stable. But correct me if I made a mistake. P. > > Regards, > > Tvrtko > > > > --- > > > drivers/gpu/drm/scheduler/sched_entity.c | 11 +++++++---- > > > 1 file changed, 7 insertions(+), 4 deletions(-) > > > > > > diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c > > > index 8867b95ab089..3d06f72531ba 100644 > > > --- a/drivers/gpu/drm/scheduler/sched_entity.c > > > +++ b/drivers/gpu/drm/scheduler/sched_entity.c > > > @@ -391,7 +391,8 @@ EXPORT_SYMBOL(drm_sched_entity_set_priority); > > > * Add a callback to the current dependency of the entity to wake up the > > > * scheduler when the entity becomes available. > > > */ > > > -static bool drm_sched_entity_add_dependency_cb(struct drm_sched_entity *entity) > > > +static bool drm_sched_entity_add_dependency_cb(struct drm_sched_entity *entity, > > > + struct drm_sched_job *sched_job) > > > { > > > struct drm_gpu_scheduler *sched = entity->rq->sched; > > > struct dma_fence *fence = entity->dependency; > > > @@ -421,6 +422,10 @@ static bool drm_sched_entity_add_dependency_cb(struct drm_sched_entity *entity) > > > entity->dependency = fence; > > > } > > > > > > + if (trace_drm_sched_job_unschedulable_enabled() && > > > + !test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &entity->dependency->flags)) > > > + trace_drm_sched_job_unschedulable(sched_job, entity->dependency); > > > + > > > if (!dma_fence_add_callback(entity->dependency, &entity->cb, > > > drm_sched_entity_wakeup)) > > > return true; > > > @@ -461,10 +466,8 @@ struct drm_sched_job *drm_sched_entity_pop_job(struct drm_sched_entity *entity) > > > > > > while ((entity->dependency = > > > drm_sched_job_dependency(sched_job, entity))) { > > > - if (drm_sched_entity_add_dependency_cb(entity)) { > > > - trace_drm_sched_job_unschedulable(sched_job, entity->dependency); > > > + if (drm_sched_entity_add_dependency_cb(entity, sched_job)) > > > return NULL; > > > - } > > > } > > > > > > /* skip jobs from entity that marked guilty */ > > >
On Tue, 2025-09-02 at 10:18 +0200, Philipp Stanner wrote: > On Tue, 2025-09-02 at 08:59 +0100, Tvrtko Ursulin wrote: > > > > On 02/09/2025 08:27, Philipp Stanner wrote: > > > On Mon, 2025-09-01 at 14:40 +0200, Pierre-Eric Pelloux-Prayer wrote: > > > > The drm_sched_job_unschedulable trace point can access > > > > entity->dependency after it was cleared by the callback > > > > installed in drm_sched_entity_add_dependency_cb, causing: > > > > > > > > BUG: kernel NULL pointer dereference, address: 0000000000000020 > > > > [...] > > > > Workqueue: comp_1.1.0 drm_sched_run_job_work [gpu_sched] > > > > RIP: 0010:trace_event_raw_event_drm_sched_job_unschedulable+0x70/0xd0 [gpu_sched] > > > > > > > > To fix this we either need to keep a reference to the fence before > > > > setting up the callbacks, or move the trace_drm_sched_job_unschedulable > > > > calls into drm_sched_entity_add_dependency_cb where they can be > > > > done earlier. > > > > > > > > Fixes: 76d97c870f29 ("drm/sched: Trace dependencies for GPU jobs") > > > > > > > > Signed-off-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> > > > > Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com> > > > > > > Applied to drm-misc-next > > > > Shouldn't it have been drm-misc-fixes? > > I considered that, but thought not: the fixed commit is only in this rc > (v6.17), and in this case the committer guidelines say it should go to > misc-next: > > https://drm.pages.freedesktop.org/maintainer-tools/committer/committer-drm-misc.html OK, wait, the opposite is the case. I slipped to the wrong branch in the diagram. Oh dear. My coffee machine broke a few days ago, and it shows. My bad, sorry. Let me ping the drm maintainers to sort that one out. P. > > Same reason we don't need to +Cc stable. > > But correct me if I made a mistake. > > P. > > > > > > Regards, > > > > Tvrtko > > > > > > --- > > > > drivers/gpu/drm/scheduler/sched_entity.c | 11 +++++++---- > > > > 1 file changed, 7 insertions(+), 4 deletions(-) > > > > > > > > diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c > > > > index 8867b95ab089..3d06f72531ba 100644 > > > > --- a/drivers/gpu/drm/scheduler/sched_entity.c > > > > +++ b/drivers/gpu/drm/scheduler/sched_entity.c > > > > @@ -391,7 +391,8 @@ EXPORT_SYMBOL(drm_sched_entity_set_priority); > > > > * Add a callback to the current dependency of the entity to wake up the > > > > * scheduler when the entity becomes available. > > > > */ > > > > -static bool drm_sched_entity_add_dependency_cb(struct drm_sched_entity *entity) > > > > +static bool drm_sched_entity_add_dependency_cb(struct drm_sched_entity *entity, > > > > + struct drm_sched_job *sched_job) > > > > { > > > > struct drm_gpu_scheduler *sched = entity->rq->sched; > > > > struct dma_fence *fence = entity->dependency; > > > > @@ -421,6 +422,10 @@ static bool drm_sched_entity_add_dependency_cb(struct drm_sched_entity *entity) > > > > entity->dependency = fence; > > > > } > > > > > > > > + if (trace_drm_sched_job_unschedulable_enabled() && > > > > + !test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &entity->dependency->flags)) > > > > + trace_drm_sched_job_unschedulable(sched_job, entity->dependency); > > > > + > > > > if (!dma_fence_add_callback(entity->dependency, &entity->cb, > > > > drm_sched_entity_wakeup)) > > > > return true; > > > > @@ -461,10 +466,8 @@ struct drm_sched_job *drm_sched_entity_pop_job(struct drm_sched_entity *entity) > > > > > > > > while ((entity->dependency = > > > > drm_sched_job_dependency(sched_job, entity))) { > > > > - if (drm_sched_entity_add_dependency_cb(entity)) { > > > > - trace_drm_sched_job_unschedulable(sched_job, entity->dependency); > > > > + if (drm_sched_entity_add_dependency_cb(entity, sched_job)) > > > > return NULL; > > > > - } > > > > } > > > > > > > > /* skip jobs from entity that marked guilty */ > > > > > >
On Tue, 2025-09-02 at 10:22 +0200, Philipp Stanner wrote: > On Tue, 2025-09-02 at 10:18 +0200, Philipp Stanner wrote: > > On Tue, 2025-09-02 at 08:59 +0100, Tvrtko Ursulin wrote: > > > > > > On 02/09/2025 08:27, Philipp Stanner wrote: > > > > On Mon, 2025-09-01 at 14:40 +0200, Pierre-Eric Pelloux-Prayer wrote: > > > > > The drm_sched_job_unschedulable trace point can access > > > > > entity->dependency after it was cleared by the callback > > > > > installed in drm_sched_entity_add_dependency_cb, causing: > > > > > > > > > > BUG: kernel NULL pointer dereference, address: 0000000000000020 > > > > > [...] > > > > > Workqueue: comp_1.1.0 drm_sched_run_job_work [gpu_sched] > > > > > RIP: 0010:trace_event_raw_event_drm_sched_job_unschedulable+0x70/0xd0 [gpu_sched] > > > > > > > > > > To fix this we either need to keep a reference to the fence before > > > > > setting up the callbacks, or move the trace_drm_sched_job_unschedulable > > > > > calls into drm_sched_entity_add_dependency_cb where they can be > > > > > done earlier. > > > > > > > > > > Fixes: 76d97c870f29 ("drm/sched: Trace dependencies for GPU jobs") > > > > > > > > > > Signed-off-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> > > > > > Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com> > > > > > > > > Applied to drm-misc-next > > > > > > Shouldn't it have been drm-misc-fixes? > > > > I considered that, but thought not: the fixed commit is only in this rc > > (v6.17), and in this case the committer guidelines say it should go to > > misc-next: > > > > https://drm.pages.freedesktop.org/maintainer-tools/committer/committer-drm-misc.html > > OK, wait, the opposite is the case. I slipped to the wrong branch in > the diagram. > > Oh dear. My coffee machine broke a few days ago, and it shows. > > My bad, sorry. Let me ping the drm maintainers to sort that one out. Maxime was so kind to fix this for me. Merci, Maxime, and sorry guys. Is now on drm-misc-fixes and we're good. P. > > P. > > > > > > > Same reason we don't need to +Cc stable. > > > > But correct me if I made a mistake. > > > > P. > > > > > > > > > > Regards, > > > > > > Tvrtko > > > > > > > > --- > > > > > drivers/gpu/drm/scheduler/sched_entity.c | 11 +++++++---- > > > > > 1 file changed, 7 insertions(+), 4 deletions(-) > > > > > > > > > > diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c > > > > > index 8867b95ab089..3d06f72531ba 100644 > > > > > --- a/drivers/gpu/drm/scheduler/sched_entity.c > > > > > +++ b/drivers/gpu/drm/scheduler/sched_entity.c > > > > > @@ -391,7 +391,8 @@ EXPORT_SYMBOL(drm_sched_entity_set_priority); > > > > > * Add a callback to the current dependency of the entity to wake up the > > > > > * scheduler when the entity becomes available. > > > > > */ > > > > > -static bool drm_sched_entity_add_dependency_cb(struct drm_sched_entity *entity) > > > > > +static bool drm_sched_entity_add_dependency_cb(struct drm_sched_entity *entity, > > > > > + struct drm_sched_job *sched_job) > > > > > { > > > > > struct drm_gpu_scheduler *sched = entity->rq->sched; > > > > > struct dma_fence *fence = entity->dependency; > > > > > @@ -421,6 +422,10 @@ static bool drm_sched_entity_add_dependency_cb(struct drm_sched_entity *entity) > > > > > entity->dependency = fence; > > > > > } > > > > > > > > > > + if (trace_drm_sched_job_unschedulable_enabled() && > > > > > + !test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &entity->dependency->flags)) > > > > > + trace_drm_sched_job_unschedulable(sched_job, entity->dependency); > > > > > + > > > > > if (!dma_fence_add_callback(entity->dependency, &entity->cb, > > > > > drm_sched_entity_wakeup)) > > > > > return true; > > > > > @@ -461,10 +466,8 @@ struct drm_sched_job *drm_sched_entity_pop_job(struct drm_sched_entity *entity) > > > > > > > > > > while ((entity->dependency = > > > > > drm_sched_job_dependency(sched_job, entity))) { > > > > > - if (drm_sched_entity_add_dependency_cb(entity)) { > > > > > - trace_drm_sched_job_unschedulable(sched_job, entity->dependency); > > > > > + if (drm_sched_entity_add_dependency_cb(entity, sched_job)) > > > > > return NULL; > > > > > - } > > > > > } > > > > > > > > > > /* skip jobs from entity that marked guilty */ > > > > > > > > > >
© 2016 - 2025 Red Hat, Inc.