Instead of waiting for the DRM scheduler's job timeout handler
(timedout_job), let the DRM scheduler core signal the error fence
immediately when HW job submission fails.
That means we must also decrement the device's runtime-PM reference count,
because the job will never be enqueued or in flight.
Signed-off-by: Adrián Larumbe <adrian.larumbe@collabora.com>
---
drivers/gpu/drm/panfrost/panfrost_job.c | 20 ++++++++++++++++----
1 file changed, 16 insertions(+), 4 deletions(-)
diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c b/drivers/gpu/drm/panfrost/panfrost_job.c
index f640d211cc3a..3f4f0682d69d 100644
--- a/drivers/gpu/drm/panfrost/panfrost_job.c
+++ b/drivers/gpu/drm/panfrost/panfrost_job.c
@@ -195,7 +195,7 @@ panfrost_enqueue_job(struct panfrost_device *pfdev, int slot,
return 1;
}
-static void panfrost_job_hw_submit(struct panfrost_job *job, int js)
+static int panfrost_job_hw_submit(struct panfrost_job *job, int js)
{
struct panfrost_device *pfdev = job->pfdev;
unsigned int subslot;
@@ -207,10 +207,11 @@ static void panfrost_job_hw_submit(struct panfrost_job *job, int js)
ret = pm_runtime_get_sync(pfdev->base.dev);
if (ret < 0)
- return;
+ goto err_hwsubmit;
if (WARN_ON(job_read(pfdev, JS_COMMAND_NEXT(js)))) {
- return;
+ ret = -EINVAL;
+ goto err_hwsubmit;
}
cfg = panfrost_mmu_as_get(pfdev, job->mmu);
@@ -261,6 +262,12 @@ static void panfrost_job_hw_submit(struct panfrost_job *job, int js)
job, js, subslot, jc_head, cfg & 0xf);
}
spin_unlock(&pfdev->js->job_lock);
+
+ return 0;
+
+err_hwsubmit:
+ pm_runtime_put_autosuspend(pfdev->base.dev);
+ return ret;
}
static int panfrost_acquire_object_fences(struct drm_gem_object **bos,
@@ -382,6 +389,7 @@ static struct dma_fence *panfrost_job_run(struct drm_sched_job *sched_job)
struct panfrost_device *pfdev = job->pfdev;
int slot = panfrost_job_get_slot(job);
struct dma_fence *fence = NULL;
+ int ret;
if (unlikely(job->base.s_fence->finished.error))
return NULL;
@@ -400,7 +408,11 @@ static struct dma_fence *panfrost_job_run(struct drm_sched_job *sched_job)
dma_fence_put(job->done_fence);
job->done_fence = dma_fence_get(fence);
- panfrost_job_hw_submit(job, slot);
+ ret = panfrost_job_hw_submit(job, slot);
+ if (ret) {
+ dma_fence_put(job->done_fence);
+ return ERR_PTR(ret);
+ }
return fence;
}
--
2.47.0
On Thu, 28 Nov 2024 21:06:18 +0000
Adrián Larumbe <adrian.larumbe@collabora.com> wrote:
> Avoid waiting for the DRM scheduler job timedout handler, and instead, let
> the DRM scheduler core signal the error fence immediately when HW job
> submission fails.
>
> That means we must also decrement the runtime-PM refcnt for the device,
> because the job will never be enqueued or inflight.
>
> Signed-off-by: Adrián Larumbe <adrian.larumbe@collabora.com>
> ---
> drivers/gpu/drm/panfrost/panfrost_job.c | 20 ++++++++++++++++----
> 1 file changed, 16 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c b/drivers/gpu/drm/panfrost/panfrost_job.c
> index f640d211cc3a..3f4f0682d69d 100644
> --- a/drivers/gpu/drm/panfrost/panfrost_job.c
> +++ b/drivers/gpu/drm/panfrost/panfrost_job.c
> @@ -195,7 +195,7 @@ panfrost_enqueue_job(struct panfrost_device *pfdev, int slot,
> return 1;
> }
>
> -static void panfrost_job_hw_submit(struct panfrost_job *job, int js)
> +static int panfrost_job_hw_submit(struct panfrost_job *job, int js)
> {
> struct panfrost_device *pfdev = job->pfdev;
> unsigned int subslot;
> @@ -207,10 +207,11 @@ static void panfrost_job_hw_submit(struct panfrost_job *job, int js)
>
> ret = pm_runtime_get_sync(pfdev->base.dev);
> if (ret < 0)
> - return;
> + goto err_hwsubmit;
>
> if (WARN_ON(job_read(pfdev, JS_COMMAND_NEXT(js)))) {
> - return;
> + ret = -EINVAL;
> + goto err_hwsubmit;
> }
>
> cfg = panfrost_mmu_as_get(pfdev, job->mmu);
> @@ -261,6 +262,12 @@ static void panfrost_job_hw_submit(struct panfrost_job *job, int js)
> job, js, subslot, jc_head, cfg & 0xf);
> }
> spin_unlock(&pfdev->js->job_lock);
> +
> + return 0;
> +
> +err_hwsubmit:
> + pm_runtime_put_autosuspend(pfdev->base.dev);
> + return ret;
> }
>
> static int panfrost_acquire_object_fences(struct drm_gem_object **bos,
> @@ -382,6 +389,7 @@ static struct dma_fence *panfrost_job_run(struct drm_sched_job *sched_job)
> struct panfrost_device *pfdev = job->pfdev;
> int slot = panfrost_job_get_slot(job);
> struct dma_fence *fence = NULL;
> + int ret;
>
> if (unlikely(job->base.s_fence->finished.error))
> return NULL;
> @@ -400,7 +408,11 @@ static struct dma_fence *panfrost_job_run(struct drm_sched_job *sched_job)
> dma_fence_put(job->done_fence);
> job->done_fence = dma_fence_get(fence);
>
> - panfrost_job_hw_submit(job, slot);
> + ret = panfrost_job_hw_submit(job, slot);
> + if (ret) {
> + dma_fence_put(job->done_fence);
If you call dma_fence_put() here, you need to set job->done_fence to
NULL, otherwise dma_fence_put() will be called again on an already
freed fence in panfrost_job_cleanup(). Question is, do we really need
to call dma_fence_put(job->done_fence) here? Can't we let the job
destructor take care of that?
> + return ERR_PTR(ret);
> + }
>
> return fence;
> }
On Mon, 2 Dec 2024 10:21:51 +0100
Boris Brezillon <boris.brezillon@collabora.com> wrote:
> On Thu, 28 Nov 2024 21:06:18 +0000
> Adrián Larumbe <adrian.larumbe@collabora.com> wrote:
>
> > Avoid waiting for the DRM scheduler job timedout handler, and instead, let
> > the DRM scheduler core signal the error fence immediately when HW job
> > submission fails.
> >
> > That means we must also decrement the runtime-PM refcnt for the device,
> > because the job will never be enqueued or inflight.
> >
> > Signed-off-by: Adrián Larumbe <adrian.larumbe@collabora.com>
> > ---
> > drivers/gpu/drm/panfrost/panfrost_job.c | 20 ++++++++++++++++----
> > 1 file changed, 16 insertions(+), 4 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c b/drivers/gpu/drm/panfrost/panfrost_job.c
> > index f640d211cc3a..3f4f0682d69d 100644
> > --- a/drivers/gpu/drm/panfrost/panfrost_job.c
> > +++ b/drivers/gpu/drm/panfrost/panfrost_job.c
> > @@ -195,7 +195,7 @@ panfrost_enqueue_job(struct panfrost_device *pfdev, int slot,
> > return 1;
> > }
> >
> > -static void panfrost_job_hw_submit(struct panfrost_job *job, int js)
> > +static int panfrost_job_hw_submit(struct panfrost_job *job, int js)
> > {
> > struct panfrost_device *pfdev = job->pfdev;
> > unsigned int subslot;
> > @@ -207,10 +207,11 @@ static void panfrost_job_hw_submit(struct panfrost_job *job, int js)
> >
> > ret = pm_runtime_get_sync(pfdev->base.dev);
> > if (ret < 0)
> > - return;
> > + goto err_hwsubmit;
> >
> > if (WARN_ON(job_read(pfdev, JS_COMMAND_NEXT(js)))) {
> > - return;
> > + ret = -EINVAL;
> > + goto err_hwsubmit;
> > }
> >
> > cfg = panfrost_mmu_as_get(pfdev, job->mmu);
> > @@ -261,6 +262,12 @@ static void panfrost_job_hw_submit(struct panfrost_job *job, int js)
> > job, js, subslot, jc_head, cfg & 0xf);
> > }
> > spin_unlock(&pfdev->js->job_lock);
> > +
> > + return 0;
> > +
> > +err_hwsubmit:
> > + pm_runtime_put_autosuspend(pfdev->base.dev);
> > + return ret;
> > }
> >
> > static int panfrost_acquire_object_fences(struct drm_gem_object **bos,
> > @@ -382,6 +389,7 @@ static struct dma_fence *panfrost_job_run(struct drm_sched_job *sched_job)
> > struct panfrost_device *pfdev = job->pfdev;
> > int slot = panfrost_job_get_slot(job);
> > struct dma_fence *fence = NULL;
> > + int ret;
> >
> > if (unlikely(job->base.s_fence->finished.error))
> > return NULL;
> > @@ -400,7 +408,11 @@ static struct dma_fence *panfrost_job_run(struct drm_sched_job *sched_job)
> > dma_fence_put(job->done_fence);
> > job->done_fence = dma_fence_get(fence);
> >
> > - panfrost_job_hw_submit(job, slot);
> > + ret = panfrost_job_hw_submit(job, slot);
> > + if (ret) {
> > + dma_fence_put(job->done_fence);
>
> If you call dma_fence_put() here, you need to set job->done_fence to
> NULL, otherwise dma_fence_put() will be called again on an already
> freed fence in panfrost_job_cleanup(). Question is, do we really need
> to call dma_fence_put(job->done_fence) here? Can't we let the job
> destructor take care of that?
My bad, the ref you're releasing here is the one you've taken to return
to drm_sched. I'd go for `dma_fence_put(fence)` to make that clear.
With this minor change, the patch is
Reviewed-by: Boris Brezillon <boris.brezillon@collabora.com>
>
> > + return ERR_PTR(ret);
> > + }
> >
> > return fence;
> > }
>
© 2016 - 2026 Red Hat, Inc.