[PATCH v4 03/10] drm/panfrost: Handle job HW submit errors

Adrián Larumbe posted 10 patches 4 months, 1 week ago
There is a newer version of this series
[PATCH v4 03/10] drm/panfrost: Handle job HW submit errors
Posted by Adrián Larumbe 4 months, 1 week ago
Avoid waiting for the DRM scheduler job timedout handler, and instead, let
the DRM scheduler core signal the error fence immediately when HW job
submission fails.

That means we must also decrement the runtime-PM refcnt for the device,
because the job will never be enqueued or inflight.

Reviewed-by: Boris Brezillon <boris.brezillon@collabora.com>
Signed-off-by: Adrián Larumbe <adrian.larumbe@collabora.com>
---
 drivers/gpu/drm/panfrost/panfrost_job.c | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c b/drivers/gpu/drm/panfrost/panfrost_job.c
index a0123d0a1b7d..3f60adc9b69d 100644
--- a/drivers/gpu/drm/panfrost/panfrost_job.c
+++ b/drivers/gpu/drm/panfrost/panfrost_job.c
@@ -196,7 +196,7 @@ panfrost_enqueue_job(struct panfrost_device *pfdev, int slot,
 	return 1;
 }
 
-static void panfrost_job_hw_submit(struct panfrost_job *job, int js)
+static int panfrost_job_hw_submit(struct panfrost_job *job, int js)
 {
 	struct panfrost_device *pfdev = job->pfdev;
 	unsigned int subslot;
@@ -208,10 +208,11 @@ static void panfrost_job_hw_submit(struct panfrost_job *job, int js)
 
 	ret = pm_runtime_get_sync(pfdev->base.dev);
 	if (ret < 0)
-		return;
+		goto err_hwsubmit;
 
 	if (WARN_ON(job_read(pfdev, JS_COMMAND_NEXT(js)))) {
-		return;
+		ret = -EINVAL;
+		goto err_hwsubmit;
 	}
 
 	cfg = panfrost_mmu_as_get(pfdev, job->mmu);
@@ -262,6 +263,12 @@ static void panfrost_job_hw_submit(struct panfrost_job *job, int js)
 			job, js, subslot, jc_head, cfg & 0xf);
 	}
 	spin_unlock(&pfdev->js->job_lock);
+
+	return 0;
+
+err_hwsubmit:
+	pm_runtime_put_autosuspend(pfdev->base.dev);
+	return ret;
 }
 
 static int panfrost_acquire_object_fences(struct drm_gem_object **bos,
@@ -384,6 +391,7 @@ static struct dma_fence *panfrost_job_run(struct drm_sched_job *sched_job)
 	struct panfrost_device *pfdev = job->pfdev;
 	int slot = panfrost_job_get_slot(job);
 	struct dma_fence *fence = NULL;
+	int ret;
 
 	if (job->ctx->destroyed)
 		return ERR_PTR(-ECANCELED);
@@ -405,7 +413,11 @@ static struct dma_fence *panfrost_job_run(struct drm_sched_job *sched_job)
 		dma_fence_put(job->done_fence);
 	job->done_fence = dma_fence_get(fence);
 
-	panfrost_job_hw_submit(job, slot);
+	ret = panfrost_job_hw_submit(job, slot);
+	if (ret) {
+		dma_fence_put(fence);
+		return ERR_PTR(ret);
+	}
 
 	return fence;
 }
-- 
2.51.0

Re: [PATCH v4 03/10] drm/panfrost: Handle job HW submit errors
Posted by Steven Price 4 months ago
On 01/10/2025 03:20, Adrián Larumbe wrote:
> Avoid waiting for the DRM scheduler job timedout handler, and instead, let
> the DRM scheduler core signal the error fence immediately when HW job
> submission fails.
> 
> That means we must also decrement the runtime-PM refcnt for the device,
> because the job will never be enqueued or inflight.
> 
> Reviewed-by: Boris Brezillon <boris.brezillon@collabora.com>
> Signed-off-by: Adrián Larumbe <adrian.larumbe@collabora.com>
> ---
>  drivers/gpu/drm/panfrost/panfrost_job.c | 20 ++++++++++++++++----
>  1 file changed, 16 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c b/drivers/gpu/drm/panfrost/panfrost_job.c
> index a0123d0a1b7d..3f60adc9b69d 100644
> --- a/drivers/gpu/drm/panfrost/panfrost_job.c
> +++ b/drivers/gpu/drm/panfrost/panfrost_job.c
> @@ -196,7 +196,7 @@ panfrost_enqueue_job(struct panfrost_device *pfdev, int slot,
>  	return 1;
>  }
>  
> -static void panfrost_job_hw_submit(struct panfrost_job *job, int js)
> +static int panfrost_job_hw_submit(struct panfrost_job *job, int js)
>  {
>  	struct panfrost_device *pfdev = job->pfdev;
>  	unsigned int subslot;
> @@ -208,10 +208,11 @@ static void panfrost_job_hw_submit(struct panfrost_job *job, int js)
>  
>  	ret = pm_runtime_get_sync(pfdev->base.dev);
>  	if (ret < 0)
> -		return;
> +		goto err_hwsubmit;
>  
>  	if (WARN_ON(job_read(pfdev, JS_COMMAND_NEXT(js)))) {
> -		return;
> +		ret = -EINVAL;
> +		goto err_hwsubmit;
>  	}
>  
>  	cfg = panfrost_mmu_as_get(pfdev, job->mmu);
> @@ -262,6 +263,12 @@ static void panfrost_job_hw_submit(struct panfrost_job *job, int js)
>  			job, js, subslot, jc_head, cfg & 0xf);
>  	}
>  	spin_unlock(&pfdev->js->job_lock);
> +
> +	return 0;
> +
> +err_hwsubmit:
> +	pm_runtime_put_autosuspend(pfdev->base.dev);

I think you're missing something here. You've put a call to
pm_runtime_put_autosuspend() here which matches the call to
pm_runtime_get_sync() that we do earlier in the function. But there's no
corresponding panfrost_devfreq_record_idle() (but the first thing this
function does is panfrost_devfreq_record_busy()).

So unless I'm missing something (very possible) then this is going to
mess up the devfreq accounting. A simple fix would be just to move the
panfrost_devfreq_record_busy() call down in the function.

Thanks,
Steve

> +	return ret;
>  }
>  
>  static int panfrost_acquire_object_fences(struct drm_gem_object **bos,
> @@ -384,6 +391,7 @@ static struct dma_fence *panfrost_job_run(struct drm_sched_job *sched_job)
>  	struct panfrost_device *pfdev = job->pfdev;
>  	int slot = panfrost_job_get_slot(job);
>  	struct dma_fence *fence = NULL;
> +	int ret;
>  
>  	if (job->ctx->destroyed)
>  		return ERR_PTR(-ECANCELED);
> @@ -405,7 +413,11 @@ static struct dma_fence *panfrost_job_run(struct drm_sched_job *sched_job)
>  		dma_fence_put(job->done_fence);
>  	job->done_fence = dma_fence_get(fence);
>  
> -	panfrost_job_hw_submit(job, slot);
> +	ret = panfrost_job_hw_submit(job, slot);
> +	if (ret) {
> +		dma_fence_put(fence);
> +		return ERR_PTR(ret);
> +	}
>  
>  	return fence;
>  }

Re: [PATCH v4 03/10] drm/panfrost: Handle job HW submit errors
Posted by Adrián Larumbe 4 months ago
On 06.10.2025 17:07, Steven Price wrote:
> On 01/10/2025 03:20, Adrián Larumbe wrote:
> > Avoid waiting for the DRM scheduler job timedout handler, and instead, let
> > the DRM scheduler core signal the error fence immediately when HW job
> > submission fails.
> >
> > That means we must also decrement the runtime-PM refcnt for the device,
> > because the job will never be enqueued or inflight.
> >
> > Reviewed-by: Boris Brezillon <boris.brezillon@collabora.com>
> > Signed-off-by: Adrián Larumbe <adrian.larumbe@collabora.com>
> > ---
> >  drivers/gpu/drm/panfrost/panfrost_job.c | 20 ++++++++++++++++----
> >  1 file changed, 16 insertions(+), 4 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c b/drivers/gpu/drm/panfrost/panfrost_job.c
> > index a0123d0a1b7d..3f60adc9b69d 100644
> > --- a/drivers/gpu/drm/panfrost/panfrost_job.c
> > +++ b/drivers/gpu/drm/panfrost/panfrost_job.c
> > @@ -196,7 +196,7 @@ panfrost_enqueue_job(struct panfrost_device *pfdev, int slot,
> >  	return 1;
> >  }
> >
> > -static void panfrost_job_hw_submit(struct panfrost_job *job, int js)
> > +static int panfrost_job_hw_submit(struct panfrost_job *job, int js)
> >  {
> >  	struct panfrost_device *pfdev = job->pfdev;
> >  	unsigned int subslot;
> > @@ -208,10 +208,11 @@ static void panfrost_job_hw_submit(struct panfrost_job *job, int js)
> >
> >  	ret = pm_runtime_get_sync(pfdev->base.dev);
> >  	if (ret < 0)
> > -		return;
> > +		goto err_hwsubmit;
> >
> >  	if (WARN_ON(job_read(pfdev, JS_COMMAND_NEXT(js)))) {
> > -		return;
> > +		ret = -EINVAL;
> > +		goto err_hwsubmit;
> >  	}
> >
> >  	cfg = panfrost_mmu_as_get(pfdev, job->mmu);
> > @@ -262,6 +263,12 @@ static void panfrost_job_hw_submit(struct panfrost_job *job, int js)
> >  			job, js, subslot, jc_head, cfg & 0xf);
> >  	}
> >  	spin_unlock(&pfdev->js->job_lock);
> > +
> > +	return 0;
> > +
> > +err_hwsubmit:
> > +	pm_runtime_put_autosuspend(pfdev->base.dev);
>
> I think you're missing something here. You've put a call to
> pm_runtime_put_autosuspend() here which matches the call to
> pm_runtime_get_sync() that we do earlier in the function. But there's no
> corresponding panfrost_devfreq_record_idle() (but the first thing this
> function does is panfrost_devfreq_record_busy()).
>
> So unless I'm missing something (very possible) then this is going to
> mess up the devfreq accounting. A simple fix would be just to move the
> panfrost_devfreq_record_busy() call down in the function.

You didn't miss anything, I completely forgot to keep the devfreq busy
count balanced after this change.

I've moved panfrost_devfreq_record_busy() right after the point the function
can no longer result in an error, as you suggested.

> Thanks,
> Steve
>
> > +	return ret;
> >  }
> >
> >  static int panfrost_acquire_object_fences(struct drm_gem_object **bos,
> > @@ -384,6 +391,7 @@ static struct dma_fence *panfrost_job_run(struct drm_sched_job *sched_job)
> >  	struct panfrost_device *pfdev = job->pfdev;
> >  	int slot = panfrost_job_get_slot(job);
> >  	struct dma_fence *fence = NULL;
> > +	int ret;
> >
> >  	if (job->ctx->destroyed)
> >  		return ERR_PTR(-ECANCELED);
> > @@ -405,7 +413,11 @@ static struct dma_fence *panfrost_job_run(struct drm_sched_job *sched_job)
> >  		dma_fence_put(job->done_fence);
> >  	job->done_fence = dma_fence_get(fence);
> >
> > -	panfrost_job_hw_submit(job, slot);
> > +	ret = panfrost_job_hw_submit(job, slot);
> > +	if (ret) {
> > +		dma_fence_put(fence);
> > +		return ERR_PTR(ret);
> > +	}
> >
> >  	return fence;
> >  }

Adrian Larumbe