[PATCH 2/4] panthor: save panthor_file in panthor_group

Chia-I Wu posted 4 patches 3 months, 2 weeks ago
There is a newer version of this series
[PATCH 2/4] panthor: save panthor_file in panthor_group
Posted by Chia-I Wu 3 months, 2 weeks ago
We would like to access panthor_file from panthor_group on gpu errors.
Because panthor_group can outlive drm_file, add a refcount to
panthor_file to ensure its lifetime.

Signed-off-by: Chia-I Wu <olvaffe@gmail.com>
---
 drivers/gpu/drm/panthor/panthor_device.h | 16 ++++++++++++++++
 drivers/gpu/drm/panthor/panthor_drv.c    | 15 ++++++++++++++-
 drivers/gpu/drm/panthor/panthor_mmu.c    |  1 +
 drivers/gpu/drm/panthor/panthor_sched.c  |  6 ++++++
 4 files changed, 37 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/panthor/panthor_device.h b/drivers/gpu/drm/panthor/panthor_device.h
index 4fc7cf2aeed57..75ae6fd3a5128 100644
--- a/drivers/gpu/drm/panthor/panthor_device.h
+++ b/drivers/gpu/drm/panthor/panthor_device.h
@@ -256,8 +256,24 @@ struct panthor_file {
 
 	/** @stats: cycle and timestamp measures for job execution. */
 	struct panthor_gpu_usage stats;
+
+	/** @refcount: ref count of this file */
+	struct kref refcount;
 };
 
+static inline struct panthor_file *panthor_file_get(struct panthor_file *pfile)
+{
+	kref_get(&pfile->refcount);
+	return pfile;
+}
+
+void panthor_file_release(struct kref *kref);
+
+static inline void panthor_file_put(struct panthor_file *pfile)
+{
+	kref_put(&pfile->refcount, panthor_file_release);
+}
+
 int panthor_device_init(struct panthor_device *ptdev);
 void panthor_device_unplug(struct panthor_device *ptdev);
 
diff --git a/drivers/gpu/drm/panthor/panthor_drv.c b/drivers/gpu/drm/panthor/panthor_drv.c
index 775a66c394544..aea9609684b77 100644
--- a/drivers/gpu/drm/panthor/panthor_drv.c
+++ b/drivers/gpu/drm/panthor/panthor_drv.c
@@ -1393,6 +1393,16 @@ static int panthor_ioctl_set_user_mmio_offset(struct drm_device *ddev,
 	return 0;
 }
 
+void panthor_file_release(struct kref *kref)
+{
+	struct panthor_file *pfile =
+		container_of(kref, struct panthor_file, refcount);
+
+	WARN_ON(pfile->vms || pfile->groups);
+
+	kfree(pfile);
+}
+
 static int
 panthor_open(struct drm_device *ddev, struct drm_file *file)
 {
@@ -1426,6 +1436,8 @@ panthor_open(struct drm_device *ddev, struct drm_file *file)
 	if (ret)
 		goto err_destroy_vm_pool;
 
+	kref_init(&pfile->refcount);
+
 	file->driver_priv = pfile;
 	return 0;
 
@@ -1442,10 +1454,11 @@ panthor_postclose(struct drm_device *ddev, struct drm_file *file)
 {
 	struct panthor_file *pfile = file->driver_priv;
 
+	/* destroy vm and group handles now to avoid circular references */
 	panthor_group_pool_destroy(pfile);
 	panthor_vm_pool_destroy(pfile);
 
-	kfree(pfile);
+	panthor_file_put(pfile);
 }
 
 static const struct drm_ioctl_desc panthor_drm_driver_ioctls[] = {
diff --git a/drivers/gpu/drm/panthor/panthor_mmu.c b/drivers/gpu/drm/panthor/panthor_mmu.c
index b39ea6acc6a96..ccbcfe11420ac 100644
--- a/drivers/gpu/drm/panthor/panthor_mmu.c
+++ b/drivers/gpu/drm/panthor/panthor_mmu.c
@@ -1604,6 +1604,7 @@ void panthor_vm_pool_destroy(struct panthor_file *pfile)
 
 	xa_destroy(&pfile->vms->xa);
 	kfree(pfile->vms);
+	pfile->vms = NULL;
 }
 
 /**
diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
index a2248f692a030..485072904cd7d 100644
--- a/drivers/gpu/drm/panthor/panthor_sched.c
+++ b/drivers/gpu/drm/panthor/panthor_sched.c
@@ -535,6 +535,9 @@ struct panthor_group {
 	/** @ptdev: Device. */
 	struct panthor_device *ptdev;
 
+	/** @pfile: File this group is created from. */
+	struct panthor_file *pfile;
+
 	/** @vm: VM bound to the group. */
 	struct panthor_vm *vm;
 
@@ -919,6 +922,7 @@ static void group_release_work(struct work_struct *work)
 	panthor_kernel_bo_destroy(group->syncobjs);
 
 	panthor_vm_put(group->vm);
+	panthor_file_put(group->pfile);
 	kfree(group);
 }
 
@@ -3467,6 +3471,8 @@ int panthor_group_create(struct panthor_file *pfile,
 	INIT_WORK(&group->tiler_oom_work, group_tiler_oom_work);
 	INIT_WORK(&group->release_work, group_release_work);
 
+	group->pfile = panthor_file_get(pfile);
+
 	group->vm = panthor_vm_pool_get_vm(pfile->vms, group_args->vm_id);
 	if (!group->vm) {
 		ret = -EINVAL;
-- 
2.50.0.714.g196bf9f422-goog
Re: [PATCH 2/4] panthor: save panthor_file in panthor_group
Posted by Boris Brezillon 3 months, 2 weeks ago
On Fri, 20 Jun 2025 16:50:51 -0700
Chia-I Wu <olvaffe@gmail.com> wrote:

> We would like to access panthor_file from panthor_group on gpu errors.
> Because panthor_group can outlive drm_file, add refcount to
> panthor_file to ensure its lifetime.

I'm not a huge fan of refcounting panthor_file because people tend to
put resources in it that they expect to be released when the last handle
goes away, and if we don't refcount these sub-resources they might live
longer than they are meant to. I'm also not a huge fan of the circular
referencing that exists between file and groups after this change.

How about we move the process info to a sub-object that's refcounted
and let both panthor_file and panthor_group take a ref on this object
instead?

> 
> Signed-off-by: Chia-I Wu <olvaffe@gmail.com>
> ---
>  drivers/gpu/drm/panthor/panthor_device.h | 16 ++++++++++++++++
>  drivers/gpu/drm/panthor/panthor_drv.c    | 15 ++++++++++++++-
>  drivers/gpu/drm/panthor/panthor_mmu.c    |  1 +
>  drivers/gpu/drm/panthor/panthor_sched.c  |  6 ++++++
>  4 files changed, 37 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/panthor/panthor_device.h b/drivers/gpu/drm/panthor/panthor_device.h
> index 4fc7cf2aeed57..75ae6fd3a5128 100644
> --- a/drivers/gpu/drm/panthor/panthor_device.h
> +++ b/drivers/gpu/drm/panthor/panthor_device.h
> @@ -256,8 +256,24 @@ struct panthor_file {
>  
>  	/** @stats: cycle and timestamp measures for job execution. */
>  	struct panthor_gpu_usage stats;
> +
> +	/** @refcount: ref count of this file */
> +	struct kref refcount;
>  };
>  
> +static inline struct panthor_file *panthor_file_get(struct panthor_file *pfile)
> +{
> +	kref_get(&pfile->refcount);
> +	return pfile;
> +}
> +
> +void panthor_file_release(struct kref *kref);
> +
> +static inline void panthor_file_put(struct panthor_file *pfile)
> +{
> +	kref_put(&pfile->refcount, panthor_file_release);
> +}
> +
>  int panthor_device_init(struct panthor_device *ptdev);
>  void panthor_device_unplug(struct panthor_device *ptdev);
>  
> diff --git a/drivers/gpu/drm/panthor/panthor_drv.c b/drivers/gpu/drm/panthor/panthor_drv.c
> index 775a66c394544..aea9609684b77 100644
> --- a/drivers/gpu/drm/panthor/panthor_drv.c
> +++ b/drivers/gpu/drm/panthor/panthor_drv.c
> @@ -1393,6 +1393,16 @@ static int panthor_ioctl_set_user_mmio_offset(struct drm_device *ddev,
>  	return 0;
>  }
>  
> +void panthor_file_release(struct kref *kref)
> +{
> +	struct panthor_file *pfile =
> +		container_of(kref, struct panthor_file, refcount);
> +
> +	WARN_ON(pfile->vms || pfile->groups);
> +
> +	kfree(pfile);
> +}
> +
>  static int
>  panthor_open(struct drm_device *ddev, struct drm_file *file)
>  {
> @@ -1426,6 +1436,8 @@ panthor_open(struct drm_device *ddev, struct drm_file *file)
>  	if (ret)
>  		goto err_destroy_vm_pool;
>  
> +	kref_init(&pfile->refcount);
> +
>  	file->driver_priv = pfile;
>  	return 0;
>  
> @@ -1442,10 +1454,11 @@ panthor_postclose(struct drm_device *ddev, struct drm_file *file)
>  {
>  	struct panthor_file *pfile = file->driver_priv;
>  
> +	/* destroy vm and group handles now to avoid circular references */
>  	panthor_group_pool_destroy(pfile);
>  	panthor_vm_pool_destroy(pfile);
>  
> -	kfree(pfile);
> +	panthor_file_put(pfile);
>  }
>  
>  static const struct drm_ioctl_desc panthor_drm_driver_ioctls[] = {
> diff --git a/drivers/gpu/drm/panthor/panthor_mmu.c b/drivers/gpu/drm/panthor/panthor_mmu.c
> index b39ea6acc6a96..ccbcfe11420ac 100644
> --- a/drivers/gpu/drm/panthor/panthor_mmu.c
> +++ b/drivers/gpu/drm/panthor/panthor_mmu.c
> @@ -1604,6 +1604,7 @@ void panthor_vm_pool_destroy(struct panthor_file *pfile)
>  
>  	xa_destroy(&pfile->vms->xa);
>  	kfree(pfile->vms);
> +	pfile->vms = NULL;
>  }
>  
>  /**
> diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
> index a2248f692a030..485072904cd7d 100644
> --- a/drivers/gpu/drm/panthor/panthor_sched.c
> +++ b/drivers/gpu/drm/panthor/panthor_sched.c
> @@ -535,6 +535,9 @@ struct panthor_group {
>  	/** @ptdev: Device. */
>  	struct panthor_device *ptdev;
>  
> +	/** @pfile: File this group is created from. */
> +	struct panthor_file *pfile;
> +
>  	/** @vm: VM bound to the group. */
>  	struct panthor_vm *vm;
>  
> @@ -919,6 +922,7 @@ static void group_release_work(struct work_struct *work)
>  	panthor_kernel_bo_destroy(group->syncobjs);
>  
>  	panthor_vm_put(group->vm);
> +	panthor_file_put(group->pfile);
>  	kfree(group);
>  }
>  
> @@ -3467,6 +3471,8 @@ int panthor_group_create(struct panthor_file *pfile,
>  	INIT_WORK(&group->tiler_oom_work, group_tiler_oom_work);
>  	INIT_WORK(&group->release_work, group_release_work);
>  
> +	group->pfile = panthor_file_get(pfile);
> +
>  	group->vm = panthor_vm_pool_get_vm(pfile->vms, group_args->vm_id);
>  	if (!group->vm) {
>  		ret = -EINVAL;
Re: [PATCH 2/4] panthor: save panthor_file in panthor_group
Posted by Liviu Dudau 3 months, 2 weeks ago
On Mon, Jun 23, 2025 at 08:21:22AM +0200, Boris Brezillon wrote:
> On Fri, 20 Jun 2025 16:50:51 -0700
> Chia-I Wu <olvaffe@gmail.com> wrote:
> 
> > We would like to access panthor_file from panthor_group on gpu errors.
> > > Because panthor_group can outlive drm_file, add refcount to
> > panthor_file to ensure its lifetime.
> 
> I'm not a huge fan of refcounting panthor_file because people tend to
> put resource they expect to be released when the last handle goes away,
> and if we don't refcount these sub-resources they might live longer
> than they are meant to. Also not a huge fan of the circular referencing
> that exists between file and groups after this change.
> 
> How about we move the process info to a sub-object that's refcounted
> and let both panthor_file and panthor_group take a ref on this object
> instead?

I agree with Boris on this. One alternative is to put the pid and comm in
the panthor_group struct as panthor_file makes no use of the fields.

Best regards,
Liviu

> 
> > 
> > Signed-off-by: Chia-I Wu <olvaffe@gmail.com>
> > ---
> >  drivers/gpu/drm/panthor/panthor_device.h | 16 ++++++++++++++++
> >  drivers/gpu/drm/panthor/panthor_drv.c    | 15 ++++++++++++++-
> >  drivers/gpu/drm/panthor/panthor_mmu.c    |  1 +
> >  drivers/gpu/drm/panthor/panthor_sched.c  |  6 ++++++
> >  4 files changed, 37 insertions(+), 1 deletion(-)
> > 
> > diff --git a/drivers/gpu/drm/panthor/panthor_device.h b/drivers/gpu/drm/panthor/panthor_device.h
> > index 4fc7cf2aeed57..75ae6fd3a5128 100644
> > --- a/drivers/gpu/drm/panthor/panthor_device.h
> > +++ b/drivers/gpu/drm/panthor/panthor_device.h
> > @@ -256,8 +256,24 @@ struct panthor_file {
> >  
> >  	/** @stats: cycle and timestamp measures for job execution. */
> >  	struct panthor_gpu_usage stats;
> > +
> > +	/** @refcount: ref count of this file */
> > +	struct kref refcount;
> >  };
> >  
> > +static inline struct panthor_file *panthor_file_get(struct panthor_file *pfile)
> > +{
> > +	kref_get(&pfile->refcount);
> > +	return pfile;
> > +}
> > +
> > +void panthor_file_release(struct kref *kref);
> > +
> > +static inline void panthor_file_put(struct panthor_file *pfile)
> > +{
> > +	kref_put(&pfile->refcount, panthor_file_release);
> > +}
> > +
> >  int panthor_device_init(struct panthor_device *ptdev);
> >  void panthor_device_unplug(struct panthor_device *ptdev);
> >  
> > diff --git a/drivers/gpu/drm/panthor/panthor_drv.c b/drivers/gpu/drm/panthor/panthor_drv.c
> > index 775a66c394544..aea9609684b77 100644
> > --- a/drivers/gpu/drm/panthor/panthor_drv.c
> > +++ b/drivers/gpu/drm/panthor/panthor_drv.c
> > @@ -1393,6 +1393,16 @@ static int panthor_ioctl_set_user_mmio_offset(struct drm_device *ddev,
> >  	return 0;
> >  }
> >  
> > +void panthor_file_release(struct kref *kref)
> > +{
> > +	struct panthor_file *pfile =
> > +		container_of(kref, struct panthor_file, refcount);
> > +
> > +	WARN_ON(pfile->vms || pfile->groups);
> > +
> > +	kfree(pfile);
> > +}
> > +
> >  static int
> >  panthor_open(struct drm_device *ddev, struct drm_file *file)
> >  {
> > @@ -1426,6 +1436,8 @@ panthor_open(struct drm_device *ddev, struct drm_file *file)
> >  	if (ret)
> >  		goto err_destroy_vm_pool;
> >  
> > +	kref_init(&pfile->refcount);
> > +
> >  	file->driver_priv = pfile;
> >  	return 0;
> >  
> > @@ -1442,10 +1454,11 @@ panthor_postclose(struct drm_device *ddev, struct drm_file *file)
> >  {
> >  	struct panthor_file *pfile = file->driver_priv;
> >  
> > +	/* destroy vm and group handles now to avoid circular references */
> >  	panthor_group_pool_destroy(pfile);
> >  	panthor_vm_pool_destroy(pfile);
> >  
> > -	kfree(pfile);
> > +	panthor_file_put(pfile);
> >  }
> >  
> >  static const struct drm_ioctl_desc panthor_drm_driver_ioctls[] = {
> > diff --git a/drivers/gpu/drm/panthor/panthor_mmu.c b/drivers/gpu/drm/panthor/panthor_mmu.c
> > index b39ea6acc6a96..ccbcfe11420ac 100644
> > --- a/drivers/gpu/drm/panthor/panthor_mmu.c
> > +++ b/drivers/gpu/drm/panthor/panthor_mmu.c
> > @@ -1604,6 +1604,7 @@ void panthor_vm_pool_destroy(struct panthor_file *pfile)
> >  
> >  	xa_destroy(&pfile->vms->xa);
> >  	kfree(pfile->vms);
> > +	pfile->vms = NULL;
> >  }
> >  
> >  /**
> > diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
> > index a2248f692a030..485072904cd7d 100644
> > --- a/drivers/gpu/drm/panthor/panthor_sched.c
> > +++ b/drivers/gpu/drm/panthor/panthor_sched.c
> > @@ -535,6 +535,9 @@ struct panthor_group {
> >  	/** @ptdev: Device. */
> >  	struct panthor_device *ptdev;
> >  
> > +	/** @pfile: File this group is created from. */
> > +	struct panthor_file *pfile;
> > +
> >  	/** @vm: VM bound to the group. */
> >  	struct panthor_vm *vm;
> >  
> > @@ -919,6 +922,7 @@ static void group_release_work(struct work_struct *work)
> >  	panthor_kernel_bo_destroy(group->syncobjs);
> >  
> >  	panthor_vm_put(group->vm);
> > +	panthor_file_put(group->pfile);
> >  	kfree(group);
> >  }
> >  
> > @@ -3467,6 +3471,8 @@ int panthor_group_create(struct panthor_file *pfile,
> >  	INIT_WORK(&group->tiler_oom_work, group_tiler_oom_work);
> >  	INIT_WORK(&group->release_work, group_release_work);
> >  
> > +	group->pfile = panthor_file_get(pfile);
> > +
> >  	group->vm = panthor_vm_pool_get_vm(pfile->vms, group_args->vm_id);
> >  	if (!group->vm) {
> >  		ret = -EINVAL;
> 

-- 
====================
| I would like to |
| fix the world,  |
| but they're not |
| giving me the   |
 \ source code!  /
  ---------------
    ¯\_(ツ)_/¯
Re: [PATCH 2/4] panthor: save panthor_file in panthor_group
Posted by Chia-I Wu 2 months, 3 weeks ago
Hi,

On Mon, Jun 23, 2025 at 2:07 AM Liviu Dudau <liviu.dudau@arm.com> wrote:
>
> On Mon, Jun 23, 2025 at 08:21:22AM +0200, Boris Brezillon wrote:
> > On Fri, 20 Jun 2025 16:50:51 -0700
> > Chia-I Wu <olvaffe@gmail.com> wrote:
> >
> > > We would like to access panthor_file from panthor_group on gpu errors.
> > > > Because panthor_group can outlive drm_file, add refcount to
> > > panthor_file to ensure its lifetime.
> >
> > I'm not a huge fan of refcounting panthor_file because people tend to
> > put resource they expect to be released when the last handle goes away,
> > and if we don't refcount these sub-resources they might live longer
> > than they are meant to. Also not a huge fan of the circular referencing
> > that exists between file and groups after this change.
> >
> > How about we move the process info to a sub-object that's refcounted
> > and let both panthor_file and panthor_group take a ref on this object
> > instead?
>
> I agree with Boris on this. One alternative is to put the pid and comm in
> the panthor_group struct as panthor_file makes no use of the fields.
I took this suggestion in v2 because, when the task that opened the
node differs from the task that created the group, we are more
interested in the latter.