drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 40 ++++++++++++++++---------- 1 file changed, 25 insertions(+), 15 deletions(-)
The amdgpu_vm_get_task_info_pasid() function previously called
amdgpu_vm_get_vm_from_pasid() which returns a raw VM pointer after
releasing the pasids xarray lock. The caller then dereferences
vm->task_info without any lifetime protection.
Race condition:
  CPU 0 (lookup)                       CPU 1 (release)
  ------------------                   ------------------
  amdgpu_vm_get_task_info_pasid()
    xa_lock()
    vm = xa_load(pasids)
    xa_unlock()
                                       amdgpu_vm_fini()
                                         xa_erase_irq(pasids)
                                       // teardown continues
                                       kfree(fpriv)
                                       // VM freed (embedded in fpriv)
    vm->task_info // potential UAF
This can leave the VM pointer dangling because struct amdgpu_vm is
embedded in struct amdgpu_fpriv which is freed via kfree(fpriv) in
amdgpu_file_release_kms() after amdgpu_vm_fini() returns.
Fix this by acquiring the task_info reference while holding the
xarray lock. This avoids the window where the VM could be freed
between the lookup and the dereference.
Cache vm->task_info in a local variable before attempting to take a
reference, which keeps the lookup straightforward inside the locked
section. Use kref_get_unless_zero() to safely handle the case where
task_info's refcount is already being decremented to zero by another
thread in the teardown path.
Note: An RCU-based approach was considered but is not currently
feasible because: (1) the pasids xarray is initialized without
XA_FLAGS_RCU, and (2) struct amdgpu_fpriv is freed with kfree()
rather than kfree_rcu(). A future refactoring could enable RCU
if needed for performance.
Also remove the unsafe helper function amdgpu_vm_get_vm_from_pasid()
to prevent future misuse.
Fixes: b8f67b9ddf4f ("drm/amdgpu: change vm->task_info handling")
Signed-off-by: Fan Wu <fanwu01@zju.edu.cn>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 40 ++++++++++++++++----------
1 file changed, 25 insertions(+), 15 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index f2beb980e3c3..7e8621c9b661 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -2468,19 +2468,6 @@ static void amdgpu_vm_destroy_task_info(struct kref *kref)
kfree(ti);
}
-static inline struct amdgpu_vm *
-amdgpu_vm_get_vm_from_pasid(struct amdgpu_device *adev, u32 pasid)
-{
- struct amdgpu_vm *vm;
- unsigned long flags;
-
- xa_lock_irqsave(&adev->vm_manager.pasids, flags);
- vm = xa_load(&adev->vm_manager.pasids, pasid);
- xa_unlock_irqrestore(&adev->vm_manager.pasids, flags);
-
- return vm;
-}
-
/**
* amdgpu_vm_put_task_info - reference down the vm task_info ptr
*
@@ -2527,8 +2514,31 @@ amdgpu_vm_get_task_info_vm(struct amdgpu_vm *vm)
struct amdgpu_task_info *
amdgpu_vm_get_task_info_pasid(struct amdgpu_device *adev, u32 pasid)
{
- return amdgpu_vm_get_task_info_vm(
- amdgpu_vm_get_vm_from_pasid(adev, pasid));
+ struct amdgpu_vm *vm;
+ unsigned long flags;
+ struct amdgpu_task_info *ti = NULL;
+
+ /*
+ * Acquire the task_info reference while holding the pasids xarray
+ * lock to prevent a race with amdgpu_vm_fini() which removes the
+ * PASID mapping before freeing the VM (embedded in struct amdgpu_fpriv).
+ * Without this, the VM could be freed between xa_load() return and
+ * the task_info dereference.
+ */
+ xa_lock_irqsave(&adev->vm_manager.pasids, flags);
+ vm = xa_load(&adev->vm_manager.pasids, pasid);
+ if (vm) {
+ /*
+ * Cache vm->task_info in a local variable before
+ * attempting to take a reference.
+ */
+ ti = vm->task_info;
+ if (ti && !kref_get_unless_zero(&ti->refcount))
+ ti = NULL;
+ }
+ xa_unlock_irqrestore(&adev->vm_manager.pasids, flags);
+
+ return ti;
}
static int amdgpu_vm_create_task_info(struct amdgpu_vm *vm)
--
2.34.1
On 3/9/26 17:04, Fan Wu wrote:
> The amdgpu_vm_get_task_info_pasid() function previously called
> amdgpu_vm_get_vm_from_pasid() which returns a raw VM pointer after
> releasing the pasids xarray lock. The caller then dereferences
> vm->task_info without any lifetime protection.
>
> Race condition:
>
>   CPU 0 (lookup)                       CPU 1 (release)
>   ------------------                   ------------------
>   amdgpu_vm_get_task_info_pasid()
>     xa_lock()
>     vm = xa_load(pasids)
>     xa_unlock()
>                                        amdgpu_vm_fini()
>                                          xa_erase_irq(pasids)
>                                        // teardown continues
>                                        kfree(fpriv)
>                                        // VM freed (embedded in fpriv)
>     vm->task_info // potential UAF
>
> This can leave the VM pointer dangling because struct amdgpu_vm is
> embedded in struct amdgpu_fpriv which is freed via kfree(fpriv) in
> amdgpu_file_release_kms() after amdgpu_vm_fini() returns.
>
> Fix this by acquiring the task_info reference while holding the
> xarray lock. This avoids the window where the VM could be freed
> between the lookup and the dereference.
>
> Cache vm->task_info in a local variable before attempting to take a
> reference, which keeps the lookup straightforward inside the locked
> section. Use kref_get_unless_zero() to safely handle the case where
> task_info's refcount is already being decremented to zero by another
> thread in the teardown path.
>
> Note: An RCU-based approach was considered but is not currently
> feasible because: (1) the pasids xarray is initialized without
> XA_FLAGS_RCU, and (2) struct amdgpu_fpriv is freed with kfree()
> rather than kfree_rcu(). A future refactoring could enable RCU
> if needed for performance.
>
> Also remove the unsafe helper function amdgpu_vm_get_vm_from_pasid()
> to prevent future misuse.
>
> Fixes: b8f67b9ddf4f ("drm/amdgpu: change vm->task_info handling")
> Signed-off-by: Fan Wu <fanwu01@zju.edu.cn>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 40 ++++++++++++++++----------
> 1 file changed, 25 insertions(+), 15 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index f2beb980e3c3..7e8621c9b661 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -2468,19 +2468,6 @@ static void amdgpu_vm_destroy_task_info(struct kref *kref)
> kfree(ti);
> }
>
> -static inline struct amdgpu_vm *
> -amdgpu_vm_get_vm_from_pasid(struct amdgpu_device *adev, u32 pasid)
> -{
> - struct amdgpu_vm *vm;
> - unsigned long flags;
> -
> - xa_lock_irqsave(&adev->vm_manager.pasids, flags);
> - vm = xa_load(&adev->vm_manager.pasids, pasid);
> - xa_unlock_irqrestore(&adev->vm_manager.pasids, flags);
> -
> - return vm;
> -}
> -
> /**
> * amdgpu_vm_put_task_info - reference down the vm task_info ptr
> *
> @@ -2527,8 +2514,31 @@ amdgpu_vm_get_task_info_vm(struct amdgpu_vm *vm)
> struct amdgpu_task_info *
> amdgpu_vm_get_task_info_pasid(struct amdgpu_device *adev, u32 pasid)
> {
> - return amdgpu_vm_get_task_info_vm(
> - amdgpu_vm_get_vm_from_pasid(adev, pasid));
> + struct amdgpu_vm *vm;
> + unsigned long flags;
> + struct amdgpu_task_info *ti = NULL;
> +
> + /*
> + * Acquire the task_info reference while holding the pasids xarray
> + * lock to prevent a race with amdgpu_vm_fini() which removes the
> + * PASID mapping before freeing the VM (embedded in struct amdgpu_fpriv).
> + * Without this, the VM could be freed between xa_load() return and
> + * the task_info dereference.
That the VM is freed is irrelevant; the point is that we need to grab the reference to the task info before we drop that one.
> + */
> + xa_lock_irqsave(&adev->vm_manager.pasids, flags);
> + vm = xa_load(&adev->vm_manager.pasids, pasid);
> + if (vm) {
> + /*
> + * Cache vm->task_info in a local variable before
> + * attempting to take a reference.
> + */
Please drop that comment; taking the task info into a local variable is actually superfluous.
> + ti = vm->task_info;
> + if (ti && !kref_get_unless_zero(&ti->refcount))
That is unnecessary as well; the task info is dropped after the VM is removed from the pasid mapping.
So just using kref_get() is sufficient.
Regards,
Christian.
> + ti = NULL;
> + }
> + xa_unlock_irqrestore(&adev->vm_manager.pasids, flags);
> +
> + return ti;
> }
>
> static int amdgpu_vm_create_task_info(struct amdgpu_vm *vm)
> --
> 2.34.1
>
amdgpu_vm_get_task_info_pasid() currently looks up the VM from the
PASID xarray, drops the xarray lock, and only then grabs the task_info
reference through amdgpu_vm_get_task_info_vm().
Take the task_info reference directly while holding the PASID xarray
lock instead. This keeps the lookup and reference acquisition in the
same critical section.
The task_info is dropped only after the VM is removed from the PASID
mapping, so a regular kref_get() is sufficient here.
Also remove the now unnecessary amdgpu_vm_get_vm_from_pasid() helper.
Fixes: b8f67b9ddf4f ("drm/amdgpu: change vm->task_info handling")
Signed-off-by: Fan Wu <fanwu01@zju.edu.cn>
---
v2:
- grab the task_info reference directly under the PASID xarray lock
- drop the extra local-variable comment
- use kref_get() instead of kref_get_unless_zero()
- simplify the changelog
drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 35 +++++++++++++++-----------
1 file changed, 20 insertions(+), 15 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index f2beb980e3c3..706ca8dd65d2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -2468,19 +2468,6 @@ static void amdgpu_vm_destroy_task_info(struct kref *kref)
kfree(ti);
}
-static inline struct amdgpu_vm *
-amdgpu_vm_get_vm_from_pasid(struct amdgpu_device *adev, u32 pasid)
-{
- struct amdgpu_vm *vm;
- unsigned long flags;
-
- xa_lock_irqsave(&adev->vm_manager.pasids, flags);
- vm = xa_load(&adev->vm_manager.pasids, pasid);
- xa_unlock_irqrestore(&adev->vm_manager.pasids, flags);
-
- return vm;
-}
-
/**
* amdgpu_vm_put_task_info - reference down the vm task_info ptr
*
@@ -2527,8 +2514,26 @@ amdgpu_vm_get_task_info_vm(struct amdgpu_vm *vm)
struct amdgpu_task_info *
amdgpu_vm_get_task_info_pasid(struct amdgpu_device *adev, u32 pasid)
{
- return amdgpu_vm_get_task_info_vm(
- amdgpu_vm_get_vm_from_pasid(adev, pasid));
+ struct amdgpu_vm *vm;
+ unsigned long flags;
+ struct amdgpu_task_info *ti = NULL;
+
+ /*
+ * Grab the task_info reference while still holding the pasids xarray
+ * lock, so the lookup and the reference acquisition happen in the same
+ * critical section. The task_info is dropped only after the VM has
+ * been removed from the PASID mapping, so a plain kref_get() is
+ * sufficient here.
+ */
+ xa_lock_irqsave(&adev->vm_manager.pasids, flags);
+ vm = xa_load(&adev->vm_manager.pasids, pasid);
+ if (vm && vm->task_info) {
+ ti = vm->task_info;
+ kref_get(&ti->refcount);
+ }
+ xa_unlock_irqrestore(&adev->vm_manager.pasids, flags);
+
+ return ti;
}
static int amdgpu_vm_create_task_info(struct amdgpu_vm *vm)
--
2.34.1
© 2016 - 2026 Red Hat, Inc.