When using GPUVM in immediate mode, it is necessary to call
drm_gpuva_unlink() from the fence signalling critical path. However,
unlink may call drm_gpuvm_bo_put(), which causes some challenges:
1. drm_gpuvm_bo_put() often requires you to take resv locks, which you
can't do from the fence signalling critical path.
2. drm_gpuvm_bo_put() calls drm_gem_object_put(), which is often going
to be unsafe to call from the fence signalling critical path.
To solve these issues, add a deferred version of drm_gpuva_unlink() that
adds the vm_bo to a deferred cleanup list and cleans it up later.
The new methods take the GEM's GPUVA lock internally rather than letting
the caller do it, because they also need to perform an operation after
releasing the mutex again. This is to prevent freeing the GEM while
holding the mutex (more info in the comments in the patch). This means
that the new methods can only be used with DRM_GPUVM_IMMEDIATE_MODE.
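
For illustration, a driver in immediate mode would use the new API
roughly like this (the my_* names are placeholders, not part of this
patch):

	/* Fence signalling critical path, e.g. run_job(): */
	static void my_unmap_in_run_job(struct my_job *job)
	{
		/*
		 * Takes obj->gpuva.lock internally; queues the vm_bo on
		 * the VM's bo_defer list when the last reference drops.
		 * No resv locks taken, no GEM object put here.
		 */
		drm_gpuva_unlink_defer(job->va);
	}

	/* Later, from process context (e.g. a VM_BIND ioctl): */
	static void my_cleanup(struct drm_gpuvm *gpuvm)
	{
		/* May take resv locks and put GEM objects. */
		drm_gpuvm_bo_deferred_cleanup(gpuvm);
	}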
Signed-off-by: Alice Ryhl <aliceryhl@google.com>
---
drivers/gpu/drm/drm_gpuvm.c | 167 ++++++++++++++++++++++++++++++++++++++++++++
include/drm/drm_gpuvm.h | 26 +++++++
2 files changed, 193 insertions(+)
diff --git a/drivers/gpu/drm/drm_gpuvm.c b/drivers/gpu/drm/drm_gpuvm.c
index 78a1a4f095095e9379bdf604d583f6c8b9863ccb..849b6c08f83dcba832eda372bd3ce62b540e144b 100644
--- a/drivers/gpu/drm/drm_gpuvm.c
+++ b/drivers/gpu/drm/drm_gpuvm.c
@@ -876,6 +876,25 @@ __drm_gpuvm_bo_list_add(struct drm_gpuvm *gpuvm, spinlock_t *lock,
cond_spin_unlock(lock, !!lock);
}
+/**
+ * drm_gpuvm_bo_is_dead() - check whether this vm_bo is scheduled for cleanup
+ * @vm_bo: the &drm_gpuvm_bo
+ *
+ * When a vm_bo is scheduled for cleanup using the bo_defer list, it is not
+ * immediately removed from the evict and extobj lists. Therefore, anyone
+ * iterating these lists should skip entries that are being destroyed.
+ *
+ * Checking the refcount without incrementing it is okay as long as the lock
+ * protecting the evict/extobj list is held for as long as you are using the
+ * vm_bo, because even if the refcount hits zero while you are using it, freeing
+ * the vm_bo requires taking the list's lock.
+ */
+static bool
+drm_gpuvm_bo_is_dead(struct drm_gpuvm_bo *vm_bo)
+{
+ return !kref_read(&vm_bo->kref);
+}
+
/**
* drm_gpuvm_bo_list_add() - insert a vm_bo into the given list
* @__vm_bo: the &drm_gpuvm_bo
@@ -1081,6 +1100,9 @@ drm_gpuvm_init(struct drm_gpuvm *gpuvm, const char *name,
INIT_LIST_HEAD(&gpuvm->evict.list);
spin_lock_init(&gpuvm->evict.lock);
+ INIT_LIST_HEAD(&gpuvm->bo_defer.list);
+ spin_lock_init(&gpuvm->bo_defer.lock);
+
kref_init(&gpuvm->kref);
gpuvm->name = name ? name : "unknown";
@@ -1122,6 +1144,8 @@ drm_gpuvm_fini(struct drm_gpuvm *gpuvm)
"Extobj list should be empty.\n");
drm_WARN(gpuvm->drm, !list_empty(&gpuvm->evict.list),
"Evict list should be empty.\n");
+ drm_WARN(gpuvm->drm, !list_empty(&gpuvm->bo_defer.list),
+ "VM BO cleanup list should be empty.\n");
drm_gem_object_put(gpuvm->r_obj);
}
@@ -1217,6 +1241,9 @@ drm_gpuvm_prepare_objects_locked(struct drm_gpuvm *gpuvm,
drm_gpuvm_resv_assert_held(gpuvm);
list_for_each_entry(vm_bo, &gpuvm->extobj.list, list.entry.extobj) {
+ if (drm_gpuvm_bo_is_dead(vm_bo))
+ continue;
+
ret = exec_prepare_obj(exec, vm_bo->obj, num_fences);
if (ret)
break;
@@ -1460,6 +1487,9 @@ drm_gpuvm_validate_locked(struct drm_gpuvm *gpuvm, struct drm_exec *exec)
list_for_each_entry_safe(vm_bo, next, &gpuvm->evict.list,
list.entry.evict) {
+ if (drm_gpuvm_bo_is_dead(vm_bo))
+ continue;
+
ret = ops->vm_bo_validate(vm_bo, exec);
if (ret)
break;
@@ -1560,6 +1590,7 @@ drm_gpuvm_bo_create(struct drm_gpuvm *gpuvm,
INIT_LIST_HEAD(&vm_bo->list.entry.extobj);
INIT_LIST_HEAD(&vm_bo->list.entry.evict);
+ INIT_LIST_HEAD(&vm_bo->list.entry.bo_defer);
return vm_bo;
}
@@ -1621,6 +1652,106 @@ drm_gpuvm_bo_put(struct drm_gpuvm_bo *vm_bo)
}
EXPORT_SYMBOL_GPL(drm_gpuvm_bo_put);
+static void
+drm_gpuvm_bo_defer_locked(struct kref *kref)
+{
+ struct drm_gpuvm_bo *vm_bo = container_of(kref, struct drm_gpuvm_bo,
+ kref);
+ struct drm_gpuvm *gpuvm = vm_bo->vm;
+
+ if (!drm_gpuvm_resv_protected(gpuvm)) {
+ drm_gpuvm_bo_list_del(vm_bo, extobj, true);
+ drm_gpuvm_bo_list_del(vm_bo, evict, true);
+ }
+
+ list_del(&vm_bo->list.entry.gem);
+ mutex_unlock(&vm_bo->obj->gpuva.lock);
+}
+
+/**
+ * drm_gpuvm_bo_put_deferred() - drop a struct drm_gpuvm_bo reference with
+ * deferred cleanup
+ * @vm_bo: the &drm_gpuvm_bo to release the reference of
+ *
+ * This releases a reference to @vm_bo.
+ *
+ * This might take and release the GEM's GPUVA lock. You should call
+ * drm_gpuvm_bo_deferred_cleanup() later to complete the cleanup process.
+ *
+ * Returns: true if vm_bo is being destroyed, false otherwise.
+ */
+bool
+drm_gpuvm_bo_put_deferred(struct drm_gpuvm_bo *vm_bo)
+{
+ bool defer;
+
+ if (!vm_bo)
+ return false;
+
+ drm_WARN_ON(vm_bo->vm->drm, !drm_gpuvm_immediate_mode(vm_bo->vm));
+
+ defer = kref_put_mutex(&vm_bo->kref, drm_gpuvm_bo_defer_locked,
+ &vm_bo->obj->gpuva.lock);
+
+ /*
+ * It's important that the GEM stays alive for the duration in which
+ * drm_gpuvm_bo_defer_locked() holds the mutex, but the instant we add
+ * the vm_bo to bo_defer, another thread might call
+ * drm_gpuvm_bo_deferred_cleanup() and put the GEM. For this reason, we
+ * add the vm_bo to bo_defer after releasing the GEM's mutex.
+ */
+ if (defer)
+ drm_gpuvm_bo_list_add(vm_bo, bo_defer, true);
+
+ return defer;
+}
+EXPORT_SYMBOL_GPL(drm_gpuvm_bo_put_deferred);
+
+/**
+ * drm_gpuvm_bo_deferred_cleanup() - clean up vm_bos on the deferred
+ * cleanup list
+ * @gpuvm: the VM to clean up
+ *
+ * Cleans up &drm_gpuvm_bo instances in the deferred cleanup list.
+ */
+void
+drm_gpuvm_bo_deferred_cleanup(struct drm_gpuvm *gpuvm)
+{
+ const struct drm_gpuvm_ops *ops = gpuvm->ops;
+ struct drm_gpuvm_bo *vm_bo;
+ struct drm_gem_object *obj;
+ LIST_HEAD(bo_defer);
+
+ spin_lock(&gpuvm->bo_defer.lock);
+ list_replace_init(&gpuvm->bo_defer.list, &bo_defer);
+ spin_unlock(&gpuvm->bo_defer.lock);
+
+ if (drm_gpuvm_resv_protected(gpuvm)) {
+ dma_resv_lock(drm_gpuvm_resv(gpuvm), NULL);
+ list_for_each_entry(vm_bo, &bo_defer, list.entry.bo_defer) {
+ drm_gpuvm_bo_list_del(vm_bo, extobj, false);
+ drm_gpuvm_bo_list_del(vm_bo, evict, false);
+ }
+ dma_resv_unlock(drm_gpuvm_resv(gpuvm));
+ }
+
+ while (!list_empty(&bo_defer)) {
+ vm_bo = list_first_entry(&bo_defer,
+ struct drm_gpuvm_bo, list.entry.bo_defer);
+ obj = vm_bo->obj;
+
+ list_del(&vm_bo->list.entry.bo_defer);
+ if (ops && ops->vm_bo_free)
+ ops->vm_bo_free(vm_bo);
+ else
+ kfree(vm_bo);
+
+ drm_gpuvm_put(gpuvm);
+ drm_gem_object_put(obj);
+ }
+}
+EXPORT_SYMBOL_GPL(drm_gpuvm_bo_deferred_cleanup);
+
static struct drm_gpuvm_bo *
__drm_gpuvm_bo_find(struct drm_gpuvm *gpuvm,
struct drm_gem_object *obj)
@@ -1948,6 +2079,42 @@ drm_gpuva_unlink(struct drm_gpuva *va)
}
EXPORT_SYMBOL_GPL(drm_gpuva_unlink);
+/**
+ * drm_gpuva_unlink_defer() - unlink a &drm_gpuva with deferred vm_bo cleanup
+ * @va: the &drm_gpuva to unlink
+ *
+ * Similar to drm_gpuva_unlink(), but uses drm_gpuvm_bo_put_deferred() and takes
+ * the lock for the caller.
+ */
+void
+drm_gpuva_unlink_defer(struct drm_gpuva *va)
+{
+ struct drm_gem_object *obj = va->gem.obj;
+ struct drm_gpuvm_bo *vm_bo = va->vm_bo;
+
+ if (unlikely(!obj))
+ return;
+
+ drm_WARN_ON(vm_bo->vm->drm, !drm_gpuvm_immediate_mode(vm_bo->vm));
+
+ mutex_lock(&obj->gpuva.lock);
+ list_del_init(&va->gem.entry);
+
+ /*
+ * This is drm_gpuvm_bo_put_deferred() slightly modified since we
+ * already hold the mutex. It's important that we add the vm_bo to
+ * bo_defer after releasing the mutex for the same reason as in
+ * drm_gpuvm_bo_put_deferred().
+ */
+ if (kref_put(&vm_bo->kref, drm_gpuvm_bo_defer_locked))
+ drm_gpuvm_bo_list_add(vm_bo, bo_defer, true);
+ else
+ mutex_unlock(&obj->gpuva.lock);
+
+ va->vm_bo = NULL;
+}
+EXPORT_SYMBOL_GPL(drm_gpuva_unlink_defer);
+
/**
* drm_gpuva_find_first() - find the first &drm_gpuva in the given range
* @gpuvm: the &drm_gpuvm to search in
diff --git a/include/drm/drm_gpuvm.h b/include/drm/drm_gpuvm.h
index 727b8f336fad0d853998e4379cbd374155468e18..1f80389e14312f552a8abc95d12f52f83beb9be8 100644
--- a/include/drm/drm_gpuvm.h
+++ b/include/drm/drm_gpuvm.h
@@ -152,6 +152,7 @@ void drm_gpuva_remove(struct drm_gpuva *va);
void drm_gpuva_link(struct drm_gpuva *va, struct drm_gpuvm_bo *vm_bo);
void drm_gpuva_unlink(struct drm_gpuva *va);
+void drm_gpuva_unlink_defer(struct drm_gpuva *va);
struct drm_gpuva *drm_gpuva_find(struct drm_gpuvm *gpuvm,
u64 addr, u64 range);
@@ -331,6 +332,22 @@ struct drm_gpuvm {
*/
spinlock_t lock;
} evict;
+
+ /**
+ * @bo_defer: structure holding vm_bos that need to be destroyed
+ */
+ struct {
+ /**
+ * @bo_defer.list: &list_head storing &drm_gpuvm_bos that need
+ * to be destroyed
+ */
+ struct list_head list;
+
+ /**
+ * @bo_defer.lock: spinlock to protect the bo_defer list
+ */
+ spinlock_t lock;
+ } bo_defer;
};
void drm_gpuvm_init(struct drm_gpuvm *gpuvm, const char *name,
@@ -714,6 +731,12 @@ struct drm_gpuvm_bo {
* &drm_gpuvms evict list.
*/
struct list_head evict;
+
+ /**
+ * @list.entry.bo_defer: List entry to attach to
+ * the &drm_gpuvms bo_defer list.
+ */
+ struct list_head bo_defer;
} entry;
} list;
};
@@ -746,6 +769,9 @@ drm_gpuvm_bo_get(struct drm_gpuvm_bo *vm_bo)
bool drm_gpuvm_bo_put(struct drm_gpuvm_bo *vm_bo);
+bool drm_gpuvm_bo_put_deferred(struct drm_gpuvm_bo *vm_bo);
+void drm_gpuvm_bo_deferred_cleanup(struct drm_gpuvm *gpuvm);
+
struct drm_gpuvm_bo *
drm_gpuvm_bo_find(struct drm_gpuvm *gpuvm,
struct drm_gem_object *obj);
--
2.51.0.355.g5224444f11-goog
On Fri, 05 Sep 2025 12:11:28 +0000
Alice Ryhl <aliceryhl@google.com> wrote:

[...]

> +/**
> + * drm_gpuvm_bo_is_dead() - check whether this vm_bo is scheduled for cleanup
> + * @vm_bo: the &drm_gpuvm_bo
> + *
> + * When a vm_bo is scheduled for cleanup using the bo_defer list, it is not
> + * immediately removed from the evict and extobj lists.

Maybe mention that it can't be, because those lists are protected with
the resv lock and we can't take this lock when we're in immediate mode?

> Therefore, anyone
> + * iterating these lists should skip entries that are being destroyed.
[...]
> +static bool
> +drm_gpuvm_bo_is_dead(struct drm_gpuvm_bo *vm_bo)
> +{
> + return !kref_read(&vm_bo->kref);

I'm not too sure I like the idea of [ab]using vm_bo::kref to defer the
vm_bo release. I get why it's done like that, but I'm wondering why we
don't defer the release of drm_gpuva objects instead (which is really
what's being released in va_unlink()). I can imagine drivers wanting to
attach resources to the gpuva that can't be released in the
dma-signalling path in the future, and if we're doing that at the gpuva
level, we also get rid of this kref dance, since the va will hold a
vm_bo ref until it's destroyed.

Any particular reason you went for vm_bo destruction deferral instead
of gpuva?

> +}
[...]
> +static void
> +drm_gpuvm_bo_defer_locked(struct kref *kref)
> +{
[...]
> + list_del(&vm_bo->list.entry.gem);
> + mutex_unlock(&vm_bo->obj->gpuva.lock);

I got tricked by this implicit unlock, and the conditional unlocks it
creates in drm_gpuva_unlink_defer(). Honestly, I'd rather see this
unlock moved to drm_gpuva_unlink_defer() and a conditional unlock
added to drm_gpuvm_bo_put_deferred(), because it's easier to reason
about when the lock/unlock calls are in the same function
(kref_put_mutex() being the equivalent of a conditional lock).

[...]
On Fri, Sep 5, 2025 at 3:25 PM Boris Brezillon
<boris.brezillon@collabora.com> wrote:
> On Fri, 05 Sep 2025 12:11:28 +0000
> Alice Ryhl <aliceryhl@google.com> wrote:
[...]
> > + * When a vm_bo is scheduled for cleanup using the bo_defer list, it is not
> > + * immediately removed from the evict and extobj lists.
>
> Maybe mention that it can't be, because those lists are protected with
> the resv lock and we can't take this lock when we're in immediate mode?

Ok.

> > +static bool
> > +drm_gpuvm_bo_is_dead(struct drm_gpuvm_bo *vm_bo)
> > +{
> > + return !kref_read(&vm_bo->kref);
>
> I'm not too sure I like the idea of [ab]using vm_bo::kref to defer the
> vm_bo release. [...]
>
> Any particular reason you went for vm_bo destruction deferral instead
> of gpuva?

All of the things that were unsafe to release in the signalling path
were tied to the vm_bo, so that is why I went for vm_bo cleanup.
Another advantage is that it lets us use the same deferred logic for
the vm_bo_put() call that drops the refcount from vm_bo_obtain().

Of course if gpuvas might have resources that need deferred cleanup,
that might change the situation somewhat.

[...]
> > + list_del(&vm_bo->list.entry.gem);
> > + mutex_unlock(&vm_bo->obj->gpuva.lock);
>
> I got tricked by this implicit unlock, and the conditional unlocks it
> creates in drm_gpuva_unlink_defer(). [...]

Ok. I followed the docs of kref_put_mutex() that say to unlock it from
the function.

Alice
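
For reference, the kref_put_mutex() pattern referenced above, as
documented for the kref API, has the release callback run with the
mutex held and drop it before returning. A minimal sketch of that
documented pattern, with foo/foo_lock as placeholder names:

	static DEFINE_MUTEX(foo_lock);

	struct foo {
		struct kref kref;
		struct list_head node;	/* protected by foo_lock */
	};

	static void foo_release(struct kref *kref)
	{
		struct foo *f = container_of(kref, struct foo, kref);

		/* Called with foo_lock held; release() must drop it. */
		list_del(&f->node);
		mutex_unlock(&foo_lock);
		kfree(f);
	}

	void foo_put(struct foo *f)
	{
		/* Only takes foo_lock if the refcount drops to zero. */
		kref_put_mutex(&f->kref, foo_release, &foo_lock);
	}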
On Fri Sep 5, 2025 at 8:18 PM CEST, Alice Ryhl wrote:
> On Fri, Sep 5, 2025 at 3:25 PM Boris Brezillon
> <boris.brezillon@collabora.com> wrote:
>> Any particular reason you went for vm_bo destruction deferral instead
>> of gpuva?
>
> All of the things that were unsafe to release in the signalling path
> were tied to the vm_bo, so that is why I went for vm_bo cleanup.
> Another advantage is that it lets us use the same deferred logic for
> the vm_bo_put() call that drops the refcount from vm_bo_obtain().
>
> Of course if gpuvas might have resources that need deferred cleanup,
> that might change the situation somewhat.

I think we want to track PT(E) allocations, or rather reference counts
of page table structures carried by the drm_gpuva, but we don't need to
release them on drm_gpuva_unlink(), which is where we drop the reference
count of the vm_bo.

Deferring drm_gpuva_unlink() isn't really an option I think, the GEM's
list of VM_BOs and the VM_BO's list of VAs is usually used in
ttm_device_funcs::move to map or unmap all VAs associated with a GEM
object.

I think PT(E) reference counts etc. should be rather released when the
drm_gpuva is freed, i.e. page table allocations can be bound to the
lifetime of a drm_gpuva. Given that, I think that eventually we'll need
a cleanup list for those as well, since once they're removed from the VM
tree (in the fence signalling critical path), we lose access otherwise.

[...]
>> I got tricked by this implicit unlock [...]
>
> Ok. I followed the docs of kref_put_mutex() that say to unlock it from
> the function.

Yes, please keep it the way it is, I don't want to deviate from what is
documented and everyone else does. Besides that, I also think it's a
little less error prone.
On Sat, Sep 06, 2025 at 12:47:36AM +0200, Danilo Krummrich wrote:
> On Fri Sep 5, 2025 at 8:18 PM CEST, Alice Ryhl wrote:
[...]
> I think we want to track PT(E) allocations, or rather reference counts
> of page table structures carried by the drm_gpuva, but we don't need to
> release them on drm_gpuva_unlink(), which is where we drop the reference
> count of the vm_bo.
>
> Deferring drm_gpuva_unlink() isn't really an option I think, the GEM's
> list of VM_BOs and the VM_BO's list of VAs is usually used in
> ttm_device_funcs::move to map or unmap all VAs associated with a GEM
> object.
>
> I think PT(E) reference counts etc. should be rather released when the
> drm_gpuva is freed, i.e. page table allocations can be bound to the
> lifetime of a drm_gpuva. Given that, I think that eventually we'll need
> a cleanup list for those as well, since once they're removed from the VM
> tree (in the fence signalling critical path), we lose access otherwise.

Hmm. Another more conceptual issue with deferring gpuva is that
"immediate mode" is defined as having the GPUVM match the GPU's actual
address space at all times, which deferred gpuva cleanup would go
against.

Deferring vm_bo cleanup doesn't have this issue because even though the
vm_bo isn't kfreed immediately, all GPUVM apis still treat it as-if it
isn't there anymore.

[...]
> > Ok. I followed the docs of kref_put_mutex() that say to unlock it from
> > the function.
>
> Yes, please keep it the way it is, I don't want to deviate from what is
> documented and everyone else does. Besides that, I also think it's a
> little less error prone.

I gave it a try:

	bool
	drm_gpuvm_bo_put_deferred(struct drm_gpuvm_bo *vm_bo)
	{
		if (!vm_bo)
			return false;

		drm_WARN_ON(vm_bo->vm->drm, !drm_gpuvm_immediate_mode(vm_bo->vm));

		if (kref_put_mutex(&vm_bo->kref, drm_gpuvm_bo_defer_locked,
				   &vm_bo->obj->gpuva.lock)) {
			/*
			 * It's important that the GEM stays alive for the
			 * duration in which drm_gpuvm_bo_defer_locked() holds
			 * the mutex, but the instant we add the vm_bo to
			 * bo_defer, another thread might call
			 * drm_gpuvm_bo_deferred_cleanup() and put the GEM.
			 * For this reason, we add the vm_bo to bo_defer
			 * *after* releasing the GEM's mutex.
			 */
			mutex_unlock(&vm_bo->obj->gpuva.lock);
			drm_gpuvm_bo_list_add(vm_bo, bo_defer, true);
			return true;
		}

		return false;
	}

	void
	drm_gpuva_unlink_defer(struct drm_gpuva *va)
	{
		struct drm_gem_object *obj = va->gem.obj;
		struct drm_gpuvm_bo *vm_bo = va->vm_bo;
		bool should_defer_bo;

		if (unlikely(!obj))
			return;

		drm_WARN_ON(vm_bo->vm->drm, !drm_gpuvm_immediate_mode(vm_bo->vm));

		mutex_lock(&obj->gpuva.lock);
		list_del_init(&va->gem.entry);

		/*
		 * This is drm_gpuvm_bo_put_deferred() slightly modified since
		 * we already hold the mutex. It's important that we add the
		 * vm_bo to bo_defer after releasing the mutex for the same
		 * reason as in drm_gpuvm_bo_put_deferred().
		 */
		should_defer_bo = kref_put(&vm_bo->kref, drm_gpuvm_bo_defer_locked);
		mutex_unlock(&obj->gpuva.lock);
		if (should_defer_bo)
			drm_gpuvm_bo_list_add(vm_bo, bo_defer, true);

		va->vm_bo = NULL;
	}

I do think it looks relatively nice like this, particularly
drm_gpuva_unlink_defer(). But that's also the one not using
kref_put_mutex().

Alice
On Sun, 7 Sep 2025 11:15:20 +0000
Alice Ryhl <aliceryhl@google.com> wrote:

[...]
> I gave it a try:
>
[...]
> I do think it looks relatively nice like this, particularly
> drm_gpuva_unlink_defer().

I agree.

> But that's also the one not using
> kref_put_mutex().

Yeah, but that's the thing. I guess if drm_gpuvm_bo_defer_locked() was
only called from kref_put_mutex() this would be okay (though I still
have a hard time with those functions taking locks that have to be
released by the caller, but at least that's a well-known/documented
pattern). But it's also currently called from drm_gpuva_unlink_defer()
where the lock is taken but not released. I guess if the function name
was reflecting that (drm_gpuvm_bo_defer_locked_and_unlock()?), and with
a comment explaining why the lock is conditionally released in the
caller that would be acceptable, but I still find this locking scheme
quite confusing...
On Sun Sep 7, 2025 at 1:15 PM CEST, Alice Ryhl wrote:
[...]
> Hmm. Another more conceptual issue with deferring gpuva is that
> "immediate mode" is defined as having the GPUVM match the GPU's actual
> address space at all times, which deferred gpuva cleanup would go
> against.

Depends on what "deferred gpuva cleanup" means.

What needs to happen in the run_job() is drm_gpuva_unlink() and
drm_gpuva_unmap(). Freeing the drm_gpuva, including releasing the
associated driver specific resources, can be deferred.

> Deferring vm_bo cleanup doesn't have this issue because even though the
> vm_bo isn't kfreed immediately, all GPUVM apis still treat it as-if it
> isn't there anymore.
On Sun, Sep 07, 2025 at 01:28:05PM +0200, Danilo Krummrich wrote:
[...]
> Depends on what "deferred gpuva cleanup" means.
>
> What needs to happen in the run_job() is drm_gpuva_unlink() and
> drm_gpuva_unmap(). Freeing the drm_gpuva, including releasing the
> associated driver specific resources, can be deferred.

Yeah I guess we could have unlink remove the gpuva, but then allow the
end-user to attach the gpuva to a list of gpuvas to kfree deferred. That
way, the drm_gpuva_unlink() is not deferred but any resources it has can
be.

Of course, this approach also makes deferred gpuva cleanup somewhat
orthogonal to this patch.

One annoying part is that we don't have a gpuvm ops operation for
freeing gpuva, and if we add one for this, it would *only* be used in
this case as most drivers explicitly kfree gpuvas, which could be
confusing for end-users.

Alice
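
To make the alternative concrete, a rough sketch of deferred gpuva
freeing along the lines discussed above (the my_* names and the
va_defer llist are hypothetical, not part of this series): unlink the
VA immediately, but let the driver queue it so that the vm_bo put and
the kfree happen in process context:

	struct my_vm {
		struct drm_gpuvm base;
		struct llist_head va_defer;
	};

	struct my_va {
		struct drm_gpuva va;
		struct llist_node defer_node;
	};

	/* Fence signalling path: detach the VA now, free it later. */
	static void my_va_unlink_defer(struct my_va *mva, struct my_vm *vm)
	{
		struct drm_gem_object *obj = mva->va.gem.obj;

		mutex_lock(&obj->gpuva.lock);
		list_del_init(&mva->va.gem.entry);
		mutex_unlock(&obj->gpuva.lock);

		/* The VA keeps its vm_bo reference until the deferred step. */
		llist_add(&mva->defer_node, &vm->va_defer);
	}

	/* Process context: now it is safe to drop refs and kfree. */
	static void my_vm_va_cleanup(struct my_vm *vm)
	{
		struct my_va *mva, *tmp;

		llist_for_each_entry_safe(mva, tmp,
					  llist_del_all(&vm->va_defer),
					  defer_node) {
			drm_gpuvm_bo_put(mva->va.vm_bo);
			kfree(mva);
		}
	}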
Hi Alice,

On Sun, 7 Sep 2025 11:39:41 +0000
Alice Ryhl <aliceryhl@google.com> wrote:

[...]
> Yeah I guess we could have unlink remove the gpuva, but then allow the
> end-user to attach the gpuva to a list of gpuvas to kfree deferred. That
> way, the drm_gpuva_unlink() is not deferred but any resources it has can
> be.

This ^.

> Of course, this approach also makes deferred gpuva cleanup somewhat
> orthogonal to this patch.

Well, yes and no, because if you go for gpuva deferred cleanup, you
don't really need the fancy kref_put() you have in this patch, it's
just a regular vm_bo_put() that's called in the deferred gpuva path on
the vm_bo attached to the gpuva being released.

> One annoying part is that we don't have a gpuvm ops operation for
> freeing gpuva, and if we add one for this, it would *only* be used in
> this case as most drivers explicitly kfree gpuvas, which could be
> confusing for end-users.

Also not sure ::vm_bo_free() was meant to be used like that. It was for
drivers that need to control the drm_gpuvm_bo allocation, not those
that rely on the default implementation (kmalloc). Given how things
are described in the doc, it feels weird to have a ::vm_bo_free()
without ::vm_bo_alloc(). So, if we decide to go this way (which I'm
still not convinced we should, given ultimately we might want to defer
gpuvas cleanup), the ::vm_bo_free() doc should be extended to cover
this 'deferred vm_bo free' case.

Regards,

Boris
On Mon, Sep 08, 2025 at 09:11:40AM +0200, Boris Brezillon wrote:
> Hi Alice,
>
[...]
> Well, yes and no, because if you go for gpuva deferred cleanup, you
> don't really need the fancy kref_put() you have in this patch, it's
> just a regular vm_bo_put() that's called in the deferred gpuva path on
> the vm_bo attached to the gpuva being released.

Ok, so what you suggest is that on gpuva_unlink() we remove the gpuva
from the vm_bo's list, but then instead of putting the vm_bo's refcount,
we add the gpuva to a list, and in the deferred cleanup codepath we
iterate gpuvas and drop vm_bo refcounts *at that point*. Is that
understood correctly?

That means we don't immediately remove the vm_bo from the gem.gpuva
list, but the gpuva list in the vm_bo will be empty. I guess you already
have to handle such vm_bos anyway since you can already have an empty
vm_bo in between vm_bo_obtain() and the first call to gpuva_link().

One disadvantage is that we might end up preparing or unevicting a GEM
object that doesn't have any VAs left, which the current approach
avoids.

> Also not sure ::vm_bo_free() was meant to be used like that. It was for
> drivers that need to control the drm_gpuvm_bo allocation, not those
> that rely on the default implementation (kmalloc). [...]

I can implement vm_bo_alloc() too, but I think it seems like a pretty
natural way to use vm_bo_free().

Alice
On Mon, 8 Sep 2025 08:26:13 +0000
Alice Ryhl <aliceryhl@google.com> wrote:

> On Mon, Sep 08, 2025 at 09:11:40AM +0200, Boris Brezillon wrote:
> > [...]
> >
> > Well, yes and no, because if you go for deferred gpuva cleanup, you
> > don't really need the fancy kref_put() you have in this patch; it's
> > just a regular vm_bo_put() that's called in the deferred gpuva path on
> > the vm_bo attached to the gpuva being released.
>
> Ok, so what you suggest is that on gpuva_unlink() we remove the gpuva
> from the vm_bo's list, but then instead of putting the vm_bo's refcount,
> we add the gpuva to a list, and in the deferred cleanup codepath we
> iterate gpuvas and drop vm_bo refcounts *at that point*. Did I
> understand that correctly?

Yes, that's what I'm suggesting.

> That means we don't immediately remove the vm_bo from the gem.gpuva
> list, but the gpuva list in the vm_bo will be empty. I guess you already
> have to handle such vm_bos anyway, since you can already have an empty
> vm_bo between vm_bo_obtain() and the first call to gpuva_link().
>
> One disadvantage is that we might end up preparing or unevicting a GEM
> object that doesn't have any VAs left, which the current approach
> avoids.

Fair enough, though, as you said, the unnecessary ::prepare() already
exists in the "queue-map-operation" path, because the object is added to
the extobj list as soon as vm_bo_obtain() is called, not at _map() time.
This being said, I don't know if it really matters in practice, because:

1. if buffer eviction happens too often, the system will already suffer
   from the constant swapout/swapin; it's not a single buffer that's
   going to make a difference

2. hopefully the buffer will be gone before the next job is submitted
   most of the time

3. we can flush the gpuva destruction list before preparing/unevicting
   GEM objects, so we don't end up processing vm_bos that no longer
   exist. Actually, I think this step is good to have regardless of what
   we decide here, because the fewer elements we have to iterate the
   better, and in the submit path we've already acquired all the GEM
   locks needed for that cleanup, so we're probably better off doing it
   right away.

> > > One annoying part is that we don't have a gpuvm ops operation for
> > > freeing gpuvas, and if we add one for this, it would *only* be used
> > > in this case, as most drivers explicitly kfree gpuvas, which could
> > > be confusing for end-users.
> >
> > Also not sure ::vm_bo_free() was meant to be used like that. It was
> > for drivers that need to control the drm_gpuvm_bo allocation, not
> > those that rely on the default implementation (kmalloc). [...]
>
> I can implement vm_bo_alloc() too, but this seems like a pretty natural
> way to use vm_bo_free().

Not necessarily saying we should have a vm_bo_alloc(), just that it
feels weird as it is now, because of the asymmetry and how things are
described in the doc.
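For reference, a driver wanting the symmetric setup being discussed here
could wire up both hooks roughly like this. The my_* names are made up,
and the callback signatures are from memory of drm_gpuvm.h, so treat
this as a sketch worth double-checking:

	struct my_vm_bo {
		struct drm_gpuvm_bo base;
		/* driver-private state could live here */
	};

	static struct drm_gpuvm_bo *my_vm_bo_alloc(void)
	{
		struct my_vm_bo *bo = kzalloc(sizeof(*bo), GFP_KERNEL);

		return bo ? &bo->base : NULL;
	}

	static void my_vm_bo_free(struct drm_gpuvm_bo *vm_bo)
	{
		kfree(container_of(vm_bo, struct my_vm_bo, base));
	}

	static const struct drm_gpuvm_ops my_gpuvm_ops = {
		.vm_bo_alloc	= my_vm_bo_alloc,
		.vm_bo_free	= my_vm_bo_free,
	};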
On Mon Sep 8, 2025 at 10:26 AM CEST, Alice Ryhl wrote:
> On Mon, Sep 08, 2025 at 09:11:40AM +0200, Boris Brezillon wrote:
>> [...]
>>
>> Well, yes and no, because if you go for deferred gpuva cleanup, you
>> don't really need the fancy kref_put() you have in this patch; it's
>> just a regular vm_bo_put() that's called in the deferred gpuva path on
>> the vm_bo attached to the gpuva being released.
>
> Ok, so what you suggest is that on gpuva_unlink() we remove the gpuva
> from the vm_bo's list, but then instead of putting the vm_bo's refcount,
> we add the gpuva to a list, and in the deferred cleanup codepath we
> iterate gpuvas and drop vm_bo refcounts *at that point*. Did I
> understand that correctly?

It has to be a special unlink function though, since otherwise

	drm_gpuva_link();
	drm_gpuva_unlink();
	drm_gpuva_link();
	drm_gpuva_unlink();

leaks the VM_BO.

Sounds a bit messy, but my concern is really about the below:

> That means we don't immediately remove the vm_bo from the gem.gpuva
> list, but the gpuva list in the vm_bo will be empty. I guess you already
> have to handle such vm_bos anyway, since you can already have an empty
> vm_bo between vm_bo_obtain() and the first call to gpuva_link().
>
> One disadvantage is that we might end up preparing or unevicting a GEM
> object that doesn't have any VAs left, which the current approach
> avoids.

Yeah, we really want to avoid that.
On Mon, 08 Sep 2025 10:47:25 +0200
"Danilo Krummrich" <dakr@kernel.org> wrote:

> [...]
>
> It has to be a special unlink function though, since otherwise
>
>	drm_gpuva_link();
>	drm_gpuva_unlink();
>	drm_gpuva_link();
>	drm_gpuva_unlink();
>
> leaks the VM_BO.

I'm not following. Yes, there's going to be a
drm_gpuva_unlink_defer_put() that skips the

	va->vm_bo = NULL;
	drm_gpuvm_bo_put(vm_bo);

and adds the gpuva to a list for deferred destruction. But I'm not
seeing where the leak is. Once the gpuva has been put on this list,
there should be no existing component referring to this object, and it's
going to be destroyed or recycled, but not re-used as-is.

> Sounds a bit messy, but my concern is really about the below:
>
> > One disadvantage is that we might end up preparing or unevicting a GEM
> > object that doesn't have any VAs left, which the current approach
> > avoids.
>
> Yeah, we really want to avoid that.

I think I agree that we want to avoid it, but I'm not too sure about the
solution that involves playing with vm_bo::kref. I'm particularly
worried by the fact that drivers can iterate the evict/extobj lists
directly, and can thus see objects scheduled for destruction. I know
there's a gpuvm_bo_is_dead() helper, and drivers should be aware of the
risks, but I don't feel comfortable about this.

And since we've mentioned the possibility of having to support gpuva
destruction deferral too, I'm wondering if it wouldn't be cleaner to
just go for this approach from the start (the gpuva owns a ref on the
vm_bo, which gets released when the gpuva object is released).
On Mon Sep 8, 2025 at 12:20 PM CEST, Boris Brezillon wrote:
> I'm not following. Yes, there's going to be a
> drm_gpuva_unlink_defer_put() that skips the
>
>	va->vm_bo = NULL;
>	drm_gpuvm_bo_put(vm_bo);
>
> and adds the gpuva to a list for deferred destruction. But I'm not
> seeing where the leak is. Once the gpuva has been put on this list,
> there should be no existing component referring to this object, and
> it's going to be destroyed or recycled, but not re-used as-is.

I'm saying exactly what you say: "has to be a special unlink function" ->
drm_gpuva_unlink_defer_put(). :)

>> Yeah, we really want to avoid that.
>
> I think I agree that we want to avoid it, but I'm not too sure about
> the solution that involves playing with vm_bo::kref. I'm particularly
> worried by the fact that drivers can iterate the evict/extobj lists
> directly, and can thus see objects scheduled for destruction. I know
> there's a gpuvm_bo_is_dead() helper, and drivers should be aware of the
> risks, but I don't feel comfortable about this.

No, drivers can't iterate the evict/extobj lists directly; or at least
this is not intended by GPUVM's API, and if drivers do so, it is
considered peeking into GPUVM internals, so drivers are on their own
anyway.

Iterators such as for_each_vm_bo_in_list() are not exposed to drivers.

> And since we've mentioned the possibility of having to support gpuva
> destruction deferral too, I'm wondering if it wouldn't be cleaner to
> just go for this approach from the start (the gpuva owns a ref on the
> vm_bo, which gets released when the gpuva object is released).
On Mon, 08 Sep 2025 13:11:32 +0200
"Danilo Krummrich" <dakr@kernel.org> wrote:

> On Mon Sep 8, 2025 at 12:20 PM CEST, Boris Brezillon wrote:
> > I'm not following. Yes, there's going to be a
> > drm_gpuva_unlink_defer_put() that skips the
> >
> >	va->vm_bo = NULL;
> >	drm_gpuvm_bo_put(vm_bo);
> >
> > and adds the gpuva to a list for deferred destruction. But I'm not
> > seeing where the leak is. [...]
>
> I'm saying exactly what you say: "has to be a special unlink function" ->
> drm_gpuva_unlink_defer_put(). :)

I don't see how calling drm_gpuva_unlink() instead of
drm_gpuva_unlink_defer_put() would leak the vm_bo, though. I mean, it
would certainly be wrong, because you'd be calling cleanup methods that
are expected to be called with the resv lock held from the
dma-signalling path, but that's a different issue, no?

Anyway, if we're going to allow gpuva cleanup/destruction deferral,
we'll either need to do that through a different function, or through
some specialization of drm_gpuva_unlink() that does things differently
based on the immediate/non-immediate mode (or some other flag).

> >> Yeah, we really want to avoid that.
> >
> > I think I agree that we want to avoid it, but I'm not too sure about
> > the solution that involves playing with vm_bo::kref. [...]
>
> No, drivers can't iterate the evict/extobj lists directly; or at least
> this is not intended by GPUVM's API, and if drivers do so, it is
> considered peeking into GPUVM internals, so drivers are on their own
> anyway.
>
> Iterators such as for_each_vm_bo_in_list() are not exposed to drivers.

Okay, that's a good thing. I thought Xe was doing some funky stuff with
the list...

> > And since we've mentioned the possibility of having to support gpuva
> > destruction deferral too, I'm wondering if it wouldn't be cleaner to
> > just go for this approach from the start (the gpuva owns a ref on the
> > vm_bo, which gets released when the gpuva object is released).
On 9/8/25 2:11 PM, Boris Brezillon wrote:
> On Mon, 08 Sep 2025 13:11:32 +0200
> "Danilo Krummrich" <dakr@kernel.org> wrote:
>> I'm saying exactly what you say: "has to be a special unlink function" ->
>> drm_gpuva_unlink_defer_put(). :)
>
> I don't see how calling drm_gpuva_unlink() instead of
> drm_gpuva_unlink_defer_put() would leak the vm_bo, though.

Initially (i.e. a few mails back), it sounded to me as if you were
proposing to drop the drm_gpuva's vm_bo reference only when the gpuva is
freed.

>> No, drivers can't iterate the evict/extobj lists directly; or at least
>> this is not intended by GPUVM's API, and if drivers do so, it is
>> considered peeking into GPUVM internals, so drivers are on their own
>> anyway.
>>
>> Iterators such as for_each_vm_bo_in_list() are not exposed to drivers.
>
> Okay, that's a good thing. I thought Xe was doing some funky stuff with
> the list...

Maybe, I don't know. If they do so, they should send patches adding the
corresponding iterators and provide a rationale for why drivers need to
access those lists directly and why we can't provide an API that handles
the overall use-case, such as drm_gpuvm_prepare_objects(), etc.
Hi,

On 9/8/25 14:20, Danilo Krummrich wrote:
> On 9/8/25 2:11 PM, Boris Brezillon wrote:
>> Okay, that's a good thing. I thought Xe was doing some funky stuff with
>> the list...
>
> Maybe, I don't know. If they do so, they should send patches adding the
> corresponding iterators and provide a rationale for why drivers need to
> access those lists directly and why we can't provide an API that
> handles the overall use-case, such as drm_gpuvm_prepare_objects(), etc.

We're using the drm_gpuvm_*for_each* macros in drm_gpuvm.h, assuming
from the name and docs that they are driver API.

Also drm_gem_for_each_gpuvm_bo(), although this usage could easily be
converted to a helper.

So I don't think we're abusing internals ATM. If we are, we should of
course fix that.

IMO, if some iteration macros or members that are exposed in the
driver-facing headers need to be private (where it's not totally
obvious), they should be marked as such or moved to private headers.

Thanks,
Thomas
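For context, the iterators Thomas refers to live in the driver-facing
header. A minimal sketch of how a driver might use them follows;
my_dump_mappings() is a made-up example, and the caller is responsible
for the documented locking (e.g. holding the GEM's gpuva lock for
drm_gem_for_each_gpuvm_bo()):

	static void my_dump_mappings(struct drm_gpuvm *gpuvm,
				     struct drm_gem_object *obj)
	{
		struct drm_gpuvm_bo *vm_bo;
		struct drm_gpuva *va;

		/* Walk every VA in the VM's interval tree. */
		drm_gpuvm_for_each_va(va, gpuvm)
			pr_info("va: [0x%llx + 0x%llx]\n",
				va->va.addr, va->va.range);

		/* Walk every vm_bo attached to a GEM object. */
		drm_gem_for_each_gpuvm_bo(vm_bo, obj)
			pr_info("vm_bo on vm '%s'\n", vm_bo->vm->name);
	}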
On 9/9/25 12:39 PM, Thomas Hellström wrote:
> On 9/8/25 14:20, Danilo Krummrich wrote:
>> [...]
>
> We're using the drm_gpuvm_*for_each* macros in drm_gpuvm.h, assuming
> from the name and docs that they are driver API.
>
> Also drm_gem_for_each_gpuvm_bo(), although this usage could easily be
> converted to a helper.

We were talking about the extobj/evict lists; the ones you mention are
fine of course. :)
On Tue, 2025-09-09 at 12:47 +0200, Danilo Krummrich wrote:
> On 9/9/25 12:39 PM, Thomas Hellström wrote:
> > [...]
> >
> > We're using the drm_gpuvm_*for_each* macros in drm_gpuvm.h, assuming
> > from the name and docs that they are driver API.
> >
> > Also drm_gem_for_each_gpuvm_bo(), although this usage could easily
> > be converted to a helper.
>
> We were talking about the extobj/evict lists; the ones you mention are
> fine of course. :)

Hmm. On closer inspection, it looks like we're checking for the evict
list being empty: rebinding after validation may in theory evict some
BOs to system memory, and then we'd rerun the validation step if the
evict list was not empty.

We could of course add a helper for that, or if there are better
suggestions for handling that situation, that'd be fine as well.

Thanks,
Thomas
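A rough sketch of the retry pattern Thomas describes, to make the
discussion concrete: my_rebind() is a hypothetical driver hook, and the
list_empty() check on gpuvm->evict.list is exactly the direct peek into
GPUVM state that is being questioned here:

	static int my_validate_loop(struct drm_gpuvm *gpuvm,
				    struct drm_exec *exec)
	{
		int ret;

		do {
			ret = drm_gpuvm_validate(gpuvm, exec);
			if (ret)
				return ret;

			/* Rebinding may itself evict BOs to system memory. */
			ret = my_rebind(gpuvm, exec);
			if (ret)
				return ret;
		} while (!list_empty(&gpuvm->evict.list));

		return 0;
	}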
On Tue, Sep 9, 2025 at 1:11 PM Thomas Hellström
<thomas.hellstrom@linux.intel.com> wrote:
> On Tue, 2025-09-09 at 12:47 +0200, Danilo Krummrich wrote:
> > [...]
>
> Hmm. On closer inspection, it looks like we're checking for the evict
> list being empty: rebinding after validation may in theory evict some
> BOs to system memory, and then we'd rerun the validation step if the
> evict list was not empty.
>
> We could of course add a helper for that, or if there are better
> suggestions for handling that situation, that'd be fine as well.

I don't think an empty evict list means that there are no evicted GEMs.
It's possible for an extobj to be missing from the evict list in some
scenarios. That's why drm_gpuvm_prepare_objects_locked() checks the
evicted flag on each vm_bo in the extobj list: to ensure that the evict
list is up to date by the time you call into
drm_gpuvm_validate_locked().

Alice
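The mechanism Alice describes boils down to a per-vm_bo flag check in
the extobj loop; roughly (paraphrased from memory of the upstream
helper, so treat it as a sketch of GPUVM internals, not driver API):

	/* While preparing each extobj, re-add evicted BOs to the evict
	 * list; the list is only guaranteed up to date after this pass,
	 * so testing it for emptiness beforehand can miss evicted
	 * extobjs. */
	if (vm_bo->evicted)
		drm_gpuvm_bo_list_add(vm_bo, evict, false);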
On 9/9/25 1:24 PM, Alice Ryhl wrote:
> On Tue, Sep 9, 2025 at 1:11 PM Thomas Hellström
> <thomas.hellstrom@linux.intel.com> wrote:
>> [...]
>>
>> Hmm. On closer inspection, it looks like we're checking for the evict
>> list being empty: rebinding after validation may in theory evict some
>> BOs to system memory, and then we'd rerun the validation step if the
>> evict list was not empty.
>>
>> We could of course add a helper for that, or if there are better
>> suggestions for handling that situation, that'd be fine as well.
>
> I don't think an empty evict list means that there are no evicted GEMs.
> It's possible for an extobj to be missing from the evict list in some
> scenarios. That's why drm_gpuvm_prepare_objects_locked() checks the
> evicted flag on each vm_bo in the extobj list: to ensure that the evict
> list is up to date by the time you call into
> drm_gpuvm_validate_locked().

Indeed, though I would expect that Xe considers that? It was Thomas who
proposed the logic you describe here back then, IIRC. :)
On Tue, 2025-09-09 at 13:28 +0200, Danilo Krummrich wrote:
> On 9/9/25 1:24 PM, Alice Ryhl wrote:
> > [...]
> >
> > I don't think an empty evict list means that there are no evicted
> > GEMs. It's possible for an extobj to be missing from the evict list
> > in some scenarios. That's why drm_gpuvm_prepare_objects_locked()
> > checks the evicted flag on each vm_bo in the extobj list: to ensure
> > that the evict list is up to date by the time you call into
> > drm_gpuvm_validate_locked().
>
> Indeed, though I would expect that Xe considers that? It was Thomas who
> proposed the logic you describe here back then, IIRC. :)

Yeah, I don't think that eviction-while-validating could happen to
extobjs, but rather to local objects, if it happens at all anymore;
we've made a lot of changes in that area.

But moving forward, both the extobj and local object scenarios will
probably have to be considered in OOM situations, and then we'd of
course need to suggest suitable additions to drm_gpuvm to handle that.

/Thomas
On Sun Sep 7, 2025 at 1:39 PM CEST, Alice Ryhl wrote:
> On Sun, Sep 07, 2025 at 01:28:05PM +0200, Danilo Krummrich wrote:
>> On Sun Sep 7, 2025 at 1:15 PM CEST, Alice Ryhl wrote:
>> > On Sat, Sep 06, 2025 at 12:47:36AM +0200, Danilo Krummrich wrote:
>> >> On Fri Sep 5, 2025 at 8:18 PM CEST, Alice Ryhl wrote:
>> >> > On Fri, Sep 5, 2025 at 3:25 PM Boris Brezillon
>> >> > <boris.brezillon@collabora.com> wrote:
>> >> >> On Fri, 05 Sep 2025 12:11:28 +0000
>> >> >> Alice Ryhl <aliceryhl@google.com> wrote:
>> >> >> > +static bool
>> >> >> > +drm_gpuvm_bo_is_dead(struct drm_gpuvm_bo *vm_bo)
>> >> >> > +{
>> >> >> > +	return !kref_read(&vm_bo->kref);
>> >> >>
>> >> >> I'm not too sure I like the idea of [ab]using vm_bo::kref to defer
>> >> >> the vm_bo release. I get why it's done like that, but I'm wondering
>> >> >> why we don't defer the release of drm_gpuva objects instead (which
>> >> >> is really what's being released in va_unlink()). I can imagine
>> >> >> drivers wanting to attach resources to the gpuva that can't be
>> >> >> released in the dma-signalling path in the future, and if we're
>> >> >> doing that at the gpuva level, we also get rid of this kref dance,
>> >> >> since the va will hold a vm_bo ref until it's destroyed.
>> >> >>
>> >> >> Any particular reason you went for vm_bo destruction deferral
>> >> >> instead of gpuva?
>> >> >
>> >> > All of the things that were unsafe to release in the signalling path
>> >> > were tied to the vm_bo, so that is why I went for vm_bo cleanup.
>> >> > Another advantage is that it lets us use the same deferred logic for
>> >> > the vm_bo_put() call that drops the refcount from vm_bo_obtain().
>> >> >
>> >> > Of course, if gpuvas might have resources that need deferred
>> >> > cleanup, that might change the situation somewhat.
>> >>
>> >> I think we want to track PT(E) allocations, or rather reference counts
>> >> of page table structures carried by the drm_gpuva, but we don't need
>> >> to release them on drm_gpuva_unlink(), which is where we drop the
>> >> reference count of the vm_bo.
>> >>
>> >> Deferring drm_gpuva_unlink() isn't really an option, I think; the
>> >> GEM's list of VM_BOs and the VM_BO's list of VAs is usually used in
>> >> ttm_device_funcs::move to map or unmap all VAs associated with a GEM
>> >> object.
>> >>
>> >> I think PT(E) reference counts etc. should rather be released when the
>> >> drm_gpuva is freed, i.e. page table allocations can be bound to the
>> >> lifetime of a drm_gpuva. Given that, I think that eventually we'll
>> >> need a cleanup list for those as well, since once they're removed from
>> >> the VM tree (in the fence signalling critical path), we lose access
>> >> otherwise.
>> >
>> > Hmm. Another, more conceptual issue with deferring gpuva cleanup is
>> > that "immediate mode" is defined as having the GPUVM match the GPU's
>> > actual address space at all times, which deferred gpuva cleanup would
>> > go against.
>>
>> Depends on what "deferred gpuva cleanup" means.
>>
>> What needs to happen in run_job() is drm_gpuva_unlink() and
>> drm_gpuva_unmap(). Freeing the drm_gpuva, including releasing the
>> associated driver-specific resources, can be deferred.
>
> Yeah I guess we could have unlink remove the gpuva, but then allow the
> end-user to attach the gpuva to a list of gpuvas to kfree deferred. That
> way, the drm_gpuva_unlink() is not deferred but any resources it has can
> be.
>
> Of course, this approach also makes deferred gpuva cleanup somewhat
> orthogonal to this patch.

Yes, it is.
> One annoying part is that we don't have a gpuvm ops operation for
> freeing gpuvas, and if we add one for this, it would *only* be used in
> this case, as most drivers explicitly kfree gpuvas, which could be
> confusing for end-users.

I think the reason why I left GPUVA alloc and free to drivers was that I
was expecting them to use a dedicated kmem_cache for that.

However, we can still provide drm_gpuva_alloc() and drm_gpuva_free(),
plus va_alloc() and va_free() callbacks for drivers to implement.
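A sketch of the dedicated kmem_cache approach Danilo has in mind, under
the assumption that the driver embeds nothing extra in its GPUVAs (the
my_* names are illustrative):

	#include <linux/slab.h>
	#include <drm/drm_gpuvm.h>

	static struct kmem_cache *my_gpuva_cache;

	static int my_gpuva_cache_init(void)
	{
		/* Slab cache sized for struct drm_gpuva. */
		my_gpuva_cache = KMEM_CACHE(drm_gpuva, 0);
		return my_gpuva_cache ? 0 : -ENOMEM;
	}

	static struct drm_gpuva *my_gpuva_alloc(void)
	{
		return kmem_cache_zalloc(my_gpuva_cache, GFP_KERNEL);
	}

	static void my_gpuva_free(struct drm_gpuva *va)
	{
		kmem_cache_free(my_gpuva_cache, va);
	}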