drivers/dma-buf/dma-fence.c | 39 ++++++++++++++++++++++++++++--------- include/linux/dma-fence.h | 17 ++++++++++++---- 2 files changed, 43 insertions(+), 13 deletions(-)
The dma_fence backend_ops can access a fence. Hereby, a driver callback
will be running which likely will access driver specific data through
container_of(). If now, simultaneously, a driver signals the fence and
afterwards expects to run a driver specific destructor (using the same
data accessed through container_of()), there can be a race.
A driver very likely trusts that once it has signaled a fence, no one
will be accessing it anymore. Moreover, it might already want to free up
resources, making UAF bugs possible.
The race occurs because there are only pragmatic checks for the signaled
flag of a fence, without taking the fence lock. RCU guards exist, but
their purpose is to guard accesses through the backend_ops callbacks
against the driver (which implements the TEXT segment these callbacks
live in) from unloading.
Proper synchronization can be ensured by taking the fence lock. RCU is
still simultaneously required to guard against the unload.
Fix the races by taking the lock for all non-deprecated backend_ops
callbacks.
Conveniently, this also fixes a race where backend_ops->set_deadline()
might try to set a deadline for an already signaled fence.
Suggested-by: Danilo Krummrich <dakr@kernel.org>
Signed-off-by: Philipp Stanner <phasta@kernel.org>
---
We discovered this problem through our Rust abstractions, but it can
also occur in C.
The by far cleanest solution seems to be to use the fence lock. This RFC
serves to discuss whether there is anything preventing that.
(Patch so far just compile tested, to have some groundlayer for the
rough idea, to discuss it first)
---
drivers/dma-buf/dma-fence.c | 39 ++++++++++++++++++++++++++++---------
include/linux/dma-fence.h | 17 ++++++++++++----
2 files changed, 43 insertions(+), 13 deletions(-)
diff --git a/drivers/dma-buf/dma-fence.c b/drivers/dma-buf/dma-fence.c
index c7ea1e75d38a..b74f02f3cca8 100644
--- a/drivers/dma-buf/dma-fence.c
+++ b/drivers/dma-buf/dma-fence.c
@@ -629,7 +629,8 @@ EXPORT_SYMBOL(dma_fence_free);
static bool __dma_fence_enable_signaling(struct dma_fence *fence)
{
const struct dma_fence_ops *ops;
- bool was_set;
+ bool was_set, success;
+ unsigned long flags;
dma_fence_assert_held(fence);
@@ -644,7 +645,10 @@ static bool __dma_fence_enable_signaling(struct dma_fence *fence)
if (!was_set && ops && ops->enable_signaling) {
trace_dma_fence_enable_signal(fence);
- if (!ops->enable_signaling(fence)) {
+ dma_fence_lock_irqsave(fence, flags);
+ success = ops->enable_signaling(fence);
+ dma_fence_unlock_irqrestore(fence, flags);
+ if (!success) {
rcu_read_unlock();
dma_fence_signal_locked(fence);
return false;
@@ -1020,11 +1024,20 @@ EXPORT_SYMBOL(dma_fence_wait_any_timeout);
void dma_fence_set_deadline(struct dma_fence *fence, ktime_t deadline)
{
const struct dma_fence_ops *ops;
+ unsigned long flags;
rcu_read_lock();
ops = rcu_dereference(fence->ops);
- if (ops && ops->set_deadline && !dma_fence_is_signaled(fence))
+ if (!ops || !ops->set_deadline) {
+ rcu_read_unlock();
+ return;
+ }
+
+ dma_fence_lock_irqsave(fence, flags);
+ if (!dma_fence_is_signaled_locked(fence))
ops->set_deadline(fence, deadline);
+
+ dma_fence_unlock_irqrestore(fence, flags);
rcu_read_unlock();
}
EXPORT_SYMBOL(dma_fence_set_deadline);
@@ -1166,14 +1179,18 @@ EXPORT_SYMBOL(dma_fence_init64);
*/
const char __rcu *dma_fence_driver_name(struct dma_fence *fence)
{
+ const char __rcu *name = "detached-driver";
const struct dma_fence_ops *ops;
+ unsigned long flags;
/* RCU protection is required for safe access to returned string */
ops = rcu_dereference(fence->ops);
+ dma_fence_lock_irqsave(fence, flags);
if (!dma_fence_test_signaled_flag(fence))
- return (const char __rcu *)ops->get_driver_name(fence);
- else
- return (const char __rcu *)"detached-driver";
+ name = ops->get_driver_name(fence);
+ dma_fence_unlock_irqrestore(fence, flags);
+
+ return name;
}
EXPORT_SYMBOL(dma_fence_driver_name);
@@ -1199,13 +1216,17 @@ EXPORT_SYMBOL(dma_fence_driver_name);
*/
const char __rcu *dma_fence_timeline_name(struct dma_fence *fence)
{
+ const char __rcu *name = "signaled-timeline";
const struct dma_fence_ops *ops;
+ unsigned long flags;
/* RCU protection is required for safe access to returned string */
ops = rcu_dereference(fence->ops);
+ dma_fence_lock_irqsave(fence, flags);
if (!dma_fence_test_signaled_flag(fence))
- return (const char __rcu *)ops->get_driver_name(fence);
- else
- return (const char __rcu *)"signaled-timeline";
+ name = ops->get_driver_name(fence);
+ dma_fence_unlock_irqrestore(fence, flags);
+
+ return name;
}
EXPORT_SYMBOL(dma_fence_timeline_name);
diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h
index b52ab692b22e..b93c3f7f69fb 100644
--- a/include/linux/dma-fence.h
+++ b/include/linux/dma-fence.h
@@ -547,20 +547,29 @@ static inline bool
dma_fence_is_signaled(struct dma_fence *fence)
{
const struct dma_fence_ops *ops;
+ unsigned long flags;
+ bool signaled;
if (dma_fence_test_signaled_flag(fence))
return true;
rcu_read_lock();
ops = rcu_dereference(fence->ops);
- if (ops && ops->signaled && ops->signaled(fence)) {
+ if (!ops || !ops->signaled) {
rcu_read_unlock();
- dma_fence_signal(fence);
- return true;
+ return false;
}
+
+ dma_fence_lock_irqsave(fence, flags);
+ signaled = ops->signaled(fence);
+
+ if (signaled)
+ dma_fence_signal_locked(fence);
+
+ dma_fence_unlock_irqrestore(fence, flags);
rcu_read_unlock();
- return false;
+ return signaled;
}
/**
--
2.54.0
On 6/8/26 16:24, Philipp Stanner wrote:
> The dma_fence backend_ops can access a fence. Hereby, a driver callback
> will be running which likely will access driver specific data through
> container_of(). If now, simultaneously, a driver signals the fence and
> afterwards expects to run a driver specific destructor (using the same
> data accessed through container_of()), there can be a race.
>
> A driver very likely trusts that once it has signaled a fence, no one
> will be accessing it anymore. Moreover, it might already want to free up
> resources, making UAF bugs possible.
>
> The race occurs because there are only pragmatic checks for the signaled
> flag of a fence, without taking the fence lock. RCU guards exist, but
> their purpose is to guard accesses through the backend_ops callbacks
> against the driver (which implements the TEXT segment these callbacks
> live in) from unloading.
>
> Proper synchronization can be ensured by taking the fence lock. RCU is
> still simultaneously required to guard against the unload.
>
> Fix the races by taking the lock for all non-deprecated backend_ops
> callbacks.
That sounds like the fundamentally wrong approach to me.
The lock protects the dma_fence signaling state and *NOT* any driver state, so it should not be used to protect any driver state.
Drivers need to make sure that they protect their driver state with separate lock and don't rely on the dma_fence lock for this. This is actually the core of why we want to deprecate the shared dma_fence spinlock.
Regards,
Christian.
>
> Conveniently, this also fixes a race where backend_ops->set_deadline()
> might try to set a deadline for an already signaled fence.
>
> Suggested-by: Danilo Krummrich <dakr@kernel.org>
> Signed-off-by: Philipp Stanner <phasta@kernel.org>
> ---
> We discovered this problem through our Rust abstractions, but it can
> also occur in C.
>
> The by far cleanest solution seems to be to use the fence lock. This RFC
> serves to discuss whether there is anything preventing that.
>
> (Patch so far just compile tested, to have some groundlayer for the
> rough idea, to discuss it first)
> ---
> drivers/dma-buf/dma-fence.c | 39 ++++++++++++++++++++++++++++---------
> include/linux/dma-fence.h | 17 ++++++++++++----
> 2 files changed, 43 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/dma-buf/dma-fence.c b/drivers/dma-buf/dma-fence.c
> index c7ea1e75d38a..b74f02f3cca8 100644
> --- a/drivers/dma-buf/dma-fence.c
> +++ b/drivers/dma-buf/dma-fence.c
> @@ -629,7 +629,8 @@ EXPORT_SYMBOL(dma_fence_free);
> static bool __dma_fence_enable_signaling(struct dma_fence *fence)
> {
> const struct dma_fence_ops *ops;
> - bool was_set;
> + bool was_set, success;
> + unsigned long flags;
>
> dma_fence_assert_held(fence);
>
> @@ -644,7 +645,10 @@ static bool __dma_fence_enable_signaling(struct dma_fence *fence)
> if (!was_set && ops && ops->enable_signaling) {
> trace_dma_fence_enable_signal(fence);
>
> - if (!ops->enable_signaling(fence)) {
> + dma_fence_lock_irqsave(fence, flags);
> + success = ops->enable_signaling(fence);
> + dma_fence_unlock_irqrestore(fence, flags);
> + if (!success) {
> rcu_read_unlock();
> dma_fence_signal_locked(fence);
> return false;
> @@ -1020,11 +1024,20 @@ EXPORT_SYMBOL(dma_fence_wait_any_timeout);
> void dma_fence_set_deadline(struct dma_fence *fence, ktime_t deadline)
> {
> const struct dma_fence_ops *ops;
> + unsigned long flags;
>
> rcu_read_lock();
> ops = rcu_dereference(fence->ops);
> - if (ops && ops->set_deadline && !dma_fence_is_signaled(fence))
> + if (!ops || !ops->set_deadline) {
> + rcu_read_unlock();
> + return;
> + }
> +
> + dma_fence_lock_irqsave(fence, flags);
> + if (!dma_fence_is_signaled_locked(fence))
> ops->set_deadline(fence, deadline);
> +
> + dma_fence_unlock_irqrestore(fence, flags);
> rcu_read_unlock();
> }
> EXPORT_SYMBOL(dma_fence_set_deadline);
> @@ -1166,14 +1179,18 @@ EXPORT_SYMBOL(dma_fence_init64);
> */
> const char __rcu *dma_fence_driver_name(struct dma_fence *fence)
> {
> + const char __rcu *name = "detached-driver";
> const struct dma_fence_ops *ops;
> + unsigned long flags;
>
> /* RCU protection is required for safe access to returned string */
> ops = rcu_dereference(fence->ops);
> + dma_fence_lock_irqsave(fence, flags);
> if (!dma_fence_test_signaled_flag(fence))
> - return (const char __rcu *)ops->get_driver_name(fence);
> - else
> - return (const char __rcu *)"detached-driver";
> + name = ops->get_driver_name(fence);
> + dma_fence_unlock_irqrestore(fence, flags);
> +
> + return name;
> }
> EXPORT_SYMBOL(dma_fence_driver_name);
>
> @@ -1199,13 +1216,17 @@ EXPORT_SYMBOL(dma_fence_driver_name);
> */
> const char __rcu *dma_fence_timeline_name(struct dma_fence *fence)
> {
> + const char __rcu *name = "signaled-timeline";
> const struct dma_fence_ops *ops;
> + unsigned long flags;
>
> /* RCU protection is required for safe access to returned string */
> ops = rcu_dereference(fence->ops);
> + dma_fence_lock_irqsave(fence, flags);
> if (!dma_fence_test_signaled_flag(fence))
> - return (const char __rcu *)ops->get_driver_name(fence);
> - else
> - return (const char __rcu *)"signaled-timeline";
> + name = ops->get_driver_name(fence);
> + dma_fence_unlock_irqrestore(fence, flags);
> +
> + return name;
> }
> EXPORT_SYMBOL(dma_fence_timeline_name);
> diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h
> index b52ab692b22e..b93c3f7f69fb 100644
> --- a/include/linux/dma-fence.h
> +++ b/include/linux/dma-fence.h
> @@ -547,20 +547,29 @@ static inline bool
> dma_fence_is_signaled(struct dma_fence *fence)
> {
> const struct dma_fence_ops *ops;
> + unsigned long flags;
> + bool signaled;
>
> if (dma_fence_test_signaled_flag(fence))
> return true;
>
> rcu_read_lock();
> ops = rcu_dereference(fence->ops);
> - if (ops && ops->signaled && ops->signaled(fence)) {
> + if (!ops || !ops->signaled) {
> rcu_read_unlock();
> - dma_fence_signal(fence);
> - return true;
> + return false;
> }
> +
> + dma_fence_lock_irqsave(fence, flags);
> + signaled = ops->signaled(fence);
> +
> + if (signaled)
> + dma_fence_signal_locked(fence);
> +
> + dma_fence_unlock_irqrestore(fence, flags);
> rcu_read_unlock();
>
> - return false;
> + return signaled;
> }
>
> /**
On Mon, 2026-06-08 at 17:35 +0200, Christian König wrote:
> On 6/8/26 16:24, Philipp Stanner wrote:
> > The dma_fence backend_ops can access a fence. Hereby, a driver callback
> > will be running which likely will access driver specific data through
> > container_of(). If now, simultaneously, a driver signals the fence and
> > afterwards expects to run a driver specific destructor (using the same
> > data accessed through container_of()), there can be a race.
> >
> > A driver very likely trusts that once it has signaled a fence, no one
> > will be accessing it anymore. Moreover, it might already want to free up
> > resources, making UAF bugs possible.
> >
> > The race occurs because there are only pragmatic checks for the signaled
> > flag of a fence, without taking the fence lock. RCU guards exist, but
> > their purpose is to guard accesses through the backend_ops callbacks
> > against the driver (which implements the TEXT segment these callbacks
> > live in) from unloading.
> >
> > Proper synchronization can be ensured by taking the fence lock. RCU is
> > still simultaneously required to guard against the unload.
> >
> > Fix the races by taking the lock for all non-deprecated backend_ops
> > callbacks.
>
> That sounds like the fundamentally wrong approach to me.
>
> The lock protects the dma_fence signaling state and *NOT* any driver
> state, so it should not be used to protect any driver state.
>
> Drivers need to make sure that they protect their driver state with
> separate lock and don't rely on the dma_fence lock for this. This is
> actually the core of why we want to deprecate the shared dma_fence
> spinlock.
It's not so much about protecting data, it's about correctness:
A driver that calls
dma_fence_signal(f)
expects that after signalling, no callback will be running into the
driver again. It's a fix synchronization point.
Only the fence lock can grant such synchronization.
Positive effects would be:
1. Drivers can do their cleanup immediately, without having to wait for
a grace period
2. Drivers could be sure that their driver_fence data, allocated
together with fence and accessed through container_of(fence), is not
being accessed anymore.
I see only advantages. Safer, faster. :)
P.
>
> Regards,
> Christian.
>
> >
> > Conveniently, this also fixes a race where backend_ops->set_deadline()
> > might try to set a deadline for an already signaled fence.
> >
> > Suggested-by: Danilo Krummrich <dakr@kernel.org>
> > Signed-off-by: Philipp Stanner <phasta@kernel.org>
> > ---
> > We discovered this problem through our Rust abstractions, but it can
> > also occur in C.
> >
> > The by far cleanest solution seems to be to use the fence lock. This RFC
> > serves to discuss whether there is anything preventing that.
> >
> > (Patch so far just compile tested, to have some groundlayer for the
> > rough idea, to discuss it first)
> > ---
> > drivers/dma-buf/dma-fence.c | 39 ++++++++++++++++++++++++++++---------
> > include/linux/dma-fence.h | 17 ++++++++++++----
> > 2 files changed, 43 insertions(+), 13 deletions(-)
> >
> > diff --git a/drivers/dma-buf/dma-fence.c b/drivers/dma-buf/dma-fence.c
> > index c7ea1e75d38a..b74f02f3cca8 100644
> > --- a/drivers/dma-buf/dma-fence.c
> > +++ b/drivers/dma-buf/dma-fence.c
> > @@ -629,7 +629,8 @@ EXPORT_SYMBOL(dma_fence_free);
> > static bool __dma_fence_enable_signaling(struct dma_fence *fence)
> > {
> > const struct dma_fence_ops *ops;
> > - bool was_set;
> > + bool was_set, success;
> > + unsigned long flags;
> >
> > dma_fence_assert_held(fence);
> >
> > @@ -644,7 +645,10 @@ static bool __dma_fence_enable_signaling(struct dma_fence *fence)
> > if (!was_set && ops && ops->enable_signaling) {
> > trace_dma_fence_enable_signal(fence);
> >
> > - if (!ops->enable_signaling(fence)) {
> > + dma_fence_lock_irqsave(fence, flags);
> > + success = ops->enable_signaling(fence);
> > + dma_fence_unlock_irqrestore(fence, flags);
> > + if (!success) {
> > rcu_read_unlock();
> > dma_fence_signal_locked(fence);
> > return false;
> > @@ -1020,11 +1024,20 @@ EXPORT_SYMBOL(dma_fence_wait_any_timeout);
> > void dma_fence_set_deadline(struct dma_fence *fence, ktime_t deadline)
> > {
> > const struct dma_fence_ops *ops;
> > + unsigned long flags;
> >
> > rcu_read_lock();
> > ops = rcu_dereference(fence->ops);
> > - if (ops && ops->set_deadline && !dma_fence_is_signaled(fence))
> > + if (!ops || !ops->set_deadline) {
> > + rcu_read_unlock();
> > + return;
> > + }
> > +
> > + dma_fence_lock_irqsave(fence, flags);
> > + if (!dma_fence_is_signaled_locked(fence))
> > ops->set_deadline(fence, deadline);
> > +
> > + dma_fence_unlock_irqrestore(fence, flags);
> > rcu_read_unlock();
> > }
> > EXPORT_SYMBOL(dma_fence_set_deadline);
> > @@ -1166,14 +1179,18 @@ EXPORT_SYMBOL(dma_fence_init64);
> > */
> > const char __rcu *dma_fence_driver_name(struct dma_fence *fence)
> > {
> > + const char __rcu *name = "detached-driver";
> > const struct dma_fence_ops *ops;
> > + unsigned long flags;
> >
> > /* RCU protection is required for safe access to returned string */
> > ops = rcu_dereference(fence->ops);
> > + dma_fence_lock_irqsave(fence, flags);
> > if (!dma_fence_test_signaled_flag(fence))
> > - return (const char __rcu *)ops->get_driver_name(fence);
> > - else
> > - return (const char __rcu *)"detached-driver";
> > + name = ops->get_driver_name(fence);
> > + dma_fence_unlock_irqrestore(fence, flags);
> > +
> > + return name;
> > }
> > EXPORT_SYMBOL(dma_fence_driver_name);
> >
> > @@ -1199,13 +1216,17 @@ EXPORT_SYMBOL(dma_fence_driver_name);
> > */
> > const char __rcu *dma_fence_timeline_name(struct dma_fence *fence)
> > {
> > + const char __rcu *name = "signaled-timeline";
> > const struct dma_fence_ops *ops;
> > + unsigned long flags;
> >
> > /* RCU protection is required for safe access to returned string */
> > ops = rcu_dereference(fence->ops);
> > + dma_fence_lock_irqsave(fence, flags);
> > if (!dma_fence_test_signaled_flag(fence))
> > - return (const char __rcu *)ops->get_driver_name(fence);
> > - else
> > - return (const char __rcu *)"signaled-timeline";
> > + name = ops->get_driver_name(fence);
> > + dma_fence_unlock_irqrestore(fence, flags);
> > +
> > + return name;
> > }
> > EXPORT_SYMBOL(dma_fence_timeline_name);
> > diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h
> > index b52ab692b22e..b93c3f7f69fb 100644
> > --- a/include/linux/dma-fence.h
> > +++ b/include/linux/dma-fence.h
> > @@ -547,20 +547,29 @@ static inline bool
> > dma_fence_is_signaled(struct dma_fence *fence)
> > {
> > const struct dma_fence_ops *ops;
> > + unsigned long flags;
> > + bool signaled;
> >
> > if (dma_fence_test_signaled_flag(fence))
> > return true;
> >
> > rcu_read_lock();
> > ops = rcu_dereference(fence->ops);
> > - if (ops && ops->signaled && ops->signaled(fence)) {
> > + if (!ops || !ops->signaled) {
> > rcu_read_unlock();
> > - dma_fence_signal(fence);
> > - return true;
> > + return false;
> > }
> > +
> > + dma_fence_lock_irqsave(fence, flags);
> > + signaled = ops->signaled(fence);
> > +
> > + if (signaled)
> > + dma_fence_signal_locked(fence);
> > +
> > + dma_fence_unlock_irqrestore(fence, flags);
> > rcu_read_unlock();
> >
> > - return false;
> > + return signaled;
> > }
> >
> > /**
On 6/8/26 17:41, Philipp Stanner wrote:
> On Mon, 2026-06-08 at 17:35 +0200, Christian König wrote:
>> On 6/8/26 16:24, Philipp Stanner wrote:
>>> The dma_fence backend_ops can access a fence. Hereby, a driver callback
>>> will be running which likely will access driver specific data through
>>> container_of(). If now, simultaneously, a driver signals the fence and
>>> afterwards expects to run a driver specific destructor (using the same
>>> data accessed through container_of()), there can be a race.
>>>
>>> A driver very likely trusts that once it has signaled a fence, no one
>>> will be accessing it anymore. Moreover, it might already want to free up
>>> resources, making UAF bugs possible.
>>>
>>> The race occurs because there are only pragmatic checks for the signaled
>>> flag of a fence, without taking the fence lock. RCU guards exist, but
>>> their purpose is to guard accesses through the backend_ops callbacks
>>> against the driver (which implements the TEXT segment these callbacks
>>> live in) from unloading.
>>>
>>> Proper synchronization can be ensured by taking the fence lock. RCU is
>>> still simultaneously required to guard against the unload.
>>>
>>> Fix the races by taking the lock for all non-deprecated backend_ops
>>> callbacks.
>>
>> That sounds like the fundamentally wrong approach to me.
>>
>> The lock protects the dma_fence signaling state and *NOT* any driver
>> state, so it should not be used to protect any driver state.
>>
>> Drivers need to make sure that they protect their driver state with
>> separate lock and don't rely on the dma_fence lock for this. This is
>> actually the core of why we want to deprecate the shared dma_fence
>> spinlock.
>
> It's not so much about protecting data, it's about correctness:
>
> A driver that calls
>
> dma_fence_signal(f)
>
> expects that after signalling, no callback will be running into the
> driver again.
No, that is not even remotely correct.
That's why we need the RCU grace period to make sure that nobody is referencing the driver stuff any more.
DMA fence destruction has to wait for an RCU grace period for exactly the same reason as well.
If we want to cleanup I would start there. And then eventually stop calling callbacks with the fence lock held and only hold the RCU read side.
Regards,
Christian.
> It's a fix synchronization point.
>
> Only the fence lock can grant such synchronization.
>
> Positive effects would be:
>
> 1. Drivers can do their cleanup immediately, without having to wait for
> a grace period
>
> 2. Drivers could be sure that their driver_fence data, allocated
> together with fence and accessed through container_of(fence), is not
> being accessed anymore.
>
>
> I see only advantages. Safer, faster. :)
>
> P.
>
>>
>> Regards,
>> Christian.
>>
>>>
>>> Conveniently, this also fixes a race where backend_ops->set_deadline()
>>> might try to set a deadline for an already signaled fence.
>>>
>>> Suggested-by: Danilo Krummrich <dakr@kernel.org>
>>> Signed-off-by: Philipp Stanner <phasta@kernel.org>
>>> ---
>>> We discovered this problem through our Rust abstractions, but it can
>>> also occur in C.
>>>
>>> The by far cleanest solution seems to be to use the fence lock. This RFC
>>> serves to discuss whether there is anything preventing that.
>>>
>>> (Patch so far just compile tested, to have some groundlayer for the
>>> rough idea, to discuss it first)
>>> ---
>>> drivers/dma-buf/dma-fence.c | 39 ++++++++++++++++++++++++++++---------
>>> include/linux/dma-fence.h | 17 ++++++++++++----
>>> 2 files changed, 43 insertions(+), 13 deletions(-)
>>>
>>> diff --git a/drivers/dma-buf/dma-fence.c b/drivers/dma-buf/dma-fence.c
>>> index c7ea1e75d38a..b74f02f3cca8 100644
>>> --- a/drivers/dma-buf/dma-fence.c
>>> +++ b/drivers/dma-buf/dma-fence.c
>>> @@ -629,7 +629,8 @@ EXPORT_SYMBOL(dma_fence_free);
>>> static bool __dma_fence_enable_signaling(struct dma_fence *fence)
>>> {
>>> const struct dma_fence_ops *ops;
>>> - bool was_set;
>>> + bool was_set, success;
>>> + unsigned long flags;
>>>
>>> dma_fence_assert_held(fence);
>>>
>>> @@ -644,7 +645,10 @@ static bool __dma_fence_enable_signaling(struct dma_fence *fence)
>>> if (!was_set && ops && ops->enable_signaling) {
>>> trace_dma_fence_enable_signal(fence);
>>>
>>> - if (!ops->enable_signaling(fence)) {
>>> + dma_fence_lock_irqsave(fence, flags);
>>> + success = ops->enable_signaling(fence);
>>> + dma_fence_unlock_irqrestore(fence, flags);
>>> + if (!success) {
>>> rcu_read_unlock();
>>> dma_fence_signal_locked(fence);
>>> return false;
>>> @@ -1020,11 +1024,20 @@ EXPORT_SYMBOL(dma_fence_wait_any_timeout);
>>> void dma_fence_set_deadline(struct dma_fence *fence, ktime_t deadline)
>>> {
>>> const struct dma_fence_ops *ops;
>>> + unsigned long flags;
>>>
>>> rcu_read_lock();
>>> ops = rcu_dereference(fence->ops);
>>> - if (ops && ops->set_deadline && !dma_fence_is_signaled(fence))
>>> + if (!ops || !ops->set_deadline) {
>>> + rcu_read_unlock();
>>> + return;
>>> + }
>>> +
>>> + dma_fence_lock_irqsave(fence, flags);
>>> + if (!dma_fence_is_signaled_locked(fence))
>>> ops->set_deadline(fence, deadline);
>>> +
>>> + dma_fence_unlock_irqrestore(fence, flags);
>>> rcu_read_unlock();
>>> }
>>> EXPORT_SYMBOL(dma_fence_set_deadline);
>>> @@ -1166,14 +1179,18 @@ EXPORT_SYMBOL(dma_fence_init64);
>>> */
>>> const char __rcu *dma_fence_driver_name(struct dma_fence *fence)
>>> {
>>> + const char __rcu *name = "detached-driver";
>>> const struct dma_fence_ops *ops;
>>> + unsigned long flags;
>>>
>>> /* RCU protection is required for safe access to returned string */
>>> ops = rcu_dereference(fence->ops);
>>> + dma_fence_lock_irqsave(fence, flags);
>>> if (!dma_fence_test_signaled_flag(fence))
>>> - return (const char __rcu *)ops->get_driver_name(fence);
>>> - else
>>> - return (const char __rcu *)"detached-driver";
>>> + name = ops->get_driver_name(fence);
>>> + dma_fence_unlock_irqrestore(fence, flags);
>>> +
>>> + return name;
>>> }
>>> EXPORT_SYMBOL(dma_fence_driver_name);
>>>
>>> @@ -1199,13 +1216,17 @@ EXPORT_SYMBOL(dma_fence_driver_name);
>>> */
>>> const char __rcu *dma_fence_timeline_name(struct dma_fence *fence)
>>> {
>>> + const char __rcu *name = "signaled-timeline";
>>> const struct dma_fence_ops *ops;
>>> + unsigned long flags;
>>>
>>> /* RCU protection is required for safe access to returned string */
>>> ops = rcu_dereference(fence->ops);
>>> + dma_fence_lock_irqsave(fence, flags);
>>> if (!dma_fence_test_signaled_flag(fence))
>>> - return (const char __rcu *)ops->get_driver_name(fence);
>>> - else
>>> - return (const char __rcu *)"signaled-timeline";
>>> + name = ops->get_driver_name(fence);
>>> + dma_fence_unlock_irqrestore(fence, flags);
>>> +
>>> + return name;
>>> }
>>> EXPORT_SYMBOL(dma_fence_timeline_name);
>>> diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h
>>> index b52ab692b22e..b93c3f7f69fb 100644
>>> --- a/include/linux/dma-fence.h
>>> +++ b/include/linux/dma-fence.h
>>> @@ -547,20 +547,29 @@ static inline bool
>>> dma_fence_is_signaled(struct dma_fence *fence)
>>> {
>>> const struct dma_fence_ops *ops;
>>> + unsigned long flags;
>>> + bool signaled;
>>>
>>> if (dma_fence_test_signaled_flag(fence))
>>> return true;
>>>
>>> rcu_read_lock();
>>> ops = rcu_dereference(fence->ops);
>>> - if (ops && ops->signaled && ops->signaled(fence)) {
>>> + if (!ops || !ops->signaled) {
>>> rcu_read_unlock();
>>> - dma_fence_signal(fence);
>>> - return true;
>>> + return false;
>>> }
>>> +
>>> + dma_fence_lock_irqsave(fence, flags);
>>> + signaled = ops->signaled(fence);
>>> +
>>> + if (signaled)
>>> + dma_fence_signal_locked(fence);
>>> +
>>> + dma_fence_unlock_irqrestore(fence, flags);
>>> rcu_read_unlock();
>>>
>>> - return false;
>>> + return signaled;
>>> }
>>>
>>> /**
On Mon Jun 8, 2026 at 7:34 PM CEST, Christian König wrote: > That's why we need the RCU grace period to make sure that nobody is > referencing the driver stuff any more. Right, and that's what Philipp tries to address, the requirement to wait for an RCU grace period is perfectly fine if it is only about freeing memory, but it can become painful if the fence private data contains data also needs to be destructed in some way. IOW, if a driver signals a fence, it is lifecycle-wise reasonable to destruct the private data that is no longer needed (remaining users only deal with struct dma_fence) and having to wait for a full grace period adds sublety and complication that can be avoided with the proposed approach. That said, I'd like to ask the opposite question: What are the concerns with the proposed approach over (pure) RCU?
On 6/8/26 19:59, Danilo Krummrich wrote: > On Mon Jun 8, 2026 at 7:34 PM CEST, Christian König wrote: >> That's why we need the RCU grace period to make sure that nobody is >> referencing the driver stuff any more. > > Right, and that's what Philipp tries to address, the requirement to wait for an > RCU grace period is perfectly fine if it is only about freeing memory, but it > can become painful if the fence private data contains data also needs to be > destructed in some way. Yeah that makes sense. > IOW, if a driver signals a fence, it is lifecycle-wise reasonable to destruct > the private data that is no longer needed (remaining users only deal with struct > dma_fence) and having to wait for a full grace period adds sublety and > complication that can be avoided with the proposed approach. Yeah, I've run into that when I tried to make the amdgpu fences independent as well. > That said, I'd like to ask the opposite question: What are the concerns with the > proposed approach over (pure) RCU? Well a) locking inversions and b) performance. For example the reason why we have the dma_fence_is_signaled() and dma_fence_is_signaled_locked() variants is because there is a measurable difference in some specific use cases for not grabbing the locks. I personally find those micro-optimizations rather questionable, but the community agreement is that we should have them. So my take would rather be that the dma_fence_is_signaled_locked() variant goes away and we consistently call the ops pointers without holding the dma_fence lock and the driver implementations can then optionally take it if necessary. I think for this we would just need to replace most calls to dma_fence_is_signaled_locked() with dma_fence_test_signaled(). In the long term that would also allow cleaning up the container handling and simplifying the DRM scheduler a bit. Regards, Christian.
On Mon, 2026-06-08 at 20:32 +0200, Christian König wrote:
> On 6/8/26 19:59, Danilo Krummrich wrote:
> > On Mon Jun 8, 2026 at 7:34 PM CEST, Christian König wrote:
> > > That's why we need the RCU grace period to make sure that nobody
> > > is
> > > referencing the driver stuff any more.
> >
> > Right, and that's what Philipp tries to address, the requirement to
> > wait for an
> > RCU grace period is perfectly fine if it is only about freeing
> > memory, but it
> > can become painful if the fence private data contains data also
> > needs to be
> > destructed in some way.
>
> Yeah that makes sense.
>
> > IOW, if a driver signals a fence, it is lifecycle-wise reasonable
> > to destruct
> > the private data that is no longer needed (remaining users only
> > deal with struct
> > dma_fence) and having to wait for a full grace period adds sublety
> > and
> > complication that can be avoided with the proposed approach.
>
> Yeah, I've run into that when I tried to make the amdgpu fences
> independent as well.
> > That said, I'd like to ask the opposite question: What are the
> > concerns with the
> > proposed approach over (pure) RCU?
>
> For example the reason why we have the dma_fence_is_signaled() and
> dma_fence_is_signaled_locked() variants is because there is a
> measurable difference in some specific use cases for not grabbing the
> locks.
Yeah, certainly, not taking locks makes your code go faster. drm_sched
can sing a song about that -.-
>
> I personally find those micro-optimizations rather questionable, but
> the community agreement is that we should have them.
Yeah so this is most definitely broken and needs to be removed. Proof
is that various parties already need to work around that issue. You
just never know what drivers do. It's very conceivable that the Nouveau
case will often happen:
if (dma_fence_is_signaled(f))
dma_fence_put(f);
This fast path check in my mind certainly breaks the intended dma_fence
design:
void dma_fence_signal_timestamp_locked(struct dma_fence *fence,
ktime_t timestamp)
{
const struct dma_fence_ops *ops;
struct dma_fence_cb *cur, *tmp;
struct list_head cb_list;
dma_fence_assert_held(fence);
if (unlikely(test_and_set_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
&fence->flags)))
return;
Modifying the bit is consistently done under lock protection, so
reading must be done, too.
Do you remember who wanted those fast path checks? Who is spinning on
that lock?
In any case, that needs to be repaired. A hacky lockless way might
hypothetically be doable by setting barriers, as I suggest here:
https://lore.kernel.org/dri-devel/20260612104251.2264707-2-phasta@kernel.org/
But note that the ops-decoupling in e.g. dma_fence_timeline_name()
already depends on lockless ordering mechanisms in
dma_fence_signal_timestamp_locked(). So we're already a bit fragile
here.
>
> So my take would rather be that the dma_fence_is_signaled_locked()
> variant goes away and we consistently call the ops pointers without
> holding the dma_fence lock and the driver implementations can then
> optionally take it if necessary.
That might work.
>
> I think for this we would just need to replace most calls to
> dma_fence_is_signaled_locked() with dma_fence_test_signaled().
That would not fix the cleanup race in Nouveau.
I do get the idealistic idea of fence->signaled really just
representing whether the hardware is done, without any further
guarantees, but having this fast-path lockless magic in functions
checking the signaled state is really asking for trouble.
So it would seem that both dma_fence_is_signaled() and
dma_fence_test_signaled_flag() need to be properly synchronized.
P.
>
> In the long term that would also allow cleaning up the container
> handling and simplifying the DRM scheduler a bit.
>
> Regards,
> Christian.
On 6/15/26 10:29, Philipp Stanner wrote:
> On Mon, 2026-06-08 at 20:32 +0200, Christian König wrote:
>> On 6/8/26 19:59, Danilo Krummrich wrote:
>>> On Mon Jun 8, 2026 at 7:34 PM CEST, Christian König wrote:
>>>> That's why we need the RCU grace period to make sure that nobody
>>>> is
>>>> referencing the driver stuff any more.
>>>
>>> Right, and that's what Philipp tries to address, the requirement to
>>> wait for an
>>> RCU grace period is perfectly fine if it is only about freeing
>>> memory, but it
>>> can become painful if the fence private data contains data also
>>> needs to be
>>> destructed in some way.
>>
>> Yeah that makes sense.
>>
>>> IOW, if a driver signals a fence, it is lifecycle-wise reasonable
>>> to destruct
>>> the private data that is no longer needed (remaining users only
>>> deal with struct
>>> dma_fence) and having to wait for a full grace period adds sublety
>>> and
>>> complication that can be avoided with the proposed approach.
>>
>> Yeah, I've run into that when I tried to make the amdgpu fences
>> independent as well.
>>> That said, I'd like to ask the opposite question: What are the
>>> concerns with the
>>> proposed approach over (pure) RCU?
>>
>> For example the reason why we have the dma_fence_is_signaled() and
>> dma_fence_is_signaled_locked() variants is because there is a
>> measurable difference in some specific use cases for not grabbing the
>> locks.
>
> Yeah, certainly, not taking locks makes your code go faster. drm_sched
> can sing a song about that -.-
>
>>
>> I personally find those micro-optimizations rather questionable, but
>> the community agreement is that we should have them.
>
> Yeah so this is most definitely broken and needs to be removed. Proof
> is that various parties already need to work around that issue. You
> just never know what drivers do. It's very conceivable that the Nouveau
> case will often happen:
>
> if (dma_fence_is_signaled(f))
> dma_fence_put(f);
>
>
> This fast path check in my mind certainly breaks the intended dma_fence
> design:
>
> void dma_fence_signal_timestamp_locked(struct dma_fence *fence,
> ktime_t timestamp)
> {
> const struct dma_fence_ops *ops;
> struct dma_fence_cb *cur, *tmp;
> struct list_head cb_list;
>
> dma_fence_assert_held(fence);
>
> if (unlikely(test_and_set_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
> &fence->flags)))
> return;
>
>
> Modifying the bit is consistently done under lock protection, so
> reading must be done, too.
>
> Do you remember who wanted those fast path checks? Who is spinning on
> that lock?
Simona Vetter and basically the rest of the community.
And I can clearly say even if I don't like them that those optimizations are a must have.
> In any case, that needs to be repaired.
No, see my discussion with Simona on the mailing list. I need to dig that up as well, but it was around the time I added the same workaround to amdgpu.
You are basically trying what I have been suggesting as well, but there is a very wide agreement that the current design is a must have.
Regards,
Christian.
> A hacky lockless way might
> hypothetically be doable by setting barriers, as I suggest here:
>
> https://lore.kernel.org/dri-devel/20260612104251.2264707-2-phasta@kernel.org/
>
> But note that the ops-decoupling in e.g. dma_fence_timeline_name()
> already depends on lockless ordering mechanisms in
> dma_fence_signal_timestamp_locked(). So we're already a bit fragile
> here.
>
>>
>> So my take would rather be that the dma_fence_is_signaled_locked()
>> variant goes away and we consistently call the ops pointers without
>> holding the dma_fence lock and the driver implementations can then
>> optionally take it if necessary.
>
> That might work.
>
>>
>> I think for this we would just need to replace most calls to
>> dma_fence_is_signaled_locked() with dma_fence_test_signaled().
>
> That would not fix the cleanup race in Nouveau.
>
> I do get the idealistic idea of fence->signaled really just
> representing whether the hardware is done, without any further
> guarantees, but having this fast-path lockless magic in functions
> checking the signaled state is really asking for trouble.
>
> So it would seem that both dma_fence_is_signaled() and
> dma_fence_test_signaled_flag() need to be properly synchronized.
>
>
> P.
>
>>
>> In the long term that would also allow cleaning up the container
>> handling and simplifying the DRM scheduler a bit.
>>
>> Regards,
>> Christian.
On Mon, 2026-06-15 at 11:57 +0200, Christian König wrote:
> On 6/15/26 10:29, Philipp Stanner wrote:
> >
> > This fast path check in my mind certainly breaks the intended dma_fence
> > design:
> >
> > void dma_fence_signal_timestamp_locked(struct dma_fence *fence,
> > ktime_t timestamp)
> > {
> > const struct dma_fence_ops *ops;
> > struct dma_fence_cb *cur, *tmp;
> > struct list_head cb_list;
> >
> > dma_fence_assert_held(fence);
> >
> > if (unlikely(test_and_set_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
> > &fence->flags)))
> > return;
> >
> >
> > Modifying the bit is consistently done under lock protection, so
> > reading must be done, too.
> >
> > Do you remember who wanted those fast path checks? Who is spinning on
> > that lock?
>
> Simona Vetter and basically the rest of the community.
I did some git-blame and it would seem to me that this fast-path hack
was added in
e941759c74a4 fence: dma-buf cross-device synchronization (v18)
in 2014 A.D.
So it was there from the very beginning and was not added because there
was a performance bottleneck later. It is conceivable that a
performance issue was present from the get go, of course.
>
> And I can clearly say even if I don't like them that those
> optimizations are a must have.
>
> > In any case, that needs to be repaired.
>
> No, see my discussion with Simona on the mailing list. I need to dig
> that up as well, but it was around the time I added the same
> workaround to amdgpu.
>
> You are basically trying what I have been suggesting as well, but
> there is a very wide agreement that the current design is a must
> have.
I suggest at least three things:
A.
Very explicitly document all lockless mechanisms and their
justification in both code comments and commit messages.
* We need to document lockless magic *drastically* better in DRM. I
see code left and right where there is some barrier with the comment
simply being "so list_empty() works without a lock".
* The commit message needs to justify why a lock is missing, why this
is the preferred solution, why it is correct. The latter also needs
to be in a code comment.
* Note that WRITE_ONCE() is not only about volatile, but also about
"watch out, here is a lockless access!", as Linus pointed out
repeatedly.
B.
I think rejecting ideas with "we tried this, it >>didn't work<<" is not
a valid reason for refusing an idea. Point A above helps with that. If
your commit message contains measurements or links to tickets with
*real life* performance regressions (microbenchmarks are invalid), that
helps reducing discussion overhead drastically.
Now, in this particular case, I fail to see how taking the spinlock to
check that bit is evil. If it regresses someone's speed that much, it
would mean that someone is heavily punching that lock, like polling
24/7 with dma_fence_is_signaled().
Again, having that use case documented somewhere could save us all time
– especially for you, Christian, since you wouldn't be forced to have
the same discussion over and over again over the years ;-)
C.
Robustness and correctness always trump performance. They especially
trump microbenchmarks.
P.
On 6/16/26 13:25, Philipp Stanner wrote:
> On Mon, 2026-06-15 at 11:57 +0200, Christian König wrote:
>> On 6/15/26 10:29, Philipp Stanner wrote:
>>>
>>> This fast path check in my mind certainly breaks the intended dma_fence
>>> design:
>>>
>>> void dma_fence_signal_timestamp_locked(struct dma_fence *fence,
>>> ktime_t timestamp)
>>> {
>>> const struct dma_fence_ops *ops;
>>> struct dma_fence_cb *cur, *tmp;
>>> struct list_head cb_list;
>>>
>>> dma_fence_assert_held(fence);
>>>
>>> if (unlikely(test_and_set_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
>>> &fence->flags)))
>>> return;
>>>
>>>
>>> Modifying the bit is consistently done under lock protection, so
>>> reading must be done, too.
>>>
>>> Do you remember who wanted those fast path checks? Who is spinning on
>>> that lock?
>>
>> Simona Vetter and basically the rest of the community.
>
> I did some git-blame and it would seem to me that this fast-path hack
> was added in
>
> e941759c74a4 fence: dma-buf cross-device synchronization (v18)
>
> in 2014 A.D.
>
> So it was there from the very beginning and was not added because there
> was a performance bottleneck later. It is conceivable that a
> performance issue was present from the get go, of course.
Not saying that it wasn't there from the beginning, but multiple people (including me) have tried to improve the situation and that was either immediately reverted or directly rejected.
>>
>> And I can clearly say even if I don't like them that those
>> optimizations are a must have.
>>
>>> In any case, that needs to be repaired.
>>
>> No, see my discussion with Simona on the mailing list. I need to dig
>> that up as well, but it was around the time I added the same
>> workaround to amdgpu.
>>
>> You are basically trying what I have been suggesting as well, but
>> there is a very wide agreement that the current design is a must
>> have.
>
> I suggest at least three things:
>
>
> A.
>
> Very explicitly document all lockless mechanisms and their
> justification in both code comments and commit messages.
>
> * We need to document lockless magic *drastically* better in DRM. I
> see code left and right where there is some barrier with the comment
> simply being "so list_empty() works without a lock".
> * The commit message needs to justify why a lock is missing, why this
> is the preferred solution, why it is correct. The latter also needs
> to be in a code comment.
> * Note that WRITE_ONCE() is not only about volatile, but also about
> "watch out, here is a lockless access!", as Linus pointed out
> repeatedly.
Completely agree. Question is who has time for that?
> B.
>
> I think rejecting ideas with "we tried this, it >>didn't work<<" is not
> a valid reason for refusing an idea. Point A above helps with that. If
> your commit message contains measurements or links to tickets with
> *real life* performance regressions (microbenchmarks are invalid), that
> helps reducing discussion overhead drastically.
>
> Now, in this particular case, I fail to see how taking the spinlock to
> check that bit is evil. If it regresses someone's speed that much, it
> would mean that someone is heavily punching that lock, like polling
> 24/7 with dma_fence_is_signaled().
I think (but I'm not 100% sure) the the problem is that taking the spinlock introduces a write to the cache line it is in.
At the moment when a fence is signaled a read is enough to check that state, so what happens is that the cache line for the signaled bit sooner or later end up in all CPU caches.
When you start to use the spinlock the cache line backing that plays ping/pong between all the CPU cores and that is something which always stalls each CPU when it needs to acquire the cache line. Keep in mind that on a modern box you can calculate like a 4x4 matrix in the same time you solve a cache miss.
This is especially important for the stub fence which is used by basically all cores at the same time whenever you need a signaled dummy.
> Again, having that use case documented somewhere could save us all time
> – especially for you, Christian, since you wouldn't be forced to have
> the same discussion over and over again over the years ;-)
Well I could also send out all the DMA-buf resilient patches/ideas I came up with over the years once more.
I think you will potentially agree to some of them immediately.
> C.
> Robustness and correctness always trump performance. They especially
> trump microbenchmarks.
Yeah agree completely as well. We should also in general not optimize for unrealistic use cases.
For example I had to reject multiple attempts to optimize the dma_fence_chain container for something which only happens in test cases.
Regards,
Christian.
>
>
> P.
On Wed Jun 17, 2026 at 10:46 AM BST, Christian König wrote: > On 6/16/26 13:25, Philipp Stanner wrote: >> >> I think rejecting ideas with "we tried this, it >>didn't work<<" is not >> a valid reason for refusing an idea. Point A above helps with that. If >> your commit message contains measurements or links to tickets with >> *real life* performance regressions (microbenchmarks are invalid), that >> helps reducing discussion overhead drastically. >> >> Now, in this particular case, I fail to see how taking the spinlock to >> check that bit is evil. If it regresses someone's speed that much, it >> would mean that someone is heavily punching that lock, like polling >> 24/7 with dma_fence_is_signaled(). > > I think (but I'm not 100% sure) the the problem is that taking the spinlock > introduces a write to the cache line it is in. > > At the moment when a fence is signaled a read is enough to check that state, > so what happens is that the cache line for the signaled bit sooner or later > end up in all CPU caches. > > When you start to use the spinlock the cache line backing that plays ping/pong > between all the CPU cores and that is something which always stalls each CPU > when it needs to acquire the cache line. Keep in mind that on a modern box you > can calculate like a 4x4 matrix in the same time you solve a cache miss. > > This is especially important for the stub fence which is used by basically all > cores at the same time whenever you need a signaled dummy. > This sounds like an area where hazard pointers can help. Like RCU the reader side is lock-free. And for the specific case of signaled state where it is only going one direction, hazard pointer is also wait-free because it does not need to loop until the state is stable. The reclaim side just needs to wait for all reader to exit their critical section, unless RCU where it needs to wait for a full grace period. That said, the reclaim waiter still must not hold any locks (or other resources) that the reader side critical section can take. So you still got a variant of the lock inversion problem to avoid. Best, Gary
On Wed, 2026-06-17 at 11:46 +0200, Christian König wrote: > On 6/16/26 13:25, Philipp Stanner wrote: > > > > So it was there from the very beginning and was not added because there > > was a performance bottleneck later. It is conceivable that a > > performance issue was present from the get go, of course. > > Not saying that it wasn't there from the beginning, but multiple > people (including me) have tried to improve the situation and that > was either immediately reverted or directly rejected. > > > > > > > And I can clearly say even if I don't like them that those > > > optimizations are a must have. > > > > > > > In any case, that needs to be repaired. > > > > > > No, see my discussion with Simona on the mailing list. I need to dig > > > that up as well, but it was around the time I added the same > > > workaround to amdgpu. > > > > > > You are basically trying what I have been suggesting as well, but > > > there is a very wide agreement that the current design is a must > > > have. > > > > > > * We need to document lockless magic *drastically* better in DRM. I > > see code left and right where there is some barrier with the comment > > simply being "so list_empty() works without a lock". > > * The commit message needs to justify why a lock is missing, why this > > is the preferred solution, why it is correct. The latter also needs > > to be in a code comment. > > * Note that WRITE_ONCE() is not only about volatile, but also about > > "watch out, here is a lockless access!", as Linus pointed out > > repeatedly. > > Completely agree. Question is who has time for that? We / the maintainers of the respective systems need to heavily encourage that :) The good news btw is that many things are moving towards a good direction in recent past, as far as I have seen > > > B. > > > > I think rejecting ideas with "we tried this, it >>didn't work<<" is not > > a valid reason for refusing an idea. Point A above helps with that. If > > your commit message contains measurements or links to tickets with > > *real life* performance regressions (microbenchmarks are invalid), that > > helps reducing discussion overhead drastically. > > > > Now, in this particular case, I fail to see how taking the spinlock to > > check that bit is evil. If it regresses someone's speed that much, it > > would mean that someone is heavily punching that lock, like polling > > 24/7 with dma_fence_is_signaled(). > > I think (but I'm not 100% sure) the the problem is that taking the > spinlock introduces a write to the cache line it is in. > > At the moment when a fence is signaled a read is enough to check that > state, so what happens is that the cache line for the signaled bit > sooner or later end up in all CPU caches. > > When you start to use the spinlock the cache line backing that plays > ping/pong between all the CPU cores and that is something which > always stalls each CPU when it needs to acquire the cache line. Keep > in mind that on a modern box you can calculate like a 4x4 matrix in > the same time you solve a cache miss. > > This is especially important for the stub fence which is used by > basically all cores at the same time whenever you need a signaled > dummy. Alright, that sort of sounds logical, I guess. So the argument basically is that if we'd try to lock that, someone would immediately report real and massive performance regressions leading to a revert. I think last time you mentioned that memory footprint is less of a concern for dma_fence than cache lines. Out of interest: has anyone ever experimented with more padding to prevent spinners from shooting down other CPUs cache lines? Since you're the maintainer of dma-buf, what would you wish we do? Would you be at least OK with the memory barrier approach to make the API a bit more robust? AFAIU the barriers will not cause a cache line invalidation. > > > Again, having that use case documented somewhere could save us all time > > – especially for you, Christian, since you wouldn't be forced to have > > the same discussion over and over again over the years ;-) > > Well I could also send out all the DMA-buf resilient patches/ideas I came up with over the years once more. Maybe we could have sort of a wiki in Documentation/ with links to relevant mail threads and some explanations of why things are the way they are? btw, is there a dma-buf TODO list like for DRM in general? There are many passionate hackers who love challenges. We could certainly add a few "Difficulty: hard" entries for a few controversial potential reworks. P.
On 6/17/26 12:16, Philipp Stanner wrote: > On Wed, 2026-06-17 at 11:46 +0200, Christian König wrote: ... >> >>> B. >>> >>> I think rejecting ideas with "we tried this, it >>didn't work<<" is not >>> a valid reason for refusing an idea. Point A above helps with that. If >>> your commit message contains measurements or links to tickets with >>> *real life* performance regressions (microbenchmarks are invalid), that >>> helps reducing discussion overhead drastically. >>> >>> Now, in this particular case, I fail to see how taking the spinlock to >>> check that bit is evil. If it regresses someone's speed that much, it >>> would mean that someone is heavily punching that lock, like polling >>> 24/7 with dma_fence_is_signaled(). >> >> I think (but I'm not 100% sure) the the problem is that taking the >> spinlock introduces a write to the cache line it is in. >> >> At the moment when a fence is signaled a read is enough to check that >> state, so what happens is that the cache line for the signaled bit >> sooner or later end up in all CPU caches. >> >> When you start to use the spinlock the cache line backing that plays >> ping/pong between all the CPU cores and that is something which >> always stalls each CPU when it needs to acquire the cache line. Keep >> in mind that on a modern box you can calculate like a 4x4 matrix in >> the same time you solve a cache miss. >> >> This is especially important for the stub fence which is used by >> basically all cores at the same time whenever you need a signaled >> dummy. > > Alright, that sort of sounds logical, I guess. So the argument > basically is that if we'd try to lock that, someone would immediately > report real and massive performance regressions leading to a revert. > > I think last time you mentioned that memory footprint is less of a > concern for dma_fence than cache lines. Out of interest: has anyone > ever experimented with more padding to prevent spinners from shooting > down other CPUs cache lines? How would that work in this case? I mean as long as you have the same variable (spinlock) you have the same cache line no matter how you pad. > Since you're the maintainer of dma-buf, what would you wish we do? Try to improve the documentation by sending out patches. I will send out my ideas for resilient improvements and we then discuss on the patches. > Would you be at least OK with the memory barrier approach to make the > API a bit more robust? AFAIU the barriers will not cause a cache line > invalidation. What exactly do you mean with that? The test_bit() and set_bit() are already memory barriers as far as I know. >> >>> Again, having that use case documented somewhere could save us all time >>> – especially for you, Christian, since you wouldn't be forced to have >>> the same discussion over and over again over the years ;-) >> >> Well I could also send out all the DMA-buf resilient patches/ideas I came up with over the years once more. > > Maybe we could have sort of a wiki in Documentation/ with links to > relevant mail threads and some explanations of why things are the way > they are? I think some AI analyzing the mailing list and noting when some ideas repeat would help. At least for me maintaining some kind of Wiki additional to my current workload wouldn't be possible at all. > btw, is there a dma-buf TODO list like for DRM in general? No, not that I know of. We used to have minor cleanup tasks on the DRM TODOs, but those were already taken by somebody. Regards, Christian. > > There are many passionate hackers who love challenges. We could > certainly add a few "Difficulty: hard" entries for a few controversial > potential reworks. > > > P.
On Wed, 2026-06-17 at 15:03 +0200, Christian König wrote:
> On 6/17/26 12:16, Philipp Stanner wrote:
> > On Wed, 2026-06-17 at 11:46 +0200, Christian König wrote:
> ...
> > >
> > > > B.
> > > >
> > > > I think rejecting ideas with "we tried this, it >>didn't work<<" is not
> > > > a valid reason for refusing an idea. Point A above helps with that. If
> > > > your commit message contains measurements or links to tickets with
> > > > *real life* performance regressions (microbenchmarks are invalid), that
> > > > helps reducing discussion overhead drastically.
> > > >
> > > > Now, in this particular case, I fail to see how taking the spinlock to
> > > > check that bit is evil. If it regresses someone's speed that much, it
> > > > would mean that someone is heavily punching that lock, like polling
> > > > 24/7 with dma_fence_is_signaled().
> > >
> > > I think (but I'm not 100% sure) the the problem is that taking the
> > > spinlock introduces a write to the cache line it is in.
> > >
> > > At the moment when a fence is signaled a read is enough to check that
> > > state, so what happens is that the cache line for the signaled bit
> > > sooner or later end up in all CPU caches.
> > >
> > > When you start to use the spinlock the cache line backing that plays
> > > ping/pong between all the CPU cores and that is something which
> > > always stalls each CPU when it needs to acquire the cache line. Keep
> > > in mind that on a modern box you can calculate like a 4x4 matrix in
> > > the same time you solve a cache miss.
> > >
> > > This is especially important for the stub fence which is used by
> > > basically all cores at the same time whenever you need a signaled
> > > dummy.
> >
> > Alright, that sort of sounds logical, I guess. So the argument
> > basically is that if we'd try to lock that, someone would immediately
> > report real and massive performance regressions leading to a revert.
> >
> > I think last time you mentioned that memory footprint is less of a
> > concern for dma_fence than cache lines. Out of interest: has anyone
> > ever experimented with more padding to prevent spinners from shooting
> > down other CPUs cache lines?
>
> How would that work in this case? I mean as long as you have the same
> variable (spinlock) you have the same cache line no matter how you
> pad.
Ah, gotcha. I was talking more in general. Sometimes you have
situations like:
struct foo {
spinlock_t lock;
// place padding to fill up a cache line here?
u8 data[];
} bar;
// thread A
lock(bar->lock);
do_sth(bar->data); // works on `data`
unlock(bar->lock);
// thread B
lock(bar->lock); // might invalidate the cache line the beginning of `data` lives in
That wouldn't solve the spinlock-issue; but I've been interested in a
while in whether the above has been an observed problem.
>
> > Since you're the maintainer of dma-buf, what would you wish we do?
>
> Try to improve the documentation by sending out patches. I will send out my ideas for resilient improvements and we then discuss on the patches.
>
> > Would you be at least OK with the memory barrier approach to make the
> > API a bit more robust? AFAIU the barriers will not cause a cache line
> > invalidation.
>
> What exactly do you mean with that? The test_bit() and set_bit() are
> already memory barriers as far as I know.
I'm talking about whether we could enforce that the bit is only set
once the callbacks have completed, so someone who wants to drop his
reference once dma_fence_is_signaled() returns true doesn't cause a
UAF. You didn't answer here:
https://lore.kernel.org/dri-devel/dca171cea556c3f3de3a86f735eeb53335cd3f49.camel@mailbox.org/
>
> > >
> > > > Again, having that use case documented somewhere could save us all time
> > > > – especially for you, Christian, since you wouldn't be forced to have
> > > > the same discussion over and over again over the years ;-)
> > >
> > > Well I could also send out all the DMA-buf resilient patches/ideas I came up with over the years once more.
> >
> > Maybe we could have sort of a wiki in Documentation/ with links to
> > relevant mail threads and some explanations of why things are the way
> > they are?
>
> I think some AI analyzing the mailing list and noting when some ideas
> repeat would help.
Such tools should be used sparingly. Notably they can contribute to
contributor-frustration.
>
> At least for me maintaining some kind of Wiki additional to my
> current workload wouldn't be possible at all.
A simple file with some links could be enough. But was just an idea, no
hard feelings about it.
P.
On Mon Jun 8, 2026 at 8:32 PM CEST, Christian König wrote: > On 6/8/26 19:59, Danilo Krummrich wrote: >> On Mon Jun 8, 2026 at 7:34 PM CEST, Christian König wrote: >>> That's why we need the RCU grace period to make sure that nobody is >>> referencing the driver stuff any more. >> >> Right, and that's what Philipp tries to address, the requirement to wait for an >> RCU grace period is perfectly fine if it is only about freeing memory, but it >> can become painful if the fence private data contains data also needs to be >> destructed in some way. > > Yeah that makes sense. > >> IOW, if a driver signals a fence, it is lifecycle-wise reasonable to destruct >> the private data that is no longer needed (remaining users only deal with struct >> dma_fence) and having to wait for a full grace period adds sublety and >> complication that can be avoided with the proposed approach. > > Yeah, I've run into that when I tried to make the amdgpu fences independent as well. >> That said, I'd like to ask the opposite question: What are the concerns with the >> proposed approach over (pure) RCU? > > Well a) locking inversions and b) performance. > > For example the reason why we have the dma_fence_is_signaled() and > dma_fence_is_signaled_locked() variants is because there is a measurable > difference in some specific use cases for not grabbing the locks. I checked for this as well, but couldn't find a case where dma_fence_is_signaled() is used in a way where it would be performance critical to avoid the lock in any way. Note that the lock is only bypassed when the fence is signaled already (this would be preserved) and if signaled() returns false, i.e. dma_fence_signal() will take the lock anyways. > I personally find those micro-optimizations rather questionable, but the > community agreement is that we should have them. I agree, it is rather questionable. So, I wouldn't make this the deciding factor unless someone can present a valid case where it actually matters. > So my take would rather be that the dma_fence_is_signaled_locked() variant > goes away and we consistently call the ops pointers without holding the > dma_fence lock and the driver implementations can then optionally take it if > necessary. How did you get to this conclusion considering that you run into what I mentioned above as well and the fact that we seem to agree that the performance concern is rather questionable?
On 6/8/26 20:39, Danilo Krummrich wrote: > On Mon Jun 8, 2026 at 8:32 PM CEST, Christian König wrote: >> On 6/8/26 19:59, Danilo Krummrich wrote: >>> On Mon Jun 8, 2026 at 7:34 PM CEST, Christian König wrote: >>>> That's why we need the RCU grace period to make sure that nobody is >>>> referencing the driver stuff any more. >>> >>> Right, and that's what Philipp tries to address, the requirement to wait for an >>> RCU grace period is perfectly fine if it is only about freeing memory, but it >>> can become painful if the fence private data contains data also needs to be >>> destructed in some way. >> >> Yeah that makes sense. >> >>> IOW, if a driver signals a fence, it is lifecycle-wise reasonable to destruct >>> the private data that is no longer needed (remaining users only deal with struct >>> dma_fence) and having to wait for a full grace period adds sublety and >>> complication that can be avoided with the proposed approach. >> >> Yeah, I've run into that when I tried to make the amdgpu fences independent as well. >>> That said, I'd like to ask the opposite question: What are the concerns with the >>> proposed approach over (pure) RCU? >> >> Well a) locking inversions and b) performance. >> >> For example the reason why we have the dma_fence_is_signaled() and >> dma_fence_is_signaled_locked() variants is because there is a measurable >> difference in some specific use cases for not grabbing the locks. > > I checked for this as well, but couldn't find a case where > dma_fence_is_signaled() is used in a way where it would be performance critical > to avoid the lock in any way. > > Note that the lock is only bypassed when the fence is signaled already (this > would be preserved) and if signaled() returns false, i.e. dma_fence_signal() > will take the lock anyways. > >> I personally find those micro-optimizations rather questionable, but the >> community agreement is that we should have them. > > I agree, it is rather questionable. So, I wouldn't make this the deciding factor > unless someone can present a valid case where it actually matters. > >> So my take would rather be that the dma_fence_is_signaled_locked() variant >> goes away and we consistently call the ops pointers without holding the >> dma_fence lock and the driver implementations can then optionally take it if >> necessary. > > How did you get to this conclusion considering that you run into what I > mentioned above as well and the fact that we seem to agree that the performance > concern is rather questionable? Quite simple, it's the cleaner approach. Calling callbacks with locks held is rather questionable even putting the performance issue aside. In detail calling the callbacks without holding locks allows all implementations who need it to explicitly take locks in the order they want. If you call it with the lock held you enforce the fence lock the be the outermost lock. Regards, Christian.
+Cc Dave
On Mon, 2026-06-08 at 20:47 +0200, Christian König wrote:
> On 6/8/26 20:39, Danilo Krummrich wrote:
> > How did you get to this conclusion considering that you run into what I
> > mentioned above as well and the fact that we seem to agree that the performance
> > concern is rather questionable?
>
> Quite simple, it's the cleaner approach.
>
> Calling callbacks with locks held is rather questionable even putting the performance issue aside.
I'm right now going through all fence users to see whether we can
implement my solution.
And look what I found:
static inline bool
nouveau_cli_work_ready(struct dma_fence *fence)
{
unsigned long flags;
bool ret = true;
dma_fence_lock_irqsave(fence, flags);
if (!dma_fence_is_signaled_locked(fence))
ret = false;
dma_fence_unlock_irqrestore(fence, flags);
if (ret == true)
dma_fence_put(fence);
return ret;
}
That looks weird, doesn't it?
We do some git-blame:
c8a5d5ea3ba6a18958f8d76430e4cd68eea33943
and we find that it's Dave who wrote that code, because
"
My analysis: two threads are running, one in the irq signalling the
fence, in dma_fence_signal_timestamp_locked, it has done the
DMA_FENCE_FLAG_SIGNALLED_BIT setting, but hasn't yet reached the
callbacks.
The second thread in nouveau_cli_work_ready, where it sees the fence is
signalled, so then puts the fence, cleanups the object and frees the
work item, which contains the callback.
Thread one goes again and tries to call the callback and causes the
use-after-free.
"
So this is a race, caused by lockless speed optimization.
And it would further seem that there is one invention in computer
science that can prevent such races:
Locks.
There is no cleaner, safer synchronization strategy in computer science
than locking.
This race became possible because the lock does not guard the entirety
of dma_fence_is_signaled().
Wouldn't you agree that this is a strong indicator for the great
advantages that consequent and consistent lock-protection grants? IOW,
by using locks more strictly in dma_fence, we can increase its
robustness and reliability.
P.
On 09/06/2026 14:19, Philipp Stanner wrote:
> +Cc Dave
>
> On Mon, 2026-06-08 at 20:47 +0200, Christian König wrote:
>> On 6/8/26 20:39, Danilo Krummrich wrote:
>>> How did you get to this conclusion considering that you run into what I
>>> mentioned above as well and the fact that we seem to agree that the performance
>>> concern is rather questionable?
>>
>> Quite simple, it's the cleaner approach.
>>
>> Calling callbacks with locks held is rather questionable even putting the performance issue aside.
>
>
> I'm right now going through all fence users to see whether we can
> implement my solution.
>
> And look what I found:
>
> static inline bool
> nouveau_cli_work_ready(struct dma_fence *fence)
> {
> unsigned long flags;
> bool ret = true;
>
> dma_fence_lock_irqsave(fence, flags);
> if (!dma_fence_is_signaled_locked(fence))
> ret = false;
> dma_fence_unlock_irqrestore(fence, flags);
>
> if (ret == true)
> dma_fence_put(fence);
> return ret;
> }
>
>
> That looks weird, doesn't it?
>
>
> We do some git-blame:
>
> c8a5d5ea3ba6a18958f8d76430e4cd68eea33943
>
> and we find that it's Dave who wrote that code, because
>
> "
> My analysis: two threads are running, one in the irq signalling the
> fence, in dma_fence_signal_timestamp_locked, it has done the
> DMA_FENCE_FLAG_SIGNALLED_BIT setting, but hasn't yet reached the
> callbacks.
>
> The second thread in nouveau_cli_work_ready, where it sees the fence is
> signalled, so then puts the fence, cleanups the object and frees the
> work item, which contains the callback.
>
> Thread one goes again and tries to call the callback and causes the
> use-after-free.
> "
>
>
> So this is a race, caused by lockless speed optimization.
>
> And it would further seem that there is one invention in computer
> science that can prevent such races:
>
> Locks.
>
> There is no cleaner, safer synchronization strategy in computer science
> than locking.
>
> This race became possible because the lock does not guard the entirety
> of dma_fence_is_signaled().
One could also argue the problem was caused by sharing the fence
reference without holding it, anyway, I wanted to ask something else.
What happened to the idea to remove opportunistic signalling from
dma_fence_is_signaled?
Regards,
Tvrtko
>
> Wouldn't you agree that this is a strong indicator for the great
> advantages that consequent and consistent lock-protection grants? IOW,
> by using locks more strictly in dma_fence, we can increase its
> robustness and reliability.
>
>
> P.
On Tue, 2026-06-09 at 14:36 +0100, Tvrtko Ursulin wrote: > I wanted to ask something else. > What happened to the idea to remove opportunistic signalling from > dma_fence_is_signaled? AFAIAR that was declared impossible because some system setups like mobile devices don't signal the fences with an interrupt handler, but have some sort of daemon (in userspace) who peridiocally wakes up to update the graphics output. It wakes up every T milliseconds and opportunistically asks through dma_fence_is_signaled() (and that through ops->signaled() which fences are already signaled. P.
On 6/9/26 15:57, Philipp Stanner wrote: > On Tue, 2026-06-09 at 14:36 +0100, Tvrtko Ursulin wrote: >> I wanted to ask something else. >> What happened to the idea to remove opportunistic signalling from >> dma_fence_is_signaled? > > AFAIAR that was declared impossible because some system setups like > mobile devices don't signal the fences with an interrupt handler, but > have some sort of daemon (in userspace) who peridiocally wakes up to > update the graphics output. It wakes up every T milliseconds and > opportunistically asks through dma_fence_is_signaled() (and that > through ops->signaled() which fences are already signaled. That's a pretty good summary we should probably add it to the documentation. Just one additional note: The userspace deamon is usually the compositor who is in control of the CRTCs and the interrupt is the periodical refresh timer, usually vblank. Christian. > > P.
On 6/9/26 15:19, Philipp Stanner wrote:
> +Cc Dave
>
> On Mon, 2026-06-08 at 20:47 +0200, Christian König wrote:
>> On 6/8/26 20:39, Danilo Krummrich wrote:
>>> How did you get to this conclusion considering that you run into what I
>>> mentioned above as well and the fact that we seem to agree that the performance
>>> concern is rather questionable?
>>
>> Quite simple, it's the cleaner approach.
>>
>> Calling callbacks with locks held is rather questionable even putting the performance issue aside.
>
>
> I'm right now going through all fence users to see whether we can
> implement my solution.
>
> And look what I found:
>
> static inline bool
> nouveau_cli_work_ready(struct dma_fence *fence)
> {
> unsigned long flags;
> bool ret = true;
>
> dma_fence_lock_irqsave(fence, flags);
> if (!dma_fence_is_signaled_locked(fence))
> ret = false;
> dma_fence_unlock_irqrestore(fence, flags);
>
> if (ret == true)
> dma_fence_put(fence);
> return ret;
> }
>
>
> That looks weird, doesn't it?
No, that is pretty much expected.
This issue results because of the lock inversion/cleanup race between nouveau_fence_chan->lock and dropping the last reference.
That in turn is caused by the fact that enable_signaling is called with the fence lock held and delegates the signaling to the caller instead of doing it itself.
This in turn means that you can't do proper cleanup after the signaling is done by grabbing driver specific locks.
This is *exactly* the problem I'm trying to prevent here.
When the callbacks wouldn't be called with the fence lock held the Nouveau nouveau_fence_chan->lock and the fence lock would be completely independent.
This results in much better cleanup paths, fences which are independent of their issuers and in general much simpler handling for all dma_fence implementation backends because we don't need to worry all the time about lock inversions between the fence lock and internal driver locks.
So as far as I can see what you suggest here is exactly what has caused all the problems in the first place.
For the cleanup path in Rust you should be trivially able to use call_rcu() if the synchronized cleanup path would be causing issues (which I clearly agree on).
Regards,
Christian.
>
>
> We do some git-blame:
>
> c8a5d5ea3ba6a18958f8d76430e4cd68eea33943
>
> and we find that it's Dave who wrote that code, because
>
> "
> My analysis: two threads are running, one in the irq signalling the
> fence, in dma_fence_signal_timestamp_locked, it has done the
> DMA_FENCE_FLAG_SIGNALLED_BIT setting, but hasn't yet reached the
> callbacks.
>
> The second thread in nouveau_cli_work_ready, where it sees the fence is
> signalled, so then puts the fence, cleanups the object and frees the
> work item, which contains the callback.
>
> Thread one goes again and tries to call the callback and causes the
> use-after-free.
> "
>
>
> So this is a race, caused by lockless speed optimization.
>
> And it would further seem that there is one invention in computer
> science that can prevent such races:
>
> Locks.
>
> There is no cleaner, safer synchronization strategy in computer science
> than locking.
>
> This race became possible because the lock does not guard the entirety
> of dma_fence_is_signaled().
>
> Wouldn't you agree that this is a strong indicator for the great
> advantages that consequent and consistent lock-protection grants? IOW,
> by using locks more strictly in dma_fence, we can increase its
> robustness and reliability.
>
>
> P.
On Tue, 2026-06-09 at 15:34 +0200, Christian König wrote:
> On 6/9/26 15:19, Philipp Stanner wrote:
> > And look what I found:
> >
> > static inline bool
> > nouveau_cli_work_ready(struct dma_fence *fence)
> > {
> > unsigned long flags;
> > bool ret = true;
> >
> > dma_fence_lock_irqsave(fence, flags);
> > if (!dma_fence_is_signaled_locked(fence))
> > ret = false;
> > dma_fence_unlock_irqrestore(fence, flags);
> >
> > if (ret == true)
> > dma_fence_put(fence);
> > return ret;
> > }
> >
> >
> > That looks weird, doesn't it?
>
> No, that is pretty much expected.
>
> This issue results because of the lock inversion/cleanup race between
> nouveau_fence_chan->lock and dropping the last reference.
>
> That in turn is caused by the fact that enable_signaling is called
> with the fence lock held
Are you referring to this comment from the documentation?
* Since many implementations can call dma_fence_signal() even when before
* @enable_signaling has been called there's a race window, where the
* dma_fence_signal() might result in the final fence reference being
* released and its memory freed. To avoid this, implementations of this
* callback should grab their own reference using dma_fence_get(), to be
* released when the fence is signalled (through e.g. the interrupt
* handler).
*
* This callback is optional. If this callback is not present, then the
* driver must always have signaling enabled.
*/
bool (*enable_signaling)(struct dma_fence *fence);
> and delegates the signaling to the caller instead of doing it itself.
Who delegates what to whom?
enable_signaling() is called indirectly by someone who adds a callback.
If enable_signaling() is implemented, then the driver callback's only
job is to activate some sort of interrupt or worker that will signal
that fence at some point.
> This in turn means that you can't do proper cleanup after the
> signaling is done by grabbing driver specific locks.
Sure you can. If everything is properly synchronized.
// driver
dma_fence_signal(f);
// all callbacks can't reach our driver anymore
struct driver_fence = container_of(f);
lock(driver_fence->special_lock);
cleanup(f);
unlock(…);
Where is the deadlock?
I continue to fail to see it. Show me an example of some code that
would deadlock, please. Either fictive or from the kernel.
Nouveau's enable_signaling() callback does not take locks. Its
signaled() does not take locks.
I went through all implementors of signaled() and found that only xe
and nouveau invoke some function pointer that we want to investigate
closer:
User Way signaled() operates
-----------------------------------------------
amdgpu_userq_fence.c lockless
etnaviv lockless
i915 lockless
nouveau lockless?
radeon lockless
vc4 lockless
xe lockless?
Who will be deadlocking and with which locks?
> This is *exactly* the problem I'm trying to prevent here.
>
> When the callbacks wouldn't be called with the fence lock held the
> Nouveau nouveau_fence_chan->lock and the fence lock would be
> completely independent.
You seem to circle around the idea that Nouveau's fence list is
protected with the shared lock?
That issue, if it exists, is solvable through the embedded lock.
Anyways, there is no issue. Neither Nouveau nor other drivers will
deadlock if we call ops->signaled() with the fence lock around.
>
> This results in much better cleanup paths, fences which are
> independent of their issuers and in general much simpler handling for
> all dma_fence implementation backends because we don't need to worry
> all the time about lock inversions between the fence lock and
> internal driver locks.
>
> So as far as I can see what you suggest here is exactly what has
> caused all the problems in the first place.
>
> For the cleanup path in Rust you should be trivially able to use
> call_rcu() if the synchronized cleanup path would be causing issues
> (which I clearly agree on).
You can *satisfy the current dma_fence API* with call_rcu(), but as
Danilo states delaying work always has subtleties – plus I don't really
want to have an unnecessary call_rcu() call for tens of thousands of
fences per second.
And I would prefer if the dma_fence implementation just wouldn't be
racing. It is clearly desirable that a call to dma_fence_signal()
decouples the callbacks. That's even how you have designed it, just
with an unnecessary graceperiod thereafter.
I also doubt that it's robust that dma_fence_is_signaled() can return
true while the callbacks are not all executed.
And all the RCU dancing is fragile, where people need to guard API
calls with the RCU read lock instead of having the ability to rely on
their reference / refcount. It would be far safer if dma_fence would
guard against everything with the lock.
Summarizing:
* enable_signaling() uses the lock already.
* set_deadline() users use the lock around the entire callback.
* dma_fence_is_signaled_locked() demands that it must always be
possible to invoke ops->signaled() with a lock, as Danilo pointed
out.
* The few implementors of ops->signaled(), see my list above, all do
atomic operations without any locks at all.
* The other callbacks are either deprecated or not relevant in this
regard.
So making dma_fence locking consistent would make all our problems
disappear, could probably even solve the unload-problem if
dma_fence_signal() becomes a hard, synchronous decoupling point, and it
comes at no proven cost.
P.
On 6/10/26 16:25, Philipp Stanner wrote:
> On Tue, 2026-06-09 at 15:34 +0200, Christian König wrote:
>> On 6/9/26 15:19, Philipp Stanner wrote:
>>> And look what I found:
>>>
>>> static inline bool
>>> nouveau_cli_work_ready(struct dma_fence *fence)
>>> {
>>> unsigned long flags;
>>> bool ret = true;
>>>
>>> dma_fence_lock_irqsave(fence, flags);
>>> if (!dma_fence_is_signaled_locked(fence))
>>> ret = false;
>>> dma_fence_unlock_irqrestore(fence, flags);
>>>
>>> if (ret == true)
>>> dma_fence_put(fence);
>>> return ret;
>>> }
>>>
>>>
>>> That looks weird, doesn't it?
>>
>> No, that is pretty much expected.
>>
>> This issue results because of the lock inversion/cleanup race between
>> nouveau_fence_chan->lock and dropping the last reference.
>>
>> That in turn is caused by the fact that enable_signaling is called
>> with the fence lock held
>
>
> Are you referring to this comment from the documentation?
>
> * Since many implementations can call dma_fence_signal() even when before
> * @enable_signaling has been called there's a race window, where the
> * dma_fence_signal() might result in the final fence reference being
> * released and its memory freed. To avoid this, implementations of this
> * callback should grab their own reference using dma_fence_get(), to be
> * released when the fence is signalled (through e.g. the interrupt
> * handler).
> *
> * This callback is optional. If this callback is not present, then the
> * driver must always have signaling enabled.
> */
> bool (*enable_signaling)(struct dma_fence *fence);
Yes, that was an extremely bad idea which I have tried multiple times to fix.
>
>> and delegates the signaling to the caller instead of doing it itself.
>
> Who delegates what to whom?
>
> enable_signaling() is called indirectly by someone who adds a callback.
> If enable_signaling() is implemented, then the driver callback's only
> job is to activate some sort of interrupt or worker that will signal
> that fence at some point.
No, enable_signal also returns if enabling was successfully if it wasn't successfully the dma_fence framework signals the fence.
>> This in turn means that you can't do proper cleanup after the
>> signaling is done by grabbing driver specific locks.
>
> Sure you can. If everything is properly synchronized.
>
> // driver
> dma_fence_signal(f);
> // all callbacks can't reach our driver anymore
That's irrelevant. The question is not if a callback can reach the backend after signaling.
The question is if the backend can clean up after dma_fence_signal() completes.
> struct driver_fence = container_of(f);
> lock(driver_fence->special_lock);
> cleanup(f);
> unlock(…);
>
> Where is the deadlock?
>
> I continue to fail to see it. Show me an example of some code that
> would deadlock, please. Either fictive or from the kernel.
Interrupt driven signaling path:
spin_lock_irqsave(driver->fence_list_lock, flags);
list_for_each_entry_safe(...) {
if (fence->seqno < signaled_seqno)
break;
dma_fence_signal(fence);
list_entry_del(&fence->list);
dma_fence_put(fence);
}
spin_unlock_irqsave(driver->list_lock, flags);
Enable signaling path:
myfence_enable_signaling()
{
if (fence->seqno <= signaled_seqno)
return false;
talk_to_the_hw();
return true;
}
The problem here is that the enable_signaling path can't grab the driver->fence_list_lock because that would be lock inversion with the fence lock.
Implementations came up with tons of workarounds for this which only work more or less correctly. The issues Nouveau had is just the tip of the iceberg here.
If we nuke the fact that enable_signaling() is called while holding the fence lock all that complexity goes away. In other words it would then look like this:
myfence_enable_signaling()
{
spin_lock_irqsave(driver->fence_list_lock, flags);
if (fence->seqno > signaled_seqno) {
talk_to_the_hw();
} else {
dma_fence_signal(fence);
list_del(fence->list);
dma_fence_put(fence);
}
spin_unlock_irqrestore(driver->fence_list_lock, flags);
}
> Nouveau's enable_signaling() callback does not take locks. Its
> signaled() does not take locks.
Yeah because Nouveau reverted like most driver to use the same spinlock for the driver lock and the fence lock.
But that approach is fundamentally broken, a) you can't cleanup from the is_signaled path because that can be called with both the lock held and not held and b) it doesn't allow the fences to be independent of the driver who issued them.
When it would just be that we consistently call the is_signaled path with the lock held I would immediately agree, but that breaks fence independence and already caused so many issues that I clearly want to remove it.
Regards,
Christian.
>
> I went through all implementors of signaled() and found that only xe
> and nouveau invoke some function pointer that we want to investigate
> closer:
>
> User Way signaled() operates
> -----------------------------------------------
> amdgpu_userq_fence.c lockless
> etnaviv lockless
> i915 lockless
> nouveau lockless?
> radeon lockless
> vc4 lockless
> xe lockless?
>
>
> Who will be deadlocking and with which locks?
>
>
>> This is *exactly* the problem I'm trying to prevent here.
>>
>> When the callbacks wouldn't be called with the fence lock held the
>> Nouveau nouveau_fence_chan->lock and the fence lock would be
>> completely independent.
>
> You seem to circle around the idea that Nouveau's fence list is
> protected with the shared lock?
>
> That issue, if it exists, is solvable through the embedded lock.
>
> Anyways, there is no issue. Neither Nouveau nor other drivers will
> deadlock if we call ops->signaled() with the fence lock around.
>
>>
>> This results in much better cleanup paths, fences which are
>> independent of their issuers and in general much simpler handling for
>> all dma_fence implementation backends because we don't need to worry
>> all the time about lock inversions between the fence lock and
>> internal driver locks.
>>
>> So as far as I can see what you suggest here is exactly what has
>> caused all the problems in the first place.
>>
>> For the cleanup path in Rust you should be trivially able to use
>> call_rcu() if the synchronized cleanup path would be causing issues
>> (which I clearly agree on).
>
> You can *satisfy the current dma_fence API* with call_rcu(), but as
> Danilo states delaying work always has subtleties – plus I don't really
> want to have an unnecessary call_rcu() call for tens of thousands of
> fences per second.
>
> And I would prefer if the dma_fence implementation just wouldn't be
> racing. It is clearly desirable that a call to dma_fence_signal()
> decouples the callbacks. That's even how you have designed it, just
> with an unnecessary graceperiod thereafter.
>
> I also doubt that it's robust that dma_fence_is_signaled() can return
> true while the callbacks are not all executed.
>
> And all the RCU dancing is fragile, where people need to guard API
> calls with the RCU read lock instead of having the ability to rely on
> their reference / refcount. It would be far safer if dma_fence would
> guard against everything with the lock.
>
> Summarizing:
> * enable_signaling() uses the lock already.
> * set_deadline() users use the lock around the entire callback.
> * dma_fence_is_signaled_locked() demands that it must always be
> possible to invoke ops->signaled() with a lock, as Danilo pointed
> out.
> * The few implementors of ops->signaled(), see my list above, all do
> atomic operations without any locks at all.
> * The other callbacks are either deprecated or not relevant in this
> regard.
>
> So making dma_fence locking consistent would make all our problems
> disappear, could probably even solve the unload-problem if
> dma_fence_signal() becomes a hard, synchronous decoupling point, and it
> comes at no proven cost.
>
>
>
> P.
On Wed, 2026-06-10 at 17:15 +0200, Christian König wrote:
> On 6/10/26 16:25, Philipp Stanner wrote:
> >
> > Are you referring to this comment from the documentation?
> >
> > * Since many implementations can call dma_fence_signal() even when before
> > * @enable_signaling has been called there's a race window, where the
> > * dma_fence_signal() might result in the final fence reference being
> > * released and its memory freed. To avoid this, implementations of this
> > * callback should grab their own reference using dma_fence_get(), to be
> > * released when the fence is signalled (through e.g. the interrupt
> > * handler).
> > *
> > * This callback is optional. If this callback is not present, then the
> > * driver must always have signaling enabled.
> > */
> > bool (*enable_signaling)(struct dma_fence *fence);
>
> Yes, that was an extremely bad idea which I have tried multiple times to fix.
What's the reason why enable_signaling() is nowadays called with lock-
protection?
>
> >
> > > and delegates the signaling to the caller instead of doing it itself.
> >
> > Who delegates what to whom?
> >
> > enable_signaling() is called indirectly by someone who adds a callback.
> > If enable_signaling() is implemented, then the driver callback's only
> > job is to activate some sort of interrupt or worker that will signal
> > that fence at some point.
>
> No, enable_signal also returns if enabling was successfully if it wasn't successfully the dma_fence framework signals the fence.
>
> > > This in turn means that you can't do proper cleanup after the
> > > signaling is done by grabbing driver specific locks.
> >
> > Sure you can. If everything is properly synchronized.
> >
> > // driver
> > dma_fence_signal(f);
> > // all callbacks can't reach our driver anymore
>
> That's irrelevant. The question is not if a callback can reach the backend after signaling.
Irrelevant for your lock-inversion maybe. It's very relevant for life
time and module unload. Proof: you and Tvrtko made dma_fence_signal()
the decoupling point, with RCU protection.
And as I keep saying, that synchronisation point would make the entire
framework simpler and more robust if we were making it consistent.
>
> The question is if the backend can clean up after dma_fence_signal() completes.
>
> > struct driver_fence = container_of(f);
> > lock(driver_fence->special_lock);
> > cleanup(f);
> > unlock(…);
> >
> > Where is the deadlock?
> >
> > I continue to fail to see it. Show me an example of some code that
> > would deadlock, please. Either fictive or from the kernel.
>
>
> Interrupt driven signaling path:
>
> spin_lock_irqsave(driver->fence_list_lock, flags);
> list_for_each_entry_safe(...) {
> if (fence->seqno < signaled_seqno)
> break;
> dma_fence_signal(fence);
> list_entry_del(&fence->list);
> dma_fence_put(fence);
> }
> spin_unlock_irqsave(driver->list_lock, flags);
>
> Enable signaling path:
>
> myfence_enable_signaling()
> {
> if (fence->seqno <= signaled_seqno)
> return false;
>
> talk_to_the_hw();
> return true;
> }
>
> The problem here is that the enable_signaling path can't grab the
> driver->fence_list_lock because that would be lock inversion with the
> fence lock.
>
> Implementations came up with tons of workarounds for this which only
> work more or less correctly. The issues Nouveau had is just the tip
> of the iceberg here.
The fact that every implementor right now can live without driver locks
proofs that it can be done.
The few implementors of enable_signaling() who'd really run into an
inversion could address that with various techniques, like using an RCU
list, llist, rw_lock..
I would imagine the most common action for enable_signaling() is to
check whether an interrupt is already registered. If not, register it.
Is that where you got lock inversions?
>
> If we nuke the fact that enable_signaling() is called while holding
> the fence lock all that complexity goes away.
I'm afraid you cannot nuke that, Dr. Oppenheimer :p
You need the lock for synchronization. To add the callback.
Basically, you came quite naturally to the same conclusion and solution
as I did: you need / want race-free synchronization.
It's the exact same use-case, really. You want to prevent someone from
interfering with the fence while the ops->callback is running. You need
the lock for that, by definition.
The other two relevant callbacks, signaled() and set_deadline(),
already have the same lock protection guarantee. The former through
dma_fence_is_signaled_locked(), the latter through the fact that
everyone takes the lock immediately after callback begin anyways.
> In other words it would then look like this:
>
> myfence_enable_signaling()
> {
> spin_lock_irqsave(driver->fence_list_lock, flags);
> if (fence->seqno > signaled_seqno) {
> talk_to_the_hw();
This doesn't need the list lock.
> } else {
> dma_fence_signal(fence);
Without the fence lock protection someone else could have signaled the
fence already btw and now be spinning to do the same list deletion.
> list_del(fence->list);
> dma_fence_put(fence);
> }
> spin_unlock_irqrestore(driver->fence_list_lock, flags);
> }
Plus I don't see why that callback would have to signal the fence. The
return of the boolean¹ seems very consistent with
dma_fence_is_signaled() to me, which asks "is this job done?" and if
the answer is "yes", the framework signals that fence.
Regarding ops->signaled(), that really just needs to read some
ioremaped u64 integer and then return true or false. That will never
need a lock for anything.
So again: the code is already such that implementors of
enable_signaling() and signaled() need to assume that the fence lock is
being held by someone else. All the practical evidence shows that the
lock protection is actually necessary.
All we have to do is make the existing rules official and the code a
bit more consistent.
P.
[¹] though we probably want an int error code as the return value of
enable_signaling(), to distinguish between error and "already
signaled".
>
> > Nouveau's enable_signaling() callback does not take locks. Its
> > signaled() does not take locks.
>
> Yeah because Nouveau reverted like most driver to use the same
> spinlock for the driver lock and the fence lock.
>
> But that approach is fundamentally broken, a) you can't cleanup from
> the is_signaled path because that can be called with both the lock
> held and not held and b) it doesn't allow the fences to be
> independent of the driver who issued them.
>
> When it would just be that we consistently call the is_signaled path
> with the lock held I would immediately agree, but that breaks fence
> independence and already caused so many issues that I clearly want to
> remove it.
>
> Regards,
> Christian.
On 6/11/26 10:35, Philipp Stanner wrote: > On Wed, 2026-06-10 at 17:15 +0200, Christian König wrote: >> On 6/10/26 16:25, Philipp Stanner wrote: >>> >>> Are you referring to this comment from the documentation? >>> >>> * Since many implementations can call dma_fence_signal() even when before >>> * @enable_signaling has been called there's a race window, where the >>> * dma_fence_signal() might result in the final fence reference being >>> * released and its memory freed. To avoid this, implementations of this >>> * callback should grab their own reference using dma_fence_get(), to be >>> * released when the fence is signalled (through e.g. the interrupt >>> * handler). >>> * >>> * This callback is optional. If this callback is not present, then the >>> * driver must always have signaling enabled. >>> */ >>> bool (*enable_signaling)(struct dma_fence *fence); >> >> Yes, that was an extremely bad idea which I have tried multiple times to fix. > > What's the reason why enable_signaling() is nowadays called with lock- > protection? That was some decision originally made a long long time ago because the initial thought was that fence need to signal in order, but that concept was also abandoned a long long time ago. Fixing this is on my TODO list ever since we figured out that this was a bad idea, see the latest patch set here as well: https://www.spinics.net/lists/dri-devel/msg461253.html >>> Sure you can. If everything is properly synchronized. >>> >>> // driver >>> dma_fence_signal(f); >>> // all callbacks can't reach our driver anymore >> >> That's irrelevant. The question is not if a callback can reach the backend after signaling. > > Irrelevant for your lock-inversion maybe. It's very relevant for life > time and module unload. Proof: you and Tvrtko made dma_fence_signal() > the decoupling point, with RCU protection. > > And as I keep saying, that synchronisation point would make the entire > framework simpler and more robust if we were making it consistent. Exactly that's what I strongly disagree on. The fence lock and the synchronization point which allows the driver to know when nobody else is in a callback any more are two very different things we should not be mixed up together. The fence lock protects the signaled state of the fence, so that the fence functions can add callbacks, test signaling etc... while the fence can't signal. It is protecting internal state of the fence and so should be internal to the fence, external code should not touch it if possible. The synchronization point for the driver is a service the dma_fence implementation offers to let drivers know when there is no more caller of their function. And usually this is implemented using RCU/SRCU. Tvrko and I now choose RCU instead of SRCU because there didn't seem to be a need to sleep in the fence callbacks. But both RCU and SRCU offer not only synchronize_rcu()/_srcu() but also call_rcu()/_srcu() which allows drivers to delegate the cleanup to a point where it is save to do so. So while I see the problem I actually don't understand why you insist of solving it with the fence lock? That just brings us back to the bad design we had before where internal fence state is abused to protect something else. Regards, Christian.
On Thu, 2026-06-11 at 11:14 +0200, Christian König wrote: > On 6/11/26 10:35, Philipp Stanner wrote: > > On Wed, 2026-06-10 at 17:15 +0200, Christian König wrote: > > > On 6/10/26 16:25, Philipp Stanner wrote: > > > > > > > > Are you referring to this comment from the documentation? > > > > > > > > * Since many implementations can call dma_fence_signal() even > > > > when before > > > > * @enable_signaling has been called there's a race window, > > > > where the > > > > * dma_fence_signal() might result in the final fence reference > > > > being > > > > * released and its memory freed. To avoid this, > > > > implementations of this > > > > * callback should grab their own reference using > > > > dma_fence_get(), to be > > > > * released when the fence is signalled (through e.g. the > > > > interrupt > > > > * handler). > > > > * > > > > * This callback is optional. If this callback is not present, > > > > then the > > > > * driver must always have signaling enabled. > > > > */ > > > > bool (*enable_signaling)(struct dma_fence *fence); > > > > > > Yes, that was an extremely bad idea which I have tried multiple > > > times to fix. > > > > What's the reason why enable_signaling() is nowadays called with > > lock- > > protection? > > That was some decision originally made a long long time ago because > the initial thought was that fence need to signal in order, but that > concept was also abandoned a long long time ago. You also need it to avoid a race with the driver's requested signalling in ops->enable_signalling(), through dma_fence_add_callback(). I pointed that out in my answer. You cut it out and did not react to that statement. > > Fixing this is on my TODO list ever since we figured out that this > was a bad idea, see the latest patch set here as well: > https://www.spinics.net/lists/dri-devel/msg461253.html > > > > > Sure you can. If everything is properly synchronized. > > > > > > > > // driver > > > > dma_fence_signal(f); > > > > // all callbacks can't reach our driver anymore > > > > > > That's irrelevant. The question is not if a callback can reach > > > the backend after signaling. > > > > Irrelevant for your lock-inversion maybe. It's very relevant for > > life > > time and module unload. Proof: you and Tvrtko made > > dma_fence_signal() > > the decoupling point, with RCU protection. > > > > And as I keep saying, that synchronisation point would make the > > entire > > framework simpler and more robust if we were making it consistent. > > Exactly that's what I strongly disagree on. Yeah, and you don't provide any rationale AFAICS. The argument basically is "this is how it is, and it is how it is because it is like that". It's "this is done with RCU instead of locks, because you can use RCU for that". No offense intended, but I just can't see any substantial argument except for non-existing deadlocks that COULD ALREADY OCCUR and have to be mitigated by the drivers already. > > The fence lock and the synchronization point which allows the driver > to know when nobody else is in a callback any more are two very > different things we should not be mixed up together. Why? It's literally what locks exist for. For synchronization. RCU has been literally developed for deadlock avoidance, which its usage in dma_fence provably doesn't achieve. You still have the deadlock problem, and on top of that you have redundant, unnecessary RCU calls. > > The fence lock protects the signaled state of the fence, so that the > fence functions can add callbacks, test signaling etc... while the > fence can't signal. It is protecting internal state of the fence and > so should be internal to the fence, external code should not touch it > if possible. You allow people to take the lock manually with the API function dma_fence_lock_irqsave(). But you know what, I absolutely agree! I think if we take the locks for the callbacks and remove dma_fence_set_error(), then the need to ever touch the lock manually disappears. Then the driver knows what I pointed out at least three times now: dma_fence_signal(f); cleanup(f); // perfectly safe, bc after signal() all accesors are done No RCU, no nothing. Just plain proper synchronization. But when I point that out, you say "No no, you can't do that, that's not the purpose of the lock, you have to do: dma_fence_signal(f); call_rcu(cleanup(f)); " ??? > > The synchronization point for the driver is a service the dma_fence > implementation offers to let drivers know when there is no more > caller of their function. And usually this is implemented using > RCU/SRCU. "this is how it is because it is how it is". The one and only reason why you need RCU is that you don't lock consistently. Using RCU everywhere is nothing but symptom treatment. > > Tvrko and I now choose RCU instead of SRCU because there didn't seem > to be a need to sleep in the fence callbacks. You can't sleep in the callbacks, because the callbacks are invoked with the fence spinlock being held. > > But both RCU and SRCU offer not only synchronize_rcu()/_srcu() but > also call_rcu()/_srcu() which allows drivers to delegate the cleanup > to a point where it is save to do so. "We use RCU instead of locks because we can use RCU instead of locks". This is not an argument against locks, Christian. If you lock consistently, cleaning up after dma_fence_signal() is guaranteed to be safe to do, immediately. > > So while I see the problem I actually don't understand why you insist > of solving it with the fence lock? Well, that's then a philosophical moment, because I on the contrary don't understand why you insist on solving it with RCU :D The tl;dr is: using the locks makes the entire implementation more robust and consistent and allows for removing the RCU delays. I showed you a bug that would have been prevented with locks. It was caused by the improper synchronization, where dma_fence_is_signaled() can return true when the callbacks have not yet been run. The solution is to use dma_fence_lock_irqsave(f); if (dma_fence_is_signaled_locked(f)) cleanup(f); dma_fence_unlock_irqrestore(f); IOW… the solution is to call the ops->signaled() callback with the lock held……… notice something? ;) I provided more rationale on the rest of my mail, but you cut that out. Here, let me repost it: " You need the lock for synchronization. To add the callback. Basically, you came quite naturally to the same conclusion and solution as I did: you need / want race-free synchronization. It's the exact same use-case, really. You want to prevent someone from interfering with the fence while the ops->callback is running. You need the lock for that, by definition. The other two relevant callbacks, signaled() and set_deadline(), already have the same lock protection guarantee. The former through dma_fence_is_signaled_locked(), the latter through the fact that everyone takes the lock immediately after callback begin anyways. " and " All we have to do is make the existing rules official and the code a bit more consistent. " The deadlock argument doesn't hold up because the lock is already being taken for enable_signalling() and signaled(). I showed you one Nouveau bug which the locks would have prevented. Furthermore, I argue that it's impossible for you to remove the lock protection from ops->enable_signalling(). I further argue that consistent lock protection would allow for removing the need for dma_fence_is_signaled_locked() and dma_fence_lock_irqsave() as public APIs. > That just brings us back to the bad design we had before where > internal fence state is abused to protect something else. If we apply my design, we could deprecate dma_fence_lock_irqsave() and thereby declare it illegal to do that. Relying on dma_fence_signal() being a point of synchronization is completely different from using its internal lock to protect your own list or tree or whatever. P. > > Regards, > Christian.
On 6/11/26 11:50, Philipp Stanner wrote: > On Thu, 2026-06-11 at 11:14 +0200, Christian König wrote: >> On 6/11/26 10:35, Philipp Stanner wrote: >>> On Wed, 2026-06-10 at 17:15 +0200, Christian König wrote: >>>> On 6/10/26 16:25, Philipp Stanner wrote: >>>>> >>>>> Are you referring to this comment from the documentation? >>>>> >>>>> * Since many implementations can call dma_fence_signal() even >>>>> when before >>>>> * @enable_signaling has been called there's a race window, >>>>> where the >>>>> * dma_fence_signal() might result in the final fence reference >>>>> being >>>>> * released and its memory freed. To avoid this, >>>>> implementations of this >>>>> * callback should grab their own reference using >>>>> dma_fence_get(), to be >>>>> * released when the fence is signalled (through e.g. the >>>>> interrupt >>>>> * handler). >>>>> * >>>>> * This callback is optional. If this callback is not present, >>>>> then the >>>>> * driver must always have signaling enabled. >>>>> */ >>>>> bool (*enable_signaling)(struct dma_fence *fence); >>>> >>>> Yes, that was an extremely bad idea which I have tried multiple >>>> times to fix. >>> >>> What's the reason why enable_signaling() is nowadays called with >>> lock- >>> protection? >> >> That was some decision originally made a long long time ago because >> the initial thought was that fence need to signal in order, but that >> concept was also abandoned a long long time ago. > > You also need it to avoid a race with the driver's requested signalling > in ops->enable_signalling(), through dma_fence_add_callback(). > > I pointed that out in my answer. You cut it out and did not react to > that statement. Yeah because I don't think it's relevant. You don't have a race between enabling signaling and adding the callback, that are two independent functions. Enable signaling is something which must be done before adding the callbacks, that's granted but they don't need a common lock protection. >> Fixing this is on my TODO list ever since we figured out that this >> was a bad idea, see the latest patch set here as well: >> https://www.spinics.net/lists/dri-devel/msg461253.html >> >>>>> Sure you can. If everything is properly synchronized. >>>>> >>>>> // driver >>>>> dma_fence_signal(f); >>>>> // all callbacks can't reach our driver anymore >>>> >>>> That's irrelevant. The question is not if a callback can reach >>>> the backend after signaling. >>> >>> Irrelevant for your lock-inversion maybe. It's very relevant for >>> life >>> time and module unload. Proof: you and Tvrtko made >>> dma_fence_signal() >>> the decoupling point, with RCU protection. >>> >>> And as I keep saying, that synchronisation point would make the >>> entire >>> framework simpler and more robust if we were making it consistent. >> >> Exactly that's what I strongly disagree on. > > Yeah, and you don't provide any rationale AFAICS. The argument > basically is "this is how it is, and it is how it is because it is like > that". It's "this is done with RCU instead of locks, because you can > use RCU for that". > > No offense intended, but I just can't see any substantial argument > except for non-existing deadlocks that COULD ALREADY OCCUR and have to > be mitigated by the drivers already. Well those deadlocks absolutely do exists. I mean just take look at how bad the drivers have to work around the issue that enable_signaling and is_signaled can be called with the lock held. >> The fence lock and the synchronization point which allows the driver >> to know when nobody else is in a callback any more are two very >> different things we should not be mixed up together. > > Why? It's literally what locks exist for. For synchronization. No, locks exists for protection of state. > RCU has been literally developed for deadlock avoidance, which its > usage in dma_fence provably doesn't achieve. You still have the > deadlock problem, and on top of that you have redundant, unnecessary > RCU calls. Ok, where exactly do you see the problem? >> The fence lock protects the signaled state of the fence, so that the >> fence functions can add callbacks, test signaling etc... while the >> fence can't signal. It is protecting internal state of the fence and >> so should be internal to the fence, external code should not touch it >> if possible. > > You allow people to take the lock manually with the API function > dma_fence_lock_irqsave(). > > But you know what, I absolutely agree! > > I think if we take the locks for the callbacks and remove > dma_fence_set_error(), then the need to ever touch the lock manually > disappears. I agree on that but that makes it impossible for the dma_fence to independent of the driver who issues it. > > Then the driver knows what I pointed out at least three times now: > > dma_fence_signal(f); > cleanup(f); // perfectly safe, bc after signal() all accesors are done > > No RCU, no nothing. Just plain proper synchronization. No, that doesn't work because you have multiple paths to dma_fence_signal(): A) Driver is calling that from IRQ. B) Through calling dma_fence_enable_sw_signaling() and the underlying callback returning false. C) Through calling dma_fence_is_signaled() and the underlying callback returning true. For A you clearly can use that approach, but it mangles every attempt to cleanup for B & C because the driver can't grab locks which it would grab on path A. It's really a picture book classic lock inversion problem with callbacks. > But when I point that out, you say "No no, you can't do that, that's > not the purpose of the lock, you have to do: > > dma_fence_signal(f); > call_rcu(cleanup(f)); > " > > ??? Yes, what exactly is wrong with that? I mean you must have some kind of technical issue with that which I don't understand. > >> >> The synchronization point for the driver is a service the dma_fence >> implementation offers to let drivers know when there is no more >> caller of their function. And usually this is implemented using >> RCU/SRCU. > > "this is how it is because it is how it is". > > The one and only reason why you need RCU is that you don't lock > consistently. > > Using RCU everywhere is nothing but symptom treatment. > >> >> Tvrko and I now choose RCU instead of SRCU because there didn't seem >> to be a need to sleep in the fence callbacks. > > You can't sleep in the callbacks, because the callbacks are invoked > with the fence spinlock being held. Yes which is the case at the moment, but that is something we want to get away from. >> >> But both RCU and SRCU offer not only synchronize_rcu()/_srcu() but >> also call_rcu()/_srcu() which allows drivers to delegate the cleanup >> to a point where it is save to do so. > > "We use RCU instead of locks because we can use RCU instead of locks". > This is not an argument against locks, Christian. > > If you lock consistently, cleaning up after dma_fence_signal() is > guaranteed to be safe to do, immediately. Yes I agree on that, but you completely mess up all alternative signaling paths and that is something I can't accept. >> >> So while I see the problem I actually don't understand why you insist >> of solving it with the fence lock? > > Well, that's then a philosophical moment, because I on the contrary > don't understand why you insist on solving it with RCU :D Well because of the technical problems I see :D > The tl;dr is: using the locks makes the entire implementation more > robust and consistent and allows for removing the RCU delays. > > I showed you a bug that would have been prevented with locks. It was > caused by the improper synchronization, where dma_fence_is_signaled() > can return true when the callbacks have not yet been run. No, that problem wouldn't be fixed by that. Even when we call the callbacks with the locks held has no effect on the handling in dma_fence_is_signaled(). We would also need to move the testing if a fence is signaled or not under the lock and I can guarantee you that there will be push back on this. > The solution is to use > > dma_fence_lock_irqsave(f); > if (dma_fence_is_signaled_locked(f)) > cleanup(f); > dma_fence_unlock_irqrestore(f); > > IOW… the solution is to call the ops->signaled() callback with the lock > held……… notice something? ;) > > I provided more rationale on the rest of my mail, but you cut that out. > Here, let me repost it: > > > " > You need the lock for synchronization. To add the callback. > > Basically, you came quite naturally to the same conclusion and solution > as I did: you need / want race-free synchronization. > > It's the exact same use-case, really. You want to prevent someone from > interfering with the fence while the ops->callback is running. You need > the lock for that, by definition. > > The other two relevant callbacks, signaled() and set_deadline(), > already have the same lock protection guarantee. The former through > dma_fence_is_signaled_locked(), the latter through the fact that > everyone takes the lock immediately after callback begin anyways. > " > > and > > " > All we have to do is make the existing rules official and the code a > bit more consistent. > " > > > The deadlock argument doesn't hold up because the lock is already being > taken for enable_signalling() and signaled(). > > I showed you one Nouveau bug which the locks would have prevented. Well I can clearly disproved that. > Furthermore, I argue that it's impossible for you to remove the lock > protection from ops->enable_signalling(). Yeah that is something I don't understand. Where exactly do you see the problem here? We already have patches which does that which have been reviewed and tested and there wasn't any problem regarding that. It can be that we missed this because it was never pushed upstream. > I further argue that consistent lock protection would allow for > removing the need for dma_fence_is_signaled_locked() and > dma_fence_lock_irqsave() as public APIs. I agree that this goal would be nice to have, but I don't see how your proposal will help with that. Regards, Christian. > > >> That just brings us back to the bad design we had before where >> internal fence state is abused to protect something else. > > If we apply my design, we could deprecate dma_fence_lock_irqsave() and > thereby declare it illegal to do that. > > Relying on dma_fence_signal() being a point of synchronization is > completely different from using its internal lock to protect your own > list or tree or whatever. > > > P. > >> >> Regards, >> Christian.
On Mon, 2026-06-08 at 20:47 +0200, Christian König wrote: > On 6/8/26 20:39, Danilo Krummrich wrote: > > On Mon Jun 8, 2026 at 8:32 PM CEST, Christian König wrote: > > > On 6/8/26 19:59, Danilo Krummrich wrote: > > > > On Mon Jun 8, 2026 at 7:34 PM CEST, Christian König wrote: > > > > > That's why we need the RCU grace period to make sure that nobody is > > > > > referencing the driver stuff any more. > > > > > > > > Right, and that's what Philipp tries to address, the requirement to wait for an > > > > RCU grace period is perfectly fine if it is only about freeing memory, but it > > > > can become painful if the fence private data contains data also needs to be > > > > destructed in some way. > > > > > > Yeah that makes sense. > > > > > > > IOW, if a driver signals a fence, it is lifecycle-wise reasonable to destruct > > > > the private data that is no longer needed (remaining users only deal with struct > > > > dma_fence) and having to wait for a full grace period adds sublety and > > > > complication that can be avoided with the proposed approach. > > > > > > Yeah, I've run into that when I tried to make the amdgpu fences independent as well. > > > > That said, I'd like to ask the opposite question: What are the concerns with the > > > > proposed approach over (pure) RCU? > > > > > > Well a) locking inversions and b) performance. > > > > > > For example the reason why we have the dma_fence_is_signaled() and > > > dma_fence_is_signaled_locked() variants is because there is a measurable > > > difference in some specific use cases for not grabbing the locks. > > > > I checked for this as well, but couldn't find a case where > > dma_fence_is_signaled() is used in a way where it would be performance critical > > to avoid the lock in any way. > > > > Note that the lock is only bypassed when the fence is signaled already (this > > would be preserved) and if signaled() returns false, i.e. dma_fence_signal() > > will take the lock anyways. > > > > > I personally find those micro-optimizations rather questionable, but the > > > community agreement is that we should have them. > > > > I agree, it is rather questionable. So, I wouldn't make this the deciding factor > > unless someone can present a valid case where it actually matters. > > > > > So my take would rather be that the dma_fence_is_signaled_locked() variant > > > goes away and we consistently call the ops pointers without holding the > > > dma_fence lock and the driver implementations can then optionally take it if > > > necessary. > > > > How did you get to this conclusion considering that you run into what I > > mentioned above as well and the fact that we seem to agree that the performance > > concern is rather questionable? > > Quite simple, it's the cleaner approach. Depends on the definition of "clean". Considering the scheduler disaster, I learned that there is nothing cleaner than locking. It provides perfect synchronization. It would eliminate the race that we see for the Rust abstractions, and likely also the one that you have seen. The more of the heavy lifting the API does, the less likely it becomes that drivers even start to interfer with your API internals, like taking your internal locks to work around race conditions in the API (talking about drm_sched again). > > Calling callbacks with locks held is rather questionable even putting the performance issue aside. We already see that a few drivers have to take the lock anyways, which informs that it's probably necessary to take it for the racy conditions this patch desires to address. > > In detail calling the callbacks without holding locks allows all implementations who need it to explicitly take locks in the order they want. Didn't you say a few mails above that the implementation should not use the fence lock for its own purposes? P. > > If you call it with the lock held you enforce the fence lock the be the outermost lock. > > Regards, > Christian.
On 6/9/26 07:52, Philipp Stanner wrote: ... >> >> In detail calling the callbacks without holding locks allows all implementations who need it to explicitly take locks in the order they want. > > Didn't you say a few mails above that the implementation should not use > the fence lock for its own purposes? The usual use case the drivers have is this here: dma_fence_lock_irqsave(fence, flags); was_signaled = dma_fence_test_signaled(fence); if (!was_signaled) dma_fence_signal(fence); dma_fence_unlock_irqrestore(fence, flags); if (!was_signaled) cleanup(); This is actually what you and me came up with for the KFD when we removed the return code for dma_fence_signal(). Taking the lock around the enable_signaling() callback has the exact same reason, preventing the fence from signaling between testing and calling dma_fence_signal(). The problem with that approach is that cleanup() now suddenly runs under the fence lock as well. So you are left with few options: Either the fence lock is external, which we don't want because that make the fence non-independent, or cleanup() defers work to irq_work or work_structs, which creates numerous lifetime issues. When enable_signaling would be independent of the fence lock we could avoid all of this and just use a normal spin_lock() for the cleanup. Regards, Christian. > > > P. > >> >> If you call it with the lock held you enforce the fence lock the be the outermost lock. >> >> Regards, >> Christian.
On Tue, 2026-06-09 at 12:26 +0200, Christian König wrote: > On 6/9/26 07:52, Philipp Stanner wrote: > ... > > > > > > In detail calling the callbacks without holding locks allows all implementations who need it to explicitly take locks in the order they want. > > > > Didn't you say a few mails above that the implementation should not use > > the fence lock for its own purposes? > > The usual use case the drivers have is this here: > > dma_fence_lock_irqsave(fence, flags); > was_signaled = dma_fence_test_signaled(fence); > if (!was_signaled) > dma_fence_signal(fence); > dma_fence_unlock_irqrestore(fence, flags); > > if (!was_signaled) If cleanup() touches fence data, this is now exactly the race that I am concerned with. With the current design, you'd actually need a synchronize_rcu() here, which you definitely do not want in a hot path :( > cleanup(); > > This is actually what you and me came up with for the KFD when we > removed the return code for dma_fence_signal(). Well, what we came up with for that rare case was was_signaled = dma_fence_check_and_signal(fence); if (!was_signaled) cleanup(); It seems that many of the other use cases where the fence lock is taken by drivers ultimately are rooted in the (IMO) design mistake that dma_fence_set_error() exists. There should just be void dma_fence_signal(struct dma_fence *f, int err); which also conveniently forces all signalers to really carefully think about whether the fence's associated operations succeeded. Correctly representing the "true fence error state" to all consumers is vital for sane system behavior, as you continuously have to point out. > > Taking the lock around the enable_signaling() callback has the exact > same reason, preventing the fence from signaling between testing and > calling dma_fence_signal(). The problem with that approach is that > cleanup() now suddenly runs under the fence lock as well. That problem does not exist if we re-design dma_fence like that: // driver dma_fence_signal(f); // revokes all accesses to our driver through backend_ops // synchronize_rcu() now unnecessary \o/ cleanup(f); // We know that all accessors are gone dma_fence_put(f); > > So you are left with few options: Either the fence lock is external, > which we don't want because that make the fence non-independent, or > cleanup() defers work to irq_work or work_structs, which creates > numerous lifetime issues. Yup, this is uncool and we want to avoid that. But these seem to be the options 1. Ensure proper synchronization 2. Wait for a grace period in a hot path 3. Defer cleanup() with some delay mechanism #1 is by far the cleanest approach. I still cannot see any downside, and quite a few upsides. https://elixir.bootlin.com/linux/v7.1-rc6/source/drivers/dma-buf/dma-fence.c#L1025 ^ is already racing with the signaled check. I'm going through the users right now, and it seems all we need to ensure to implement this change is to audit all ops->signaled() for usage of the fence_lock and see if we can port them. > > When enable_signaling would be independent of the fence lock we could > avoid all of this and just use a normal spin_lock() for the cleanup. How's that related to our problem? P.
On 6/9/26 12:42, Philipp Stanner wrote: > On Tue, 2026-06-09 at 12:26 +0200, Christian König wrote: >> On 6/9/26 07:52, Philipp Stanner wrote: >> ... >>>> >>>> In detail calling the callbacks without holding locks allows all implementations who need it to explicitly take locks in the order they want. >>> >>> Didn't you say a few mails above that the implementation should not use >>> the fence lock for its own purposes? >> >> The usual use case the drivers have is this here: >> >> dma_fence_lock_irqsave(fence, flags); >> was_signaled = dma_fence_test_signaled(fence); >> if (!was_signaled) >> dma_fence_signal(fence); >> dma_fence_unlock_irqrestore(fence, flags); >> >> if (!was_signaled) > > If cleanup() touches fence data, this is now exactly the race that I am > concerned with. > > With the current design, you'd actually need a synchronize_rcu() here, > which you definitely do not want in a hot path :( > >> cleanup(); >> >> This is actually what you and me came up with for the KFD when we >> removed the return code for dma_fence_signal(). > > Well, what we came up with for that rare case was > > was_signaled = dma_fence_check_and_signal(fence); > if (!was_signaled) > cleanup(); > > It seems that many of the other use cases where the fence lock is taken > by drivers ultimately are rooted in the (IMO) design mistake that > dma_fence_set_error() exists. > > There should just be > > void > dma_fence_signal(struct dma_fence *f, int err); > > which also conveniently forces all signalers to really carefully think > about whether the fence's associated operations succeeded. Correctly > representing the "true fence error state" to all consumers is vital for > sane system behavior, as you continuously have to point out. Yeah that came to my mind before as well. >> >> Taking the lock around the enable_signaling() callback has the exact >> same reason, preventing the fence from signaling between testing and >> calling dma_fence_signal(). The problem with that approach is that >> cleanup() now suddenly runs under the fence lock as well. > > That problem does not exist if we re-design dma_fence like that: > > // driver > dma_fence_signal(f); // revokes all accesses to our driver through backend_ops > // synchronize_rcu() now unnecessary \o/ > cleanup(f); // We know that all accessors are gone > dma_fence_put(f); Yeah and exactly that doesn't work. Just think about the Nouveau case when you have your fences on a double linked list. When the fence lock is independent, e.g. have a separate lock for each fence then this lock can't protect this double linked list. So your cleanup path needs to take a lock which protects the list, but you then run into lock inversion. >> >> So you are left with few options: Either the fence lock is external, >> which we don't want because that make the fence non-independent, or >> cleanup() defers work to irq_work or work_structs, which creates >> numerous lifetime issues. > > Yup, this is uncool and we want to avoid that. > > But these seem to be the options > > 1. Ensure proper synchronization > 2. Wait for a grace period in a hot path > 3. Defer cleanup() with some delay mechanism > > #1 is by far the cleanest approach. I still cannot see any downside, > and quite a few upsides. > > https://elixir.bootlin.com/linux/v7.1-rc6/source/drivers/dma-buf/dma-fence.c#L1025 > > ^ is already racing with the signaled check. Yeah so what? That is just an opportunistic check. > I'm going through the users right now, and it seems all we need to > ensure to implement this change is to audit all ops->signaled() for > usage of the fence_lock and see if we can port them. > >> >> When enable_signaling would be independent of the fence lock we could >> avoid all of this and just use a normal spin_lock() for the cleanup. > > How's that related to our problem? See the explanation above for the Nouveau example. As far as I can see the approach you want to implement here is a NO-GO from my side. Regards, Christian. > > P.
On Tue, 2026-06-09 at 12:53 +0200, Christian König wrote:
> >
> > // driver
> > dma_fence_signal(f); // revokes all accesses to our driver through backend_ops
> > // synchronize_rcu() now unnecessary \o/
> > cleanup(f); // We know that all accessors are gone
> > dma_fence_put(f);
>
> Yeah and exactly that doesn't work.
>
> Just think about the Nouveau case when you have your fences on a double linked list.
>
> When the fence lock is independent, e.g. have a separate lock for each fence then this lock can't protect this double linked list.
>
> So your cleanup path needs to take a lock which protects the list, but you then run into lock inversion.
static bool nouveau_fence_is_signaled(struct dma_fence *f)
{
struct nouveau_fence *fence = to_nouveau_fence(f);
struct nouveau_fence_chan *fctx = nouveau_fctx(fence);
struct nouveau_channel *chan;
bool ret = false;
rcu_read_lock();
chan = rcu_dereference(fence->channel);
if (chan)
ret = (int)(fctx->read(chan) - fence->base.seqno) >=
0;
rcu_read_unlock();
return ret;
}
AFAICT fctx->read() does not take f->lock. So where is the lock
inversion?
Again, ideally we can get to the point where no one except for the
fence subsystem itself has to take the lock manually anymore.
>
> > >
> > > So you are left with few options: Either the fence lock is external,
> > > which we don't want because that make the fence non-independent, or
> > > cleanup() defers work to irq_work or work_structs, which creates
> > > numerous lifetime issues.
> >
> > Yup, this is uncool and we want to avoid that.
> >
> > But these seem to be the options
> >
> > 1. Ensure proper synchronization
> > 2. Wait for a grace period in a hot path
> > 3. Defer cleanup() with some delay mechanism
> >
> > #1 is by far the cleanest approach. I still cannot see any downside,
> > and quite a few upsides.
> >
> > https://elixir.bootlin.com/linux/v7.1-rc6/source/drivers/dma-buf/dma-fence.c#L1025
> >
> > ^ is already racing with the signaled check.
>
> Yeah so what? That is just an opportunistic check.
What happens if someone signals the fence while the set_deadline()
callback is running?
P.
On Mon Jun 8, 2026 at 8:47 PM CEST, Christian König wrote:
> On 6/8/26 20:39, Danilo Krummrich wrote:
>> On Mon Jun 8, 2026 at 8:32 PM CEST, Christian König wrote:
>>> On 6/8/26 19:59, Danilo Krummrich wrote:
>>>> On Mon Jun 8, 2026 at 7:34 PM CEST, Christian König wrote:
>>>>> That's why we need the RCU grace period to make sure that nobody is
>>>>> referencing the driver stuff any more.
>>>>
>>>> Right, and that's what Philipp tries to address, the requirement to wait for an
>>>> RCU grace period is perfectly fine if it is only about freeing memory, but it
>>>> can become painful if the fence private data contains data also needs to be
>>>> destructed in some way.
>>>
>>> Yeah that makes sense.
>>>
>>>> IOW, if a driver signals a fence, it is lifecycle-wise reasonable to destruct
>>>> the private data that is no longer needed (remaining users only deal with struct
>>>> dma_fence) and having to wait for a full grace period adds sublety and
>>>> complication that can be avoided with the proposed approach.
>>>
>>> Yeah, I've run into that when I tried to make the amdgpu fences independent as well.
>>>> That said, I'd like to ask the opposite question: What are the concerns with the
>>>> proposed approach over (pure) RCU?
>>>
>>> Well a) locking inversions and b) performance.
>>>
>>> For example the reason why we have the dma_fence_is_signaled() and
>>> dma_fence_is_signaled_locked() variants is because there is a measurable
>>> difference in some specific use cases for not grabbing the locks.
>>
>> I checked for this as well, but couldn't find a case where
>> dma_fence_is_signaled() is used in a way where it would be performance critical
>> to avoid the lock in any way.
>>
>> Note that the lock is only bypassed when the fence is signaled already (this
>> would be preserved) and if signaled() returns false, i.e. dma_fence_signal()
>> will take the lock anyways.
>>
>>> I personally find those micro-optimizations rather questionable, but the
>>> community agreement is that we should have them.
>>
>> I agree, it is rather questionable. So, I wouldn't make this the deciding factor
>> unless someone can present a valid case where it actually matters.
>>
>>> So my take would rather be that the dma_fence_is_signaled_locked() variant
>>> goes away and we consistently call the ops pointers without holding the
>>> dma_fence lock and the driver implementations can then optionally take it if
>>> necessary.
>>
>> How did you get to this conclusion considering that you run into what I
>> mentioned above as well and the fact that we seem to agree that the performance
>> concern is rather questionable?
>
> Quite simple, it's the cleaner approach.
I would maybe agree iff the RCU read side critical section wouldn't be needed
and we wouldn't need to deal with the consequences of having to defer
everything.
And so far it seems to me that there isn't really any other reason that the
performance concern we both don't buy into.
> Calling callbacks with locks held is rather questionable even putting the
> performance issue aside.
In general, I don't think that more flexibility for drivers is automatically
always superior.
Also, before we keep calling it a performance issue, I'd really love to know
where dma_fence_is_signaled() is called in a case where it returns false and the
spinlock causes such an overhead that it actually matters.
(As mentioned above, none of the cases where it returns true would change.)
> In detail calling the callbacks without holding locks allows all
> implementations who need it to explicitly take locks in the order they want.
I don't think this is true in this case.
1) The existence of dma_fence_is_signaled_locked() already mandates that all
such callbacks must work properly if called with the fence lock held.
2) The RCU read side critical section already mandates that driver must not
sleep within the callback.
> If you call it with the lock held you enforce the fence lock the be the
> outermost lock.
That's practically already the case, due to dma_fence_is_signaled_locked().
On 6/8/26 21:25, Danilo Krummrich wrote: > On Mon Jun 8, 2026 at 8:47 PM CEST, Christian König wrote: >> On 6/8/26 20:39, Danilo Krummrich wrote: >>> On Mon Jun 8, 2026 at 8:32 PM CEST, Christian König wrote: >>>> On 6/8/26 19:59, Danilo Krummrich wrote: >>>>> On Mon Jun 8, 2026 at 7:34 PM CEST, Christian König wrote: >>>>>> That's why we need the RCU grace period to make sure that nobody is >>>>>> referencing the driver stuff any more. >>>>> >>>>> Right, and that's what Philipp tries to address, the requirement to wait for an >>>>> RCU grace period is perfectly fine if it is only about freeing memory, but it >>>>> can become painful if the fence private data contains data also needs to be >>>>> destructed in some way. >>>> >>>> Yeah that makes sense. >>>> >>>>> IOW, if a driver signals a fence, it is lifecycle-wise reasonable to destruct >>>>> the private data that is no longer needed (remaining users only deal with struct >>>>> dma_fence) and having to wait for a full grace period adds sublety and >>>>> complication that can be avoided with the proposed approach. >>>> >>>> Yeah, I've run into that when I tried to make the amdgpu fences independent as well. >>>>> That said, I'd like to ask the opposite question: What are the concerns with the >>>>> proposed approach over (pure) RCU? >>>> >>>> Well a) locking inversions and b) performance. >>>> >>>> For example the reason why we have the dma_fence_is_signaled() and >>>> dma_fence_is_signaled_locked() variants is because there is a measurable >>>> difference in some specific use cases for not grabbing the locks. >>> >>> I checked for this as well, but couldn't find a case where >>> dma_fence_is_signaled() is used in a way where it would be performance critical >>> to avoid the lock in any way. >>> >>> Note that the lock is only bypassed when the fence is signaled already (this >>> would be preserved) and if signaled() returns false, i.e. dma_fence_signal() >>> will take the lock anyways. >>> >>>> I personally find those micro-optimizations rather questionable, but the >>>> community agreement is that we should have them. >>> >>> I agree, it is rather questionable. So, I wouldn't make this the deciding factor >>> unless someone can present a valid case where it actually matters. >>> >>>> So my take would rather be that the dma_fence_is_signaled_locked() variant >>>> goes away and we consistently call the ops pointers without holding the >>>> dma_fence lock and the driver implementations can then optionally take it if >>>> necessary. >>> >>> How did you get to this conclusion considering that you run into what I >>> mentioned above as well and the fact that we seem to agree that the performance >>> concern is rather questionable? >> >> Quite simple, it's the cleaner approach. > > I would maybe agree iff the RCU read side critical section wouldn't be needed > and we wouldn't need to deal with the consequences of having to defer > everything. > > And so far it seems to me that there isn't really any other reason that the > performance concern we both don't buy into. > >> Calling callbacks with locks held is rather questionable even putting the >> performance issue aside. > > In general, I don't think that more flexibility for drivers is automatically > always superior. > > Also, before we keep calling it a performance issue, I'd really love to know > where dma_fence_is_signaled() is called in a case where it returns false and the > spinlock causes such an overhead that it actually matters. > > (As mentioned above, none of the cases where it returns true would change.) > >> In detail calling the callbacks without holding locks allows all >> implementations who need it to explicitly take locks in the order they want. > > I don't think this is true in this case. > > 1) The existence of dma_fence_is_signaled_locked() already mandates that all > such callbacks must work properly if called with the fence lock held. For the deadline callback it is mandatory to *not* hold the lock. The signaled callback can be called with and without holding the lock. The enable_signaled callback is always called while holding the lock. > > 2) The RCU read side critical section already mandates that driver must not > sleep within the callback. > >> If you call it with the lock held you enforce the fence lock the be the >> outermost lock. > > That's practically already the case, due to dma_fence_is_signaled_locked(). Yeah, but we can fix this and the enable signaling callback. But the other way around isn't easily possible as far as I can see. Regards, Christian.
On 08/06/2026 15:24, Philipp Stanner wrote:
> The dma_fence backend_ops can access a fence. Hereby, a driver callback
> will be running which likely will access driver specific data through
> container_of(). If now, simultaneously, a driver signals the fence and
> afterwards expects to run a driver specific destructor (using the same
> data accessed through container_of()), there can be a race.
>
> A driver very likely trusts that once it has signaled a fence, no one
> will be accessing it anymore. Moreover, it might already want to free up
> resources, making UAF bugs possible.
Can you explain this race a bit differently? I am struggling to
understand the scenario.
Are you talking about driver freeing the data immediately after
signalling? That is not allowed as per the "DOC: Safe external access to
driver provided object members", ie. divers must ensure a RCU grace
period between signalling and freeing any data which can be reached by
any external caller.
Regards,
Tvrtko
> The race occurs because there are only pragmatic checks for the signaled
> flag of a fence, without taking the fence lock. RCU guards exist, but
> their purpose is to guard accesses through the backend_ops callbacks
> against the driver (which implements the TEXT segment these callbacks
> live in) from unloading.
>
> Proper synchronization can be ensured by taking the fence lock. RCU is
> still simultaneously required to guard against the unload.
>
> Fix the races by taking the lock for all non-deprecated backend_ops
> callbacks.
>
> Conveniently, this also fixes a race where backend_ops->set_deadline()
> might try to set a deadline for an already signaled fence.
>
> Suggested-by: Danilo Krummrich <dakr@kernel.org>
> Signed-off-by: Philipp Stanner <phasta@kernel.org>
> ---
> We discovered this problem through our Rust abstractions, but it can
> also occur in C.
>
> The by far cleanest solution seems to be to use the fence lock. This RFC
> serves to discuss whether there is anything preventing that.
>
> (Patch so far just compile tested, to have some groundlayer for the
> rough idea, to discuss it first)
> ---
> drivers/dma-buf/dma-fence.c | 39 ++++++++++++++++++++++++++++---------
> include/linux/dma-fence.h | 17 ++++++++++++----
> 2 files changed, 43 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/dma-buf/dma-fence.c b/drivers/dma-buf/dma-fence.c
> index c7ea1e75d38a..b74f02f3cca8 100644
> --- a/drivers/dma-buf/dma-fence.c
> +++ b/drivers/dma-buf/dma-fence.c
> @@ -629,7 +629,8 @@ EXPORT_SYMBOL(dma_fence_free);
> static bool __dma_fence_enable_signaling(struct dma_fence *fence)
> {
> const struct dma_fence_ops *ops;
> - bool was_set;
> + bool was_set, success;
> + unsigned long flags;
>
> dma_fence_assert_held(fence);
>
> @@ -644,7 +645,10 @@ static bool __dma_fence_enable_signaling(struct dma_fence *fence)
> if (!was_set && ops && ops->enable_signaling) {
> trace_dma_fence_enable_signal(fence);
>
> - if (!ops->enable_signaling(fence)) {
> + dma_fence_lock_irqsave(fence, flags);
> + success = ops->enable_signaling(fence);
> + dma_fence_unlock_irqrestore(fence, flags);
> + if (!success) {
> rcu_read_unlock();
> dma_fence_signal_locked(fence);
> return false;
> @@ -1020,11 +1024,20 @@ EXPORT_SYMBOL(dma_fence_wait_any_timeout);
> void dma_fence_set_deadline(struct dma_fence *fence, ktime_t deadline)
> {
> const struct dma_fence_ops *ops;
> + unsigned long flags;
>
> rcu_read_lock();
> ops = rcu_dereference(fence->ops);
> - if (ops && ops->set_deadline && !dma_fence_is_signaled(fence))
> + if (!ops || !ops->set_deadline) {
> + rcu_read_unlock();
> + return;
> + }
> +
> + dma_fence_lock_irqsave(fence, flags);
> + if (!dma_fence_is_signaled_locked(fence))
> ops->set_deadline(fence, deadline);
> +
> + dma_fence_unlock_irqrestore(fence, flags);
> rcu_read_unlock();
> }
> EXPORT_SYMBOL(dma_fence_set_deadline);
> @@ -1166,14 +1179,18 @@ EXPORT_SYMBOL(dma_fence_init64);
> */
> const char __rcu *dma_fence_driver_name(struct dma_fence *fence)
> {
> + const char __rcu *name = "detached-driver";
> const struct dma_fence_ops *ops;
> + unsigned long flags;
>
> /* RCU protection is required for safe access to returned string */
> ops = rcu_dereference(fence->ops);
> + dma_fence_lock_irqsave(fence, flags);
> if (!dma_fence_test_signaled_flag(fence))
> - return (const char __rcu *)ops->get_driver_name(fence);
> - else
> - return (const char __rcu *)"detached-driver";
> + name = ops->get_driver_name(fence);
> + dma_fence_unlock_irqrestore(fence, flags);
> +
> + return name;
> }
> EXPORT_SYMBOL(dma_fence_driver_name);
>
> @@ -1199,13 +1216,17 @@ EXPORT_SYMBOL(dma_fence_driver_name);
> */
> const char __rcu *dma_fence_timeline_name(struct dma_fence *fence)
> {
> + const char __rcu *name = "signaled-timeline";
> const struct dma_fence_ops *ops;
> + unsigned long flags;
>
> /* RCU protection is required for safe access to returned string */
> ops = rcu_dereference(fence->ops);
> + dma_fence_lock_irqsave(fence, flags);
> if (!dma_fence_test_signaled_flag(fence))
> - return (const char __rcu *)ops->get_driver_name(fence);
> - else
> - return (const char __rcu *)"signaled-timeline";
> + name = ops->get_driver_name(fence);
> + dma_fence_unlock_irqrestore(fence, flags);
> +
> + return name;
> }
> EXPORT_SYMBOL(dma_fence_timeline_name);
> diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h
> index b52ab692b22e..b93c3f7f69fb 100644
> --- a/include/linux/dma-fence.h
> +++ b/include/linux/dma-fence.h
> @@ -547,20 +547,29 @@ static inline bool
> dma_fence_is_signaled(struct dma_fence *fence)
> {
> const struct dma_fence_ops *ops;
> + unsigned long flags;
> + bool signaled;
>
> if (dma_fence_test_signaled_flag(fence))
> return true;
>
> rcu_read_lock();
> ops = rcu_dereference(fence->ops);
> - if (ops && ops->signaled && ops->signaled(fence)) {
> + if (!ops || !ops->signaled) {
> rcu_read_unlock();
> - dma_fence_signal(fence);
> - return true;
> + return false;
> }
> +
> + dma_fence_lock_irqsave(fence, flags);
> + signaled = ops->signaled(fence);
> +
> + if (signaled)
> + dma_fence_signal_locked(fence);
> +
> + dma_fence_unlock_irqrestore(fence, flags);
> rcu_read_unlock();
>
> - return false;
> + return signaled;
> }
>
> /**
On Mon, 2026-06-08 at 16:07 +0100, Tvrtko Ursulin wrote:
>
> On 08/06/2026 15:24, Philipp Stanner wrote:
> > The dma_fence backend_ops can access a fence. Hereby, a driver callback
> > will be running which likely will access driver specific data through
> > container_of(). If now, simultaneously, a driver signals the fence and
> > afterwards expects to run a driver specific destructor (using the same
> > data accessed through container_of()), there can be a race.
> >
> > A driver very likely trusts that once it has signaled a fence, no one
> > will be accessing it anymore. Moreover, it might already want to free up
> > resources, making UAF bugs possible.
>
> Can you explain this race a bit differently? I am struggling to
> understand the scenario.
driver: fence consumer:
dma_fence_signal(f) backend_ops() starts running, slightly misses the signaled-bit
do_sth_with_data(container_of(f)) backend_ops() accesses container_of(f)
^ race
backend_ops callbacks must not be invoked after a fence was signaled.
Right now, that is racing.
re: free(), OK, yes, that is the documented rule. Fair. But:
>
> Are you talking about driver freeing the data immediately after
> signalling? That is not allowed as per the "DOC: Safe external access to
> driver provided object members", ie. divers must ensure a RCU grace
> period between signalling and freeing any data which can be reached by
> any external caller.
This is the documented rule mostly because this isn't solved through
locking. If the lock were to grant full synchronization, you could
immediately do your driver cleanup after signaling, because all
potential accessors would be gone.
P.
>
> Regards,
>
> Tvrtko
>
> > The race occurs because there are only pragmatic checks for the signaled
> > flag of a fence, without taking the fence lock. RCU guards exist, but
> > their purpose is to guard accesses through the backend_ops callbacks
> > against the driver (which implements the TEXT segment these callbacks
> > live in) from unloading.
> >
> > Proper synchronization can be ensured by taking the fence lock. RCU is
> > still simultaneously required to guard against the unload.
> >
> > Fix the races by taking the lock for all non-deprecated backend_ops
> > callbacks.
> >
> > Conveniently, this also fixes a race where backend_ops->set_deadline()
> > might try to set a deadline for an already signaled fence.
> >
> > Suggested-by: Danilo Krummrich <dakr@kernel.org>
> > Signed-off-by: Philipp Stanner <phasta@kernel.org>
> > ---
> > We discovered this problem through our Rust abstractions, but it can
> > also occur in C.
> >
> > The by far cleanest solution seems to be to use the fence lock. This RFC
> > serves to discuss whether there is anything preventing that.
> >
> > (Patch so far just compile tested, to have some groundlayer for the
> > rough idea, to discuss it first)
> > ---
> > drivers/dma-buf/dma-fence.c | 39 ++++++++++++++++++++++++++++---------
> > include/linux/dma-fence.h | 17 ++++++++++++----
> > 2 files changed, 43 insertions(+), 13 deletions(-)
> >
> > diff --git a/drivers/dma-buf/dma-fence.c b/drivers/dma-buf/dma-fence.c
> > index c7ea1e75d38a..b74f02f3cca8 100644
> > --- a/drivers/dma-buf/dma-fence.c
> > +++ b/drivers/dma-buf/dma-fence.c
> > @@ -629,7 +629,8 @@ EXPORT_SYMBOL(dma_fence_free);
> > static bool __dma_fence_enable_signaling(struct dma_fence *fence)
> > {
> > const struct dma_fence_ops *ops;
> > - bool was_set;
> > + bool was_set, success;
> > + unsigned long flags;
> >
> > dma_fence_assert_held(fence);
> >
> > @@ -644,7 +645,10 @@ static bool __dma_fence_enable_signaling(struct dma_fence *fence)
> > if (!was_set && ops && ops->enable_signaling) {
> > trace_dma_fence_enable_signal(fence);
> >
> > - if (!ops->enable_signaling(fence)) {
> > + dma_fence_lock_irqsave(fence, flags);
> > + success = ops->enable_signaling(fence);
> > + dma_fence_unlock_irqrestore(fence, flags);
> > + if (!success) {
> > rcu_read_unlock();
> > dma_fence_signal_locked(fence);
> > return false;
> > @@ -1020,11 +1024,20 @@ EXPORT_SYMBOL(dma_fence_wait_any_timeout);
> > void dma_fence_set_deadline(struct dma_fence *fence, ktime_t deadline)
> > {
> > const struct dma_fence_ops *ops;
> > + unsigned long flags;
> >
> > rcu_read_lock();
> > ops = rcu_dereference(fence->ops);
> > - if (ops && ops->set_deadline && !dma_fence_is_signaled(fence))
> > + if (!ops || !ops->set_deadline) {
> > + rcu_read_unlock();
> > + return;
> > + }
> > +
> > + dma_fence_lock_irqsave(fence, flags);
> > + if (!dma_fence_is_signaled_locked(fence))
> > ops->set_deadline(fence, deadline);
> > +
> > + dma_fence_unlock_irqrestore(fence, flags);
> > rcu_read_unlock();
> > }
> > EXPORT_SYMBOL(dma_fence_set_deadline);
> > @@ -1166,14 +1179,18 @@ EXPORT_SYMBOL(dma_fence_init64);
> > */
> > const char __rcu *dma_fence_driver_name(struct dma_fence *fence)
> > {
> > + const char __rcu *name = "detached-driver";
> > const struct dma_fence_ops *ops;
> > + unsigned long flags;
> >
> > /* RCU protection is required for safe access to returned string */
> > ops = rcu_dereference(fence->ops);
> > + dma_fence_lock_irqsave(fence, flags);
> > if (!dma_fence_test_signaled_flag(fence))
> > - return (const char __rcu *)ops->get_driver_name(fence);
> > - else
> > - return (const char __rcu *)"detached-driver";
> > + name = ops->get_driver_name(fence);
> > + dma_fence_unlock_irqrestore(fence, flags);
> > +
> > + return name;
> > }
> > EXPORT_SYMBOL(dma_fence_driver_name);
> >
> > @@ -1199,13 +1216,17 @@ EXPORT_SYMBOL(dma_fence_driver_name);
> > */
> > const char __rcu *dma_fence_timeline_name(struct dma_fence *fence)
> > {
> > + const char __rcu *name = "signaled-timeline";
> > const struct dma_fence_ops *ops;
> > + unsigned long flags;
> >
> > /* RCU protection is required for safe access to returned string */
> > ops = rcu_dereference(fence->ops);
> > + dma_fence_lock_irqsave(fence, flags);
> > if (!dma_fence_test_signaled_flag(fence))
> > - return (const char __rcu *)ops->get_driver_name(fence);
> > - else
> > - return (const char __rcu *)"signaled-timeline";
> > + name = ops->get_driver_name(fence);
> > + dma_fence_unlock_irqrestore(fence, flags);
> > +
> > + return name;
> > }
> > EXPORT_SYMBOL(dma_fence_timeline_name);
> > diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h
> > index b52ab692b22e..b93c3f7f69fb 100644
> > --- a/include/linux/dma-fence.h
> > +++ b/include/linux/dma-fence.h
> > @@ -547,20 +547,29 @@ static inline bool
> > dma_fence_is_signaled(struct dma_fence *fence)
> > {
> > const struct dma_fence_ops *ops;
> > + unsigned long flags;
> > + bool signaled;
> >
> > if (dma_fence_test_signaled_flag(fence))
> > return true;
> >
> > rcu_read_lock();
> > ops = rcu_dereference(fence->ops);
> > - if (ops && ops->signaled && ops->signaled(fence)) {
> > + if (!ops || !ops->signaled) {
> > rcu_read_unlock();
> > - dma_fence_signal(fence);
> > - return true;
> > + return false;
> > }
> > +
> > + dma_fence_lock_irqsave(fence, flags);
> > + signaled = ops->signaled(fence);
> > +
> > + if (signaled)
> > + dma_fence_signal_locked(fence);
> > +
> > + dma_fence_unlock_irqrestore(fence, flags);
> > rcu_read_unlock();
> >
> > - return false;
> > + return signaled;
> > }
> >
> > /**
On Mon, 8 Jun 2026 16:24:37 +0200
Philipp Stanner <phasta@kernel.org> wrote:
> @@ -1020,11 +1024,20 @@ EXPORT_SYMBOL(dma_fence_wait_any_timeout);
> void dma_fence_set_deadline(struct dma_fence *fence, ktime_t deadline)
> {
> const struct dma_fence_ops *ops;
> + unsigned long flags;
>
> rcu_read_lock();
> ops = rcu_dereference(fence->ops);
> - if (ops && ops->set_deadline && !dma_fence_is_signaled(fence))
> + if (!ops || !ops->set_deadline) {
> + rcu_read_unlock();
> + return;
> + }
> +
> + dma_fence_lock_irqsave(fence, flags);
> + if (!dma_fence_is_signaled_locked(fence))
> ops->set_deadline(fence, deadline);
You can't take the fence lock around ->set_deadline(), otherwise you'll
deadlock here [1] or here [2].
> +
> + dma_fence_unlock_irqrestore(fence, flags);
> rcu_read_unlock();
> }
[1]https://elixir.bootlin.com/linux/v7.0.11/source/drivers/dma-buf/sw_sync.c#L182
[2]https://elixir.bootlin.com/linux/v7.0.11/source/drivers/gpu/drm/msm/msm_fence.c#L139
On Mon, 2026-06-08 at 17:01 +0200, Boris Brezillon wrote:
> On Mon, 8 Jun 2026 16:24:37 +0200
> Philipp Stanner <phasta@kernel.org> wrote:
>
> > @@ -1020,11 +1024,20 @@ EXPORT_SYMBOL(dma_fence_wait_any_timeout);
> > void dma_fence_set_deadline(struct dma_fence *fence, ktime_t deadline)
> > {
> > const struct dma_fence_ops *ops;
> > + unsigned long flags;
> >
> > rcu_read_lock();
> > ops = rcu_dereference(fence->ops);
> > - if (ops && ops->set_deadline && !dma_fence_is_signaled(fence))
> > + if (!ops || !ops->set_deadline) {
> > + rcu_read_unlock();
> > + return;
> > + }
> > +
> > + dma_fence_lock_irqsave(fence, flags);
> > + if (!dma_fence_is_signaled_locked(fence))
> > ops->set_deadline(fence, deadline);
>
> You can't take the fence lock around ->set_deadline(), otherwise you'll
> deadlock here [1] or here [2].
>
> > +
> > + dma_fence_unlock_irqrestore(fence, flags);
> > rcu_read_unlock();
> > }
>
>
> [1]https://elixir.bootlin.com/linux/v7.0.11/source/drivers/dma-buf/sw_sync.c#L182
> [2]https://elixir.bootlin.com/linux/v7.0.11/source/drivers/gpu/drm/msm/msm_fence.c#L139
If we'd port these (and maybe some we have overlooked) simultaneously,
they could completely drop their separate locking.
The fact that other parties were forced to take the fence lock in their
callbacks (and even 100% of the functions' code) actually proves that
this RFC is probably a good idea and callback-calls should be guarded
by the fence lock :]
P.
On Mon Jun 8, 2026 at 5:17 PM CEST, Philipp Stanner wrote:
> On Mon, 2026-06-08 at 17:01 +0200, Boris Brezillon wrote:
>> On Mon, 8 Jun 2026 16:24:37 +0200
>> Philipp Stanner <phasta@kernel.org> wrote:
>>
>> > @@ -1020,11 +1024,20 @@ EXPORT_SYMBOL(dma_fence_wait_any_timeout);
>> > void dma_fence_set_deadline(struct dma_fence *fence, ktime_t deadline)
>> > {
>> > const struct dma_fence_ops *ops;
>> > + unsigned long flags;
>> >
>> > rcu_read_lock();
>> > ops = rcu_dereference(fence->ops);
>> > - if (ops && ops->set_deadline && !dma_fence_is_signaled(fence))
>> > + if (!ops || !ops->set_deadline) {
>> > + rcu_read_unlock();
>> > + return;
>> > + }
>> > +
>> > + dma_fence_lock_irqsave(fence, flags);
>> > + if (!dma_fence_is_signaled_locked(fence))
>> > ops->set_deadline(fence, deadline);
>>
>> You can't take the fence lock around ->set_deadline(), otherwise you'll
>> deadlock here [1] or here [2].
>>
>> > +
>> > + dma_fence_unlock_irqrestore(fence, flags);
>> > rcu_read_unlock();
>> > }
>>
>>
>> [1]https://elixir.bootlin.com/linux/v7.0.11/source/drivers/dma-buf/sw_sync.c#L182
>> [2]https://elixir.bootlin.com/linux/v7.0.11/source/drivers/gpu/drm/msm/msm_fence.c#L139
>
>
> If we'd port these (and maybe some we have overlooked) simultaneously,
> they could completely drop their separate locking.
>
> The fact that other parties were forced to take the fence lock in their
> callbacks (and even 100% of the functions' code) actually proves that
> this RFC is probably a good idea and callback-calls should be guarded
> by the fence lock :]
I think I looked into this recently and IIRC it indeed seems like all
implementors of set_deadline() take the lock within their callback.
On Mon, 2026-06-08 at 17:23 +0200, Danilo Krummrich wrote:
> On Mon Jun 8, 2026 at 5:17 PM CEST, Philipp Stanner wrote:
> > On Mon, 2026-06-08 at 17:01 +0200, Boris Brezillon wrote:
> > > On Mon, 8 Jun 2026 16:24:37 +0200
> > > Philipp Stanner <phasta@kernel.org> wrote:
> > >
> > > > @@ -1020,11 +1024,20 @@ EXPORT_SYMBOL(dma_fence_wait_any_timeout);
> > > > void dma_fence_set_deadline(struct dma_fence *fence, ktime_t deadline)
> > > > {
> > > > const struct dma_fence_ops *ops;
> > > > + unsigned long flags;
> > > >
> > > > rcu_read_lock();
> > > > ops = rcu_dereference(fence->ops);
> > > > - if (ops && ops->set_deadline && !dma_fence_is_signaled(fence))
> > > > + if (!ops || !ops->set_deadline) {
> > > > + rcu_read_unlock();
> > > > + return;
> > > > + }
> > > > +
> > > > + dma_fence_lock_irqsave(fence, flags);
> > > > + if (!dma_fence_is_signaled_locked(fence))
> > > > ops->set_deadline(fence, deadline);
> > >
> > > You can't take the fence lock around ->set_deadline(), otherwise you'll
> > > deadlock here [1] or here [2].
> > >
> > > > +
> > > > + dma_fence_unlock_irqrestore(fence, flags);
> > > > rcu_read_unlock();
> > > > }
> > >
> > >
> > > [1]https://elixir.bootlin.com/linux/v7.0.11/source/drivers/dma-buf/sw_sync.c#L182
> > > [2]https://elixir.bootlin.com/linux/v7.0.11/source/drivers/gpu/drm/msm/msm_fence.c#L139
Oh, MSM actually doesn't btw, that's a false positive. That's a
distinct spinlock on their fence context object.
But yes, before we could upstream this, we would go through all the
implementors like Danilo did, to find all the others.
P.
> >
> >
> > If we'd port these (and maybe some we have overlooked) simultaneously,
> > they could completely drop their separate locking.
> >
> > The fact that other parties were forced to take the fence lock in their
> > callbacks (and even 100% of the functions' code) actually proves that
> > this RFC is probably a good idea and callback-calls should be guarded
> > by the fence lock :]
>
> I think I looked into this recently and IIRC it indeed seems like all
> implementors of set_deadline() take the lock within their callback.
On Mon, 08 Jun 2026 17:30:58 +0200
Philipp Stanner <phasta@mailbox.org> wrote:
> On Mon, 2026-06-08 at 17:23 +0200, Danilo Krummrich wrote:
> > On Mon Jun 8, 2026 at 5:17 PM CEST, Philipp Stanner wrote:
> > > On Mon, 2026-06-08 at 17:01 +0200, Boris Brezillon wrote:
> > > > On Mon, 8 Jun 2026 16:24:37 +0200
> > > > Philipp Stanner <phasta@kernel.org> wrote:
> > > >
> > > > > @@ -1020,11 +1024,20 @@ EXPORT_SYMBOL(dma_fence_wait_any_timeout);
> > > > > void dma_fence_set_deadline(struct dma_fence *fence, ktime_t deadline)
> > > > > {
> > > > > const struct dma_fence_ops *ops;
> > > > > + unsigned long flags;
> > > > >
> > > > > rcu_read_lock();
> > > > > ops = rcu_dereference(fence->ops);
> > > > > - if (ops && ops->set_deadline && !dma_fence_is_signaled(fence))
> > > > > + if (!ops || !ops->set_deadline) {
> > > > > + rcu_read_unlock();
> > > > > + return;
> > > > > + }
> > > > > +
> > > > > + dma_fence_lock_irqsave(fence, flags);
> > > > > + if (!dma_fence_is_signaled_locked(fence))
> > > > > ops->set_deadline(fence, deadline);
> > > >
> > > > You can't take the fence lock around ->set_deadline(), otherwise you'll
> > > > deadlock here [1] or here [2].
> > > >
> > > > > +
> > > > > + dma_fence_unlock_irqrestore(fence, flags);
> > > > > rcu_read_unlock();
> > > > > }
> > > >
> > > >
> > > > [1]https://elixir.bootlin.com/linux/v7.0.11/source/drivers/dma-buf/sw_sync.c#L182
> > > > [2]https://elixir.bootlin.com/linux/v7.0.11/source/drivers/gpu/drm/msm/msm_fence.c#L139
>
> Oh, MSM actually doesn't btw, that's a false positive. That's a
> distinct spinlock on their fence context object.
It's not, it's the same lock they attach to all their fences coming
from this context. It's just that this lock appears to be per-context,
like is the case for basically every driver, since the inline lock was
introduced only in this release cycle.
Anyway, this is all stuff we can fix if people think it's okay to
protect dma_fence_ops calls with the fence lock. But my point remains:
each op has its own locking-rules, some are called with the fence lock
held (enable_signaling(), signaled()), others are not (set_deadline(),
get_xxx_name()), so we need to carefully audit each of those to make
sure:
- calling with the lock held in the new places is not causing a
deadlock
- the returned data, if not a scalar, is protected by the RCU read lock
- any driver implementing ops that can be called without the lock held
need to hold on the device data for an RCU grace period
The last bullet is probably the one I'm the most worried about, because
instead of a single rule that applies to all ops, we have various cases
based on whether some ops are implemented or not, but that's already
the case with deprecated ops like .release() or .wait(), so maybe
that's okay with the proper doc.
If I were to choose, I'd probably go for a dedicated rwlock_t to
protect dma_fence_ops, so we can:
- protect all dma_buf_ops::xx() consistently no matter the kind of op
- protect returned data (get_xxx_name()) with this lock instead of the
RCU read lock
But the overhead of this extra lock might not be acceptable, dunno.
>
>
> But yes, before we could upstream this, we would go through all the
> implementors like Danilo did, to find all the others.
There's the two I pointed out, plus the array/chain containers I
mentioned, which are not problematic.
On Mon, 2026-06-08 at 18:16 +0200, Boris Brezillon wrote: > If I were to choose, I'd probably go for a dedicated rwlock_t to side note: rw_locks are officially discouraged AFAIK. They utilize more of the expensive instructions than a spinlock and are only worth it if the read section is really long compared to the write section. > protect dma_fence_ops, so we can: > > - protect all dma_buf_ops::xx() consistently no matter the kind of op > - protect returned data (get_xxx_name()) with this lock instead of the > RCU read lock That doesn't solve our Rust destructor problem though, does it? What we want to do is: 1. Signal the fence before it drops 2. Wait for all accessors to be gone 3. Run the destructor Step #2 by definition demands the signaled-state lock. Or would your plan be to take and release the ops-lock before the destructor to ensure all callbacks are gone? P.
On 6/9/26 10:43, Philipp Stanner wrote: > On Mon, 2026-06-08 at 18:16 +0200, Boris Brezillon wrote: >> If I were to choose, I'd probably go for a dedicated rwlock_t to > > side note: > rw_locks are officially discouraged AFAIK. They utilize more of the > expensive instructions than a spinlock and are only worth it if the > read section is really long compared to the write section. > >> protect dma_fence_ops, so we can: >> >> - protect all dma_buf_ops::xx() consistently no matter the kind of op >> - protect returned data (get_xxx_name()) with this lock instead of the >> RCU read lock > > That doesn't solve our Rust destructor problem though, does it? > > What we want to do is: > > 1. Signal the fence before it drops > 2. Wait for all accessors to be gone > 3. Run the destructor > > Step #2 by definition demands the signaled-state lock. If I'm not completely mistaken that approach won't work. See you can't have a destructor if the dma_fence is independent of the module it issued. That's why we have all the handling for inline lock and fence independents. Regards, Christian. > > Or would your plan be to take and release the ops-lock before the > destructor to ensure all callbacks are gone? > > P.
On Tue, 2026-06-09 at 10:47 +0200, Christian König wrote: > On 6/9/26 10:43, Philipp Stanner wrote: > > On Mon, 2026-06-08 at 18:16 +0200, Boris Brezillon wrote: > > > If I were to choose, I'd probably go for a dedicated rwlock_t to > > > > side note: > > rw_locks are officially discouraged AFAIK. They utilize more of the > > expensive instructions than a spinlock and are only worth it if the > > read section is really long compared to the write section. > > > > > protect dma_fence_ops, so we can: > > > > > > - protect all dma_buf_ops::xx() consistently no matter the kind > > > of op > > > - protect returned data (get_xxx_name()) with this lock instead > > > of the > > > RCU read lock > > > > That doesn't solve our Rust destructor problem though, does it? > > > > What we want to do is: > > > > 1. Signal the fence before it drops > > 2. Wait for all accessors to be gone > > 3. Run the destructor > > > > Step #2 by definition demands the signaled-state lock. > > If I'm not completely mistaken that approach won't work. > > See you can't have a destructor if the dma_fence is independent of > the module it issued. > > That's why we have all the handling for inline lock and fence > independents. It does work if you have two fence types, one for the producer and one for the consumers. The producer type has a destructor, and with Rust you can ensure that all producer-fences get signaled and deconstructed before module unload. That relies on the work done by you and Tvrtko in C, where the signaled-state is the point of decoupling. The only additional thing I'd need now would be to make this a hard synchronization point that cannot race / does not need an RCU delay :) P.
On 6/8/26 18:16, Boris Brezillon wrote:
> On Mon, 08 Jun 2026 17:30:58 +0200
> Philipp Stanner <phasta@mailbox.org> wrote:
>
>> On Mon, 2026-06-08 at 17:23 +0200, Danilo Krummrich wrote:
>>> On Mon Jun 8, 2026 at 5:17 PM CEST, Philipp Stanner wrote:
>>>> On Mon, 2026-06-08 at 17:01 +0200, Boris Brezillon wrote:
>>>>> On Mon, 8 Jun 2026 16:24:37 +0200
>>>>> Philipp Stanner <phasta@kernel.org> wrote:
>>>>>
>>>>>> @@ -1020,11 +1024,20 @@ EXPORT_SYMBOL(dma_fence_wait_any_timeout);
>>>>>> void dma_fence_set_deadline(struct dma_fence *fence, ktime_t deadline)
>>>>>> {
>>>>>> const struct dma_fence_ops *ops;
>>>>>> + unsigned long flags;
>>>>>>
>>>>>> rcu_read_lock();
>>>>>> ops = rcu_dereference(fence->ops);
>>>>>> - if (ops && ops->set_deadline && !dma_fence_is_signaled(fence))
>>>>>> + if (!ops || !ops->set_deadline) {
>>>>>> + rcu_read_unlock();
>>>>>> + return;
>>>>>> + }
>>>>>> +
>>>>>> + dma_fence_lock_irqsave(fence, flags);
>>>>>> + if (!dma_fence_is_signaled_locked(fence))
>>>>>> ops->set_deadline(fence, deadline);
>>>>>
>>>>> You can't take the fence lock around ->set_deadline(), otherwise you'll
>>>>> deadlock here [1] or here [2].
>>>>>
>>>>>> +
>>>>>> + dma_fence_unlock_irqrestore(fence, flags);
>>>>>> rcu_read_unlock();
>>>>>> }
>>>>>
>>>>>
>>>>> [1]https://elixir.bootlin.com/linux/v7.0.11/source/drivers/dma-buf/sw_sync.c#L182
>>>>> [2]https://elixir.bootlin.com/linux/v7.0.11/source/drivers/gpu/drm/msm/msm_fence.c#L139
>>
>> Oh, MSM actually doesn't btw, that's a false positive. That's a
>> distinct spinlock on their fence context object.
>
> It's not, it's the same lock they attach to all their fences coming
> from this context. It's just that this lock appears to be per-context,
> like is the case for basically every driver, since the inline lock was
> introduced only in this release cycle.
>
> Anyway, this is all stuff we can fix if people think it's okay to
> protect dma_fence_ops calls with the fence lock. But my point remains:
> each op has its own locking-rules, some are called with the fence lock
> held (enable_signaling(), signaled()), others are not (set_deadline(),
> get_xxx_name()), so we need to carefully audit each of those to make
> sure:
>
> - calling with the lock held in the new places is not causing a
> deadlock
Yeah, exactly that.
For example the set_deadline() is intentionally not called with the fence lock held because that won't work for some use cases.
The problem when you call ops with a lock held is always that this lock then becomes the outermost look held. In other words when you for example want to grab a power management lock to implement the deadline feature the framework enforces an order between the two locks which isn't desired.
Regards,
Christian.
> - the returned data, if not a scalar, is protected by the RCU read lock
> - any driver implementing ops that can be called without the lock held
> need to hold on the device data for an RCU grace period
>
> The last bullet is probably the one I'm the most worried about, because
> instead of a single rule that applies to all ops, we have various cases
> based on whether some ops are implemented or not, but that's already
> the case with deprecated ops like .release() or .wait(), so maybe
> that's okay with the proper doc.
>
> If I were to choose, I'd probably go for a dedicated rwlock_t to
> protect dma_fence_ops, so we can:
>
> - protect all dma_buf_ops::xx() consistently no matter the kind of op
> - protect returned data (get_xxx_name()) with this lock instead of the
> RCU read lock
>
> But the overhead of this extra lock might not be acceptable, dunno.
>
>>
>>
>> But yes, before we could upstream this, we would go through all the
>> implementors like Danilo did, to find all the others.
>
> There's the two I pointed out, plus the array/chain containers I
> mentioned, which are not problematic.
On Tue, 2026-06-09 at 10:02 +0200, Christian König wrote: > On 6/8/26 18:16, Boris Brezillon wrote: > > - calling with the lock held in the new places is not causing a > > deadlock > > Yeah, exactly that. > > For example the set_deadline() is intentionally not called with the > fence lock held because that won't work for some use cases. > > The problem when you call ops with a lock held is always that this > lock then becomes the outermost look held. In other words when you > for example want to grab a power management lock to implement the > deadline feature the framework enforces an order between the two > locks which isn't desired. A deadlock can only happen if that power management routine also attempts to take the fence lock. Or maybe the fence ctx lock. Why would it want to do that? What does it want to signal? I suppose in case of a close enough deadline you will do a firmware call to shovel some more coals into the kettle. The only GPU driver implementing it seems MSM anyways? P.
On Mon, 08 Jun 2026 17:23:06 +0200
"Danilo Krummrich" <dakr@kernel.org> wrote:
> On Mon Jun 8, 2026 at 5:17 PM CEST, Philipp Stanner wrote:
> > On Mon, 2026-06-08 at 17:01 +0200, Boris Brezillon wrote:
> >> On Mon, 8 Jun 2026 16:24:37 +0200
> >> Philipp Stanner <phasta@kernel.org> wrote:
> >>
> >> > @@ -1020,11 +1024,20 @@ EXPORT_SYMBOL(dma_fence_wait_any_timeout);
> >> > void dma_fence_set_deadline(struct dma_fence *fence, ktime_t deadline)
> >> > {
> >> > const struct dma_fence_ops *ops;
> >> > + unsigned long flags;
> >> >
> >> > rcu_read_lock();
> >> > ops = rcu_dereference(fence->ops);
> >> > - if (ops && ops->set_deadline && !dma_fence_is_signaled(fence))
> >> > + if (!ops || !ops->set_deadline) {
> >> > + rcu_read_unlock();
> >> > + return;
> >> > + }
> >> > +
> >> > + dma_fence_lock_irqsave(fence, flags);
> >> > + if (!dma_fence_is_signaled_locked(fence))
> >> > ops->set_deadline(fence, deadline);
> >>
> >> You can't take the fence lock around ->set_deadline(), otherwise you'll
> >> deadlock here [1] or here [2].
> >>
> >> > +
> >> > + dma_fence_unlock_irqrestore(fence, flags);
> >> > rcu_read_unlock();
> >> > }
> >>
> >>
> >> [1]https://elixir.bootlin.com/linux/v7.0.11/source/drivers/dma-buf/sw_sync.c#L182
> >> [2]https://elixir.bootlin.com/linux/v7.0.11/source/drivers/gpu/drm/msm/msm_fence.c#L139
> >
> >
> > If we'd port these (and maybe some we have overlooked) simultaneously,
> > they could completely drop their separate locking.
> >
> > The fact that other parties were forced to take the fence lock in their
> > callbacks (and even 100% of the functions' code) actually proves that
> > this RFC is probably a good idea and callback-calls should be guarded
> > by the fence lock :]
>
> I think I looked into this recently and IIRC it indeed seems like all
> implementors of set_deadline() take the lock within their callback.
dma-fence-{chain-array}.c don't, but that's probably okay if they are
called with the container fence lock held, because we already have a
separate lockdep-class assigned to deal with the nested-locking of
dma_fence::inline_lock in the signal path.
© 2016 - 2026 Red Hat, Inc.