drivers/thunderbolt/xdomain.c | 41 ++++++++++++++++++++++++++--------- include/linux/thunderbolt.h | 3 +++ 2 files changed, 34 insertions(+), 10 deletions(-)
tb_xdp_handle_request() runs on system_wq and queues
xd->state_work via queue_delayed_work() in three request handlers:
PROPERTIES_CHANGED_REQUEST, UUID_REQUEST (via start_handshake),
and LINK_STATE_CHANGE_REQUEST. Similarly, update_xdomain() queues
xd->properties_changed_work when local properties change.
Concurrently, tb_xdomain_remove() calls stop_handshake() which does
cancel_delayed_work_sync() on both delayed works. Later,
tb_xdomain_unregister() calls device_unregister() which eventually
frees the xdomain. Since commit 559c1e1e0134 ("thunderbolt: Run
tb_xdp_handle_request() in system workqueue") moved the request
handler off tb->wq, the handler and the remove path are no longer
serialized. If queue_delayed_work() executes after
cancel_delayed_work_sync() but before the xdomain is freed, the
delayed work fires on a freed object.
Add xd->removing that tb_xdomain_remove() sets under xd->lock
before calling stop_handshake(). Each external queue site holds
the same lock and checks removing before calling
queue_delayed_work(). This provides the mutual exclusion needed:
either the queue site acquires the lock first and queues work that
the subsequent cancel will see, or the remove path acquires the
lock first and the queue site observes removing == true and skips
the queue.
Fixes: 559c1e1e0134 ("thunderbolt: Run tb_xdp_handle_request() in system workqueue")
Cc: stable@vger.kernel.org
Assisted-by: Claude:claude-opus-4-7
Signed-off-by: Michael Bommarito <michael.bommarito@gmail.com>
---
v2: Rebased onto thunderbolt.git/next per Mika's request. Verified
the race persists on next: tb_xdp_handle_request still runs on
system_wq, the remove/unregister split does not add
synchronization with the queue sites. Updated commit message to
reflect that tb_xdomain_unregister() now does the
device_unregister (split from tb_xdomain_remove on next).
drivers/thunderbolt/xdomain.c | 41 ++++++++++++++++++++++++++---------
include/linux/thunderbolt.h | 3 +++
2 files changed, 34 insertions(+), 10 deletions(-)
diff --git a/drivers/thunderbolt/xdomain.c b/drivers/thunderbolt/xdomain.c
index 781d88d06b935..fe6c5ac703f4d 100644
--- a/drivers/thunderbolt/xdomain.c
+++ b/drivers/thunderbolt/xdomain.c
@@ -803,9 +803,13 @@ static void tb_xdp_handle_request(struct work_struct *work)
* the xdomain related to this connection as well in
* case there is a change in services it offers.
*/
- if (xd && device_is_registered(&xd->dev))
- queue_delayed_work(tb->wq, &xd->state_work,
- msecs_to_jiffies(XDOMAIN_SHORT_TIMEOUT));
+ if (xd) {
+ mutex_lock(&xd->lock);
+ if (!xd->removing && device_is_registered(&xd->dev))
+ queue_delayed_work(tb->wq, &xd->state_work,
+ msecs_to_jiffies(XDOMAIN_SHORT_TIMEOUT));
+ mutex_unlock(&xd->lock);
+ }
break;
case UUID_REQUEST_OLD:
@@ -818,8 +822,12 @@ static void tb_xdp_handle_request(struct work_struct *work)
* received UUID request from the remote host.
*/
if (!ret && xd && xd->state == XDOMAIN_STATE_ERROR) {
- dev_dbg(&xd->dev, "restarting handshake\n");
- start_handshake(xd);
+ mutex_lock(&xd->lock);
+ if (!xd->removing) {
+ dev_dbg(&xd->dev, "restarting handshake\n");
+ start_handshake(xd);
+ }
+ mutex_unlock(&xd->lock);
}
break;
@@ -885,9 +893,13 @@ static void tb_xdp_handle_request(struct work_struct *work)
ret = tb_xdp_link_state_change_response(ctl, route,
sequence, 0);
- xd->target_link_width = lsc->tlw;
- queue_delayed_work(tb->wq, &xd->state_work,
- msecs_to_jiffies(XDOMAIN_SHORT_TIMEOUT));
+ mutex_lock(&xd->lock);
+ if (!xd->removing) {
+ xd->target_link_width = lsc->tlw;
+ queue_delayed_work(tb->wq, &xd->state_work,
+ msecs_to_jiffies(XDOMAIN_SHORT_TIMEOUT));
+ }
+ mutex_unlock(&xd->lock);
} else {
tb_xdp_error_response(ctl, route, sequence,
ERROR_NOT_READY);
@@ -971,8 +983,12 @@ static int update_xdomain(struct device *dev, void *data)
xd = tb_to_xdomain(dev);
if (xd) {
- queue_delayed_work(xd->tb->wq, &xd->properties_changed_work,
- msecs_to_jiffies(50));
+ mutex_lock(&xd->lock);
+ if (!xd->removing)
+ queue_delayed_work(xd->tb->wq,
+ &xd->properties_changed_work,
+ msecs_to_jiffies(50));
+ mutex_unlock(&xd->lock);
}
return 0;
@@ -2200,6 +2216,11 @@ static int unregister_service(struct device *dev, void *data)
void tb_xdomain_remove(struct tb_xdomain *xd)
{
tb_xdomain_debugfs_remove(xd);
+
+ mutex_lock(&xd->lock);
+ xd->removing = true;
+ mutex_unlock(&xd->lock);
+
stop_handshake(xd);
tb_xdomain_link_exit(xd);
diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h
index b5659f8835171..feb1af175cfde 100644
--- a/include/linux/thunderbolt.h
+++ b/include/linux/thunderbolt.h
@@ -213,6 +213,8 @@ enum tb_link_width {
* @link_width: Width of the downstream facing link
* @link_usb4: Downstream link is USB4
* @is_unplugged: The XDomain is unplugged
+ * @removing: Set by tb_xdomain_remove() under @lock to prevent
+ * concurrent delayed work queueing
* @needs_uuid: If the XDomain does not have @remote_uuid it will be
* queried first
* @service_ids: Used to generate IDs for the services
@@ -262,6 +264,7 @@ struct tb_xdomain {
enum tb_link_width link_width;
bool link_usb4;
bool is_unplugged;
+ bool removing;
bool needs_uuid;
struct ida service_ids;
struct ida in_hopids;
base-commit: d73a08958e66849ea713d2f458b2fcf7b183f987
--
2.53.0
Hi,
On Wed, May 27, 2026 at 07:46:04AM -0400, Michael Bommarito wrote:
> tb_xdp_handle_request() runs on system_wq and queues
> xd->state_work via queue_delayed_work() in three request handlers:
> PROPERTIES_CHANGED_REQUEST, UUID_REQUEST (via start_handshake),
> and LINK_STATE_CHANGE_REQUEST. Similarly, update_xdomain() queues
> xd->properties_changed_work when local properties change.
>
> Concurrently, tb_xdomain_remove() calls stop_handshake() which does
> cancel_delayed_work_sync() on both delayed works. Later,
> tb_xdomain_unregister() calls device_unregister() which eventually
> frees the xdomain. Since commit 559c1e1e0134 ("thunderbolt: Run
> tb_xdp_handle_request() in system workqueue") moved the request
> handler off tb->wq, the handler and the remove path are no longer
> serialized. If queue_delayed_work() executes after
> cancel_delayed_work_sync() but before the xdomain is freed, the
> delayed work fires on a freed object.
>
> Add xd->removing that tb_xdomain_remove() sets under xd->lock
> before calling stop_handshake(). Each external queue site holds
> the same lock and checks removing before calling
> queue_delayed_work(). This provides the mutual exclusion needed:
> either the queue site acquires the lock first and queues work that
> the subsequent cancel will see, or the remove path acquires the
> lock first and the queue site observes removing == true and skips
> the queue.
>
> Fixes: 559c1e1e0134 ("thunderbolt: Run tb_xdp_handle_request() in system workqueue")
> Cc: stable@vger.kernel.org
> Assisted-by: Claude:claude-opus-4-7
> Signed-off-by: Michael Bommarito <michael.bommarito@gmail.com>
> ---
> v2: Rebased onto thunderbolt.git/next per Mika's request. Verified
> the race persists on next: tb_xdp_handle_request still runs on
> system_wq, the remove/unregister split does not add
> synchronization with the queue sites. Updated commit message to
> reflect that tb_xdomain_unregister() now does the
> device_unregister (split from tb_xdomain_remove on next).
Thanks Michael! Applied to thunderbolt.git/next. I would like to keep this
one for a while in linux-next and then send it with the rest for v7.2-rc1
where stable folks can then pick it up.
© 2016 - 2026 Red Hat, Inc.