[PATCH v3] rhashtable: Bounce deferred worker kick through irq_work

Tejun Heo posted 1 patch 1 month, 3 weeks ago
include/linux/rhashtable-types.h |  3 +++
include/linux/rhashtable.h       |  3 ++-
lib/rhashtable.c                 | 31 ++++++++++++++++++++++++++++---
3 files changed, 33 insertions(+), 4 deletions(-)
[PATCH v3] rhashtable: Bounce deferred worker kick through irq_work
Posted by Tejun Heo 1 month, 3 weeks ago
Inserts past 75% load call schedule_work(&ht->run_work) to kick an
async resize. If a caller holds a raw spinlock (e.g. an
insecure_elasticity user), calling schedule_work() under that lock
makes lockdep record the dependency chain

  caller_lock -> pool->lock -> pi_lock -> rq->__lock

A cycle forms if any of these locks is acquired in the reverse
direction elsewhere. sched_ext, the only current insecure_elasticity
user, hits this: it holds scx_sched_lock across rhashtable inserts of
sub-schedulers, while scx_bypass() takes rq->__lock -> scx_sched_lock.
Exercising the resize path produces:

  Chain exists of:
    &pool->lock --> &rq->__lock --> scx_sched_lock

Bounce the kick from the insert paths through irq_work so
schedule_work() runs from hard IRQ context with the caller's lock no
longer held. rht_deferred_worker()'s self-rearm on error stays on
schedule_work(&ht->run_work) - the worker runs in process context with
no caller lock held, and keeping the self-requeue on @run_work lets
cancel_work_sync() in rhashtable_free_and_destroy() drain it.

v3: Keep rht_deferred_worker()'s self-rearm on schedule_work(&run_work).
    Routing it through irq_work in v2 broke cancel_work_sync()'s
    self-requeue handling: an irq_work queued by the worker after
    irq_work_sync() had returned, but while cancel_work_sync() was still
    waiting for that worker, could fire post-teardown.

v2: Bounce unconditionally instead of gating on insecure_elasticity,
    as suggested by Herbert.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
Herbert, dropped your Ack on v2. v2 routed rht_deferred_worker()'s
self-rearm through irq_work, which would race with
cancel_work_sync() in rhashtable_free_and_destroy(). v3 keeps only
the insert-path kicks on irq_work; the worker's self-rearm stays on
schedule_work(&ht->run_work).

 include/linux/rhashtable-types.h |  3 +++
 include/linux/rhashtable.h       |  3 ++-
 lib/rhashtable.c                 | 31 ++++++++++++++++++++++++++++---
 3 files changed, 33 insertions(+), 4 deletions(-)

diff --git a/include/linux/rhashtable-types.h b/include/linux/rhashtable-types.h
index 72082428d6c6..fc2f596a6df1 100644
--- a/include/linux/rhashtable-types.h
+++ b/include/linux/rhashtable-types.h
@@ -12,6 +12,7 @@
 #include <linux/alloc_tag.h>
 #include <linux/atomic.h>
 #include <linux/compiler.h>
+#include <linux/irq_work_types.h>
 #include <linux/mutex.h>
 #include <linux/workqueue_types.h>
 
@@ -77,6 +78,7 @@ struct rhashtable_params {
  * @p: Configuration parameters
  * @rhlist: True if this is an rhltable
  * @run_work: Deferred worker to expand/shrink asynchronously
+ * @run_irq_work: Bounces the @run_work kick through hard IRQ context.
  * @mutex: Mutex to protect current/future table swapping
  * @lock: Spin lock to protect walker list
  * @nelems: Number of elements in table
@@ -88,6 +90,7 @@ struct rhashtable {
 	struct rhashtable_params	p;
 	bool				rhlist;
 	struct work_struct		run_work;
+	struct irq_work			run_irq_work;
 	struct mutex                    mutex;
 	spinlock_t			lock;
 	atomic_t			nelems;
diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index 7def3f0f556b..ef5230cece36 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -20,6 +20,7 @@
 
 #include <linux/err.h>
 #include <linux/errno.h>
+#include <linux/irq_work.h>
 #include <linux/jhash.h>
 #include <linux/list_nulls.h>
 #include <linux/workqueue.h>
@@ -847,7 +848,7 @@ static __always_inline void *__rhashtable_insert_fast(
 	rht_assign_unlock(tbl, bkt, obj, flags);
 
 	if (rht_grow_above_75(ht, tbl))
-		schedule_work(&ht->run_work);
+		irq_work_queue(&ht->run_irq_work);
 
 	data = NULL;
 out:
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index fb2b7bc137ba..7a67ef5b67b6 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -441,10 +441,33 @@ static void rht_deferred_worker(struct work_struct *work)
 
 	mutex_unlock(&ht->mutex);
 
+	/*
+	 * Re-arm via @run_work, not @run_irq_work.
+	 * rhashtable_free_and_destroy() drains async work as irq_work_sync()
+	 * followed by cancel_work_sync(). If this site queued irq_work while
+	 * cancel_work_sync() was waiting for us, irq_work_sync() would already
+	 * have returned and the stale irq_work could fire post-teardown.
+	 * cancel_work_sync() natively handles self-requeue on @run_work.
+	 */
 	if (err)
 		schedule_work(&ht->run_work);
 }
 
+/*
+ * Insert-path callers can run under a raw spinlock (e.g. an insecure_elasticity
+ * user). Calling schedule_work() under that lock records caller_lock ->
+ * pool->lock -> pi_lock -> rq->__lock, closing a locking cycle if any of
+ * these is acquired in the reverse direction elsewhere. Bounce through
+ * irq_work so the schedule_work() runs with the caller's lock no longer held.
+ */
+static void rht_deferred_irq_work(struct irq_work *irq_work)
+{
+	struct rhashtable *ht = container_of(irq_work, struct rhashtable,
+					     run_irq_work);
+
+	schedule_work(&ht->run_work);
+}
+
 static int rhashtable_insert_rehash(struct rhashtable *ht,
 				    struct bucket_table *tbl)
 {
@@ -477,7 +500,7 @@ static int rhashtable_insert_rehash(struct rhashtable *ht,
 		if (err == -EEXIST)
 			err = 0;
 	} else
-		schedule_work(&ht->run_work);
+		irq_work_queue(&ht->run_irq_work);
 
 	return err;
 
@@ -488,7 +511,7 @@ static int rhashtable_insert_rehash(struct rhashtable *ht,
 
 	/* Schedule async rehash to retry allocation in process context. */
 	if (err == -ENOMEM)
-		schedule_work(&ht->run_work);
+		irq_work_queue(&ht->run_irq_work);
 
 	return err;
 }
@@ -630,7 +653,7 @@ static void *rhashtable_try_insert(struct rhashtable *ht, const void *key,
 			rht_unlock(tbl, bkt, flags);
 
 			if (inserted && rht_grow_above_75(ht, tbl))
-				schedule_work(&ht->run_work);
+				irq_work_queue(&ht->run_irq_work);
 		}
 	} while (!IS_ERR_OR_NULL(new_tbl));
 
@@ -1085,6 +1108,7 @@ int rhashtable_init_noprof(struct rhashtable *ht,
 	RCU_INIT_POINTER(ht->tbl, tbl);
 
 	INIT_WORK(&ht->run_work, rht_deferred_worker);
+	init_irq_work(&ht->run_irq_work, rht_deferred_irq_work);
 
 	return 0;
 }
@@ -1150,6 +1174,7 @@ void rhashtable_free_and_destroy(struct rhashtable *ht,
 	struct bucket_table *tbl, *next_tbl;
 	unsigned int i;
 
+	irq_work_sync(&ht->run_irq_work);
 	cancel_work_sync(&ht->run_work);
 
 	mutex_lock(&ht->mutex);
-- 
2.53.0
Re: [PATCH v3] rhashtable: Bounce deferred worker kick through irq_work
Posted by Hillf Danton 1 month ago
On Mon, 20 Apr 2026 20:03:26 -1000 Tejun Heo wrote:
> --- a/lib/rhashtable.c
> +++ b/lib/rhashtable.c
> @@ -441,10 +441,33 @@ static void rht_deferred_worker(struct work_struct *work)
>  
>  	mutex_unlock(&ht->mutex);
>  
> +	/*
> +	 * Re-arm via @run_work, not @run_irq_work.
> +	 * rhashtable_free_and_destroy() drains async work as irq_work_sync()
> +	 * followed by cancel_work_sync(). If this site queued irq_work while
> +	 * cancel_work_sync() was waiting for us, irq_work_sync() would already
> +	 * have returned and the stale irq_work could fire post-teardown.
> +	 * cancel_work_sync() natively handles self-requeue on @run_work.
> +	 */
>  	if (err)
>  		schedule_work(&ht->run_work);
>  }
> 
My two cents: add a BUG_ON() to catch any failure to handle the self-requeue.

--- x/kernel/workqueue.c
+++ y/kernel/workqueue.c
@@ -2369,6 +2369,10 @@ retry:
 		work_flags |= WORK_STRUCT_INACTIVE;
 		insert_work(pwq, work, &pwq->inactive_works, work_flags);
 	}
+	do {
+		unsigned long data = *work_data_bits(work);
+		BUG_ON(data & WORK_OFFQ_DISABLE_MASK);
+	} while (0);
 
 out:
 	raw_spin_unlock(&pool->lock);
--
Re: [PATCH v3] rhashtable: Bounce deferred worker kick through irq_work
Posted by Herbert Xu 1 month, 3 weeks ago
On Mon, Apr 20, 2026 at 08:03:26PM -1000, Tejun Heo wrote:
> Inserts past 75% load call schedule_work(&ht->run_work) to kick an
> async resize. If a caller holds a raw spinlock (e.g. an
> insecure_elasticity user), schedule_work() under that lock records
> 
>   caller_lock -> pool->lock -> pi_lock -> rq->__lock
> 
> A cycle forms if any of these locks is acquired in the reverse
> direction elsewhere. sched_ext, the only current insecure_elasticity
> user, hits this: it holds scx_sched_lock across rhashtable inserts of
> sub-schedulers, while scx_bypass() takes rq->__lock -> scx_sched_lock.
> Exercising the resize path produces:
> 
>   Chain exists of:
>     &pool->lock --> &rq->__lock --> scx_sched_lock
> 
> Bounce the kick from the insert paths through irq_work so
> schedule_work() runs from hard IRQ context with the caller's lock no
> longer held. rht_deferred_worker()'s self-rearm on error stays on
> schedule_work(&ht->run_work) - the worker runs in process context with
> no caller lock held, and keeping the self-requeue on @run_work lets
> cancel_work_sync() in rhashtable_free_and_destroy() drain it.
> 
> v3: Keep rht_deferred_worker()'s self-rearm on schedule_work(&run_work).
>     Routing it through irq_work in v2 broke cancel_work_sync()'s
>     self-requeue handling - an irq_work queued after irq_work_sync()
>     returned but while cancel_work_sync() was still waiting could fire
>     post-teardown.
> 
> v2: Bounce unconditionally instead of gating on insecure_elasticity,
>     as suggested by Herbert.
> 
> Signed-off-by: Tejun Heo <tj@kernel.org>
> ---
> Herbert, dropped your Ack on v2. v2 routed rht_deferred_worker()'s
> self-rearm through irq_work, which would race with
> cancel_work_sync() in rhashtable_free_and_destroy(). v3 keeps only
> the insert-path kicks on irq_work; the worker's self-rearm stays on
> schedule_work(&ht->run_work).
> 
>  include/linux/rhashtable-types.h |  3 +++
>  include/linux/rhashtable.h       |  3 ++-
>  lib/rhashtable.c                 | 31 ++++++++++++++++++++++++++++---
>  3 files changed, 33 insertions(+), 4 deletions(-)

Acked-by: Herbert Xu <herbert@gondor.apana.org.au>

Thanks,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
Re: [PATCH v3] rhashtable: Bounce deferred worker kick through irq_work
Posted by Tejun Heo 1 month, 3 weeks ago
Hello,

Applied to sched_ext/for-7.1-fixes.

Thanks.
--
tejun