A later patch will require a sleepable context in the idle worker timeout
function. Converting worker_pool.idle_timer to a delayed_work gives us just
that.
One caveat is that we need to be careful about re-queuing the dwork from
its callback function. Lai expressed concerns about overtly violating
documented locking rules, but extra locking is required around delaying the
dwork, else a worker thread adding itself to the idle_list might push the
dwork further back (IDLE_WORKER_TIMEOUT) than the work callback would (next
idle worker expiry).
No change in functionality intended.
Signed-off-by: Valentin Schneider <vschneid@redhat.com>
---
kernel/workqueue.c | 41 +++++++++++++++++++++++++++++------------
1 file changed, 29 insertions(+), 12 deletions(-)
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 8185a42848c5..436b1dbdf9ff 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -167,9 +167,9 @@ struct worker_pool {
int nr_workers; /* L: total number of workers */
int nr_idle; /* L: currently idle workers */
- struct list_head idle_list; /* L: list of idle workers */
- struct timer_list idle_timer; /* L: worker idle timeout */
- struct timer_list mayday_timer; /* L: SOS timer for workers */
+ struct list_head idle_list; /* L: list of idle workers */
+ struct delayed_work idle_reaper_work; /* L: worker idle timeout */
+ struct timer_list mayday_timer; /* L: SOS timer for workers */
/* a workers is either on busy_hash or idle_list, or the manager */
DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER);
@@ -1806,8 +1806,10 @@ static void worker_enter_idle(struct worker *worker)
/* idle_list is LIFO */
list_add(&worker->entry, &pool->idle_list);
- if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
- mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
+ if (too_many_workers(pool) && !delayed_work_pending(&pool->idle_reaper_work))
+ mod_delayed_work(system_unbound_wq,
+ &pool->idle_reaper_work,
+ IDLE_WORKER_TIMEOUT);
/* Sanity check nr_running. */
WARN_ON_ONCE(pool->nr_workers == pool->nr_idle && pool->nr_running);
@@ -2019,22 +2021,37 @@ static void destroy_worker(struct worker *worker)
wake_up_process(worker->task);
}
-static void idle_worker_timeout(struct timer_list *t)
+/*
+ * idle_reaper_fn - reap workers that have been idle for too long.
+ *
+ * The delayed_work is only ever modified under raw_spin_lock_irq(pool->lock).
+ */
+static void idle_reaper_fn(struct work_struct *work)
{
- struct worker_pool *pool = from_timer(pool, t, idle_timer);
+ struct delayed_work *dwork = to_delayed_work(work);
+ struct worker_pool *pool = container_of(dwork, struct worker_pool, idle_reaper_work);
raw_spin_lock_irq(&pool->lock);
while (too_many_workers(pool)) {
struct worker *worker;
unsigned long expires;
+ unsigned long now = jiffies;
- /* idle_list is kept in LIFO order, check the last one */
+ /* idle_list is kept in LIFO order, check the oldest entry */
worker = list_entry(pool->idle_list.prev, struct worker, entry);
expires = worker->last_active + IDLE_WORKER_TIMEOUT;
- if (time_before(jiffies, expires)) {
- mod_timer(&pool->idle_timer, expires);
+ /*
+ * Careful: queueing a work item from here can and will cause a
+ * self-deadlock when dealing with an unbound pool. However,
+ * here the delay *cannot* be zero, so the queuing will
+ * happen in the timer callback.
+ */
+ if (time_before(now, expires)) {
+ mod_delayed_work(system_unbound_wq,
+ &pool->idle_reaper_work,
+ expires - now);
break;
}
@@ -3478,7 +3495,7 @@ static int init_worker_pool(struct worker_pool *pool)
INIT_LIST_HEAD(&pool->idle_list);
hash_init(pool->busy_hash);
- timer_setup(&pool->idle_timer, idle_worker_timeout, TIMER_DEFERRABLE);
+ INIT_DEFERRABLE_WORK(&pool->idle_reaper_work, idle_reaper_fn);
timer_setup(&pool->mayday_timer, pool_mayday_timeout, 0);
@@ -3625,7 +3642,7 @@ static void put_unbound_pool(struct worker_pool *pool)
wait_for_completion(pool->detach_completion);
/* shut down the timers */
- del_timer_sync(&pool->idle_timer);
+ cancel_delayed_work_sync(&pool->idle_reaper_work);
del_timer_sync(&pool->mayday_timer);
/* RCU protected to allow dereferences from get_work_pool() */
--
2.31.1
Hello,
On Tue, Oct 04, 2022 at 04:05:20PM +0100, Valentin Schneider wrote:
> +static void idle_reaper_fn(struct work_struct *work)
> {
> - struct worker_pool *pool = from_timer(pool, t, idle_timer);
> + struct delayed_work *dwork = to_delayed_work(work);
> + struct worker_pool *pool = container_of(dwork, struct worker_pool, idle_reaper_work);
>
> raw_spin_lock_irq(&pool->lock);
>
> while (too_many_workers(pool)) {
> struct worker *worker;
> unsigned long expires;
> + unsigned long now = jiffies;
>
> - /* idle_list is kept in LIFO order, check the last one */
> + /* idle_list is kept in LIFO order, check the oldest entry */
> worker = list_entry(pool->idle_list.prev, struct worker, entry);
> expires = worker->last_active + IDLE_WORKER_TIMEOUT;
>
> - if (time_before(jiffies, expires)) {
> - mod_timer(&pool->idle_timer, expires);
So, one thing which bothers me is that the idle timer is supposed to go off
spuriously periodically. The idea being that letting it expire spuriously
should usually be cheaper than trying to update it constantly as workers
wake up and sleep. Converting the timer to a delayed work makes spurious
wakeups significantly more expensive though, as it's now a full scheduling
event.
Can we separate the timer and work item out so that we can kick off the work
item iff there actually are tasks to kill?
Thanks.
--
tejun
On 31/10/22 08:49, Tejun Heo wrote:
> Hello,
>
> On Tue, Oct 04, 2022 at 04:05:20PM +0100, Valentin Schneider wrote:
>> +static void idle_reaper_fn(struct work_struct *work)
>> {
>> - struct worker_pool *pool = from_timer(pool, t, idle_timer);
>> + struct delayed_work *dwork = to_delayed_work(work);
>> + struct worker_pool *pool = container_of(dwork, struct worker_pool, idle_reaper_work);
>>
>> raw_spin_lock_irq(&pool->lock);
>>
>> while (too_many_workers(pool)) {
>> struct worker *worker;
>> unsigned long expires;
>> + unsigned long now = jiffies;
>>
>> - /* idle_list is kept in LIFO order, check the last one */
>> + /* idle_list is kept in LIFO order, check the oldest entry */
>> worker = list_entry(pool->idle_list.prev, struct worker, entry);
>> expires = worker->last_active + IDLE_WORKER_TIMEOUT;
>>
>> - if (time_before(jiffies, expires)) {
>> - mod_timer(&pool->idle_timer, expires);
>
> So, one thing which bothers me is that the idle timer is supposed to go off
> spuriously periodically. The idea being that letting it expire spuriously
> should usually be cheaper than trying to update it constantly as workers
> wake up and sleep. Converting the timer to a delayed work makes spurious
> wakeups significantly more expensive tho as it's now a full scheduling
> event.
>
Right.
> Can we separate the timer and work item out so that we can kick off the work
> item iff there actually are tasks to kill?
>
One thing I can try is to have a DIY delayed_work where the timer callback
doesn't just queue the work but first checks for too_many_workers(). This
will most likely result in a different behaviour as worker deletion will
then involve two pool->lock regions, but this will still catch long-idling
workers.
> Thanks.
>
> --
> tejun
© 2016 - 2026 Red Hat, Inc.