From: David Jeffery <djeffery@redhat.com>
To prevent a thundering herd effect, implement a custom wake function
for the async subsystem that wakes only those waiters whose
dependencies have completed.
The async subsystem currently wakes all waiters on async_done whenever
an async task completes. When many tasks are trying to synchronize on
different cookie values, this creates a thundering herd problem: each
completing async task wakes up every waiter, most of which simply go
back to waiting after having caused lock contention and wasted CPU.
Signed-off-by: David Jeffery <djeffery@redhat.com>
Signed-off-by: Stuart Hayes <stuart.w.hayes@gmail.com>
---
kernel/async.c | 42 +++++++++++++++++++++++++++++++++++++++++-
1 file changed, 41 insertions(+), 1 deletion(-)
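
[Not part of the change, purely illustrative: a sketch of the kind of
caller pattern that puts many concurrent waiters on async_done. The
my_dev names are hypothetical; async_schedule() and
async_synchronize_cookie() are the real async API.]

	#include <linux/async.h>

	struct my_dev {
		async_cookie_t probe_cookie;
	};

	static void my_dev_init_async(void *data, async_cookie_t cookie)
	{
		/* Slow hardware init runs here, off the probe path. */
	}

	static int my_dev_probe(struct my_dev *dev)
	{
		dev->probe_cookie = async_schedule(my_dev_init_async, dev);
		return 0;
	}

	static void my_dev_wait(struct my_dev *dev)
	{
		/*
		 * Waits until every cookie below this value has completed.
		 * Before this patch, every async completion woke every such
		 * waiter just so it could re-check its own cookie.
		 */
		async_synchronize_cookie(dev->probe_cookie + 1);
	}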
diff --git a/kernel/async.c b/kernel/async.c
index 4c3e6a44595f..ae327f29bac9 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -76,6 +76,12 @@ struct async_entry {
 	struct async_domain *domain;
 };
 
+struct async_wait_entry {
+	wait_queue_entry_t wait;
+	async_cookie_t cookie;
+	struct async_domain *domain;
+};
+
 static DECLARE_WAIT_QUEUE_HEAD(async_done);
 
 static atomic_t entry_count;
@@ -298,6 +304,24 @@ void async_synchronize_full_domain(struct async_domain *domain)
 }
 EXPORT_SYMBOL_GPL(async_synchronize_full_domain);
 
+/**
+ * async_domain_wake_function - wait function for cookie synchronization
+ *
+ * Custom wait function for async_synchronize_cookie_domain to check cookie
+ * value. This prevents waking up waiting threads unnecessarily.
+ */
+static int async_domain_wake_function(struct wait_queue_entry *wait,
+				      unsigned int mode, int sync, void *key)
+{
+	struct async_wait_entry *await =
+		container_of(wait, struct async_wait_entry, wait);
+
+	if (lowest_in_progress(await->domain) < await->cookie)
+		return 0;
+
+	return autoremove_wake_function(wait, mode, sync, key);
+}
+
 /**
  * async_synchronize_cookie_domain - synchronize asynchronous function calls within a certain domain with cookie checkpointing
  * @cookie: async_cookie_t to use as checkpoint
@@ -310,11 +334,27 @@ EXPORT_SYMBOL_GPL(async_synchronize_full_domain);
 void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *domain)
 {
 	ktime_t starttime;
+	struct async_wait_entry await = {
+		.cookie = cookie,
+		.domain = domain,
+		.wait = {
+			.func = async_domain_wake_function,
+			.private = current,
+			.flags = 0,
+			.entry = LIST_HEAD_INIT(await.wait.entry),
+		}};
 
 	pr_debug("async_waiting @ %i\n", task_pid_nr(current));
 	starttime = ktime_get();
 
-	wait_event(async_done, lowest_in_progress(domain) >= cookie);
+	for (;;) {
+		prepare_to_wait(&async_done, &await.wait, TASK_UNINTERRUPTIBLE);
+
+		if (lowest_in_progress(domain) >= cookie)
+			break;
+		schedule();
+	}
+	finish_wait(&async_done, &await.wait);
 
 	pr_debug("async_continuing @ %i after %lli usec\n", task_pid_nr(current),
 		 microseconds_since(starttime));
--
2.39.3
On Wed, Jun 25, 2025 at 03:18:49PM -0500, Stuart Hayes wrote:
> From: David Jeffery <djeffery@redhat.com>
>
> To prevent a thundering herd effect, implement a custom wake function
> for the async subsystem that wakes only those waiters whose
> dependencies have completed.
[...]
> -	wait_event(async_done, lowest_in_progress(domain) >= cookie);
> +	for (;;) {
> +		prepare_to_wait(&async_done, &await.wait, TASK_UNINTERRUPTIBLE);
> +
> +		if (lowest_in_progress(domain) >= cookie)

This line introduces a bug on PREEMPT_RT because lowest_in_progress() may
sleep on PREEMPT_RT. If it does sleep, it'll corrupt the current task's
state by setting it to TASK_RUNNING after the sleep is over. IOW, the
current task's state might be TASK_RUNNING after lowest_in_progress()
returns.

lowest_in_progress() may sleep on PREEMPT_RT because it locks a non-raw
spin lock (async_lock).

> +			break;
> +		schedule();
> +	}
> +	finish_wait(&async_done, &await.wait);

Sultan
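
One possible shape for a fix, sketched here purely as an illustration
(it is not from this thread, and the ->done field is a hypothetical
addition): since the wake function already evaluates the condition, it
can record the result, so the waiter never calls lowest_in_progress()
while its task state is TASK_UNINTERRUPTIBLE.

	struct async_wait_entry {
		wait_queue_entry_t wait;
		async_cookie_t cookie;
		struct async_domain *domain;
		bool done;	/* hypothetical: set once the cookie is reached */
	};

	static int async_domain_wake_function(struct wait_queue_entry *wait,
					      unsigned int mode, int sync, void *key)
	{
		struct async_wait_entry *await =
			container_of(wait, struct async_wait_entry, wait);

		if (lowest_in_progress(await->domain) < await->cookie)
			return 0;

		/*
		 * Runs in the waker's context, where taking async_lock
		 * (inside lowest_in_progress()) is fine even on PREEMPT_RT.
		 */
		WRITE_ONCE(await->done, true);
		return autoremove_wake_function(wait, mode, sync, key);
	}

		/* ...and in async_synchronize_cookie_domain(): */
		add_wait_queue(&async_done, &await.wait);
		/* The sleeping condition check only ever runs in TASK_RUNNING. */
		while (lowest_in_progress(domain) < cookie) {
			set_current_state(TASK_UNINTERRUPTIBLE);
			/*
			 * set_current_state() is a full barrier, pairing with
			 * the wake function: either we observe ->done here, or
			 * the wake function observes our sleeping state and
			 * wakes us through try_to_wake_up().
			 */
			if (!READ_ONCE(await.done))
				schedule();
			__set_current_state(TASK_RUNNING);
		}
		finish_wait(&async_done, &await.wait);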