From: Chuck Lever <chuck.lever@oracle.com>
The default affinity scope WQ_AFFN_CACHE assumes systems have multiple
last-level caches. On systems where all CPUs share a single LLC (common
with Intel monolithic dies), this scope degenerates to a single worker
pool. All queue_work() calls then contend on that pool's single lock,
causing severe performance degradation under high-throughput workloads.
For example, on a 12-core system with a single shared L3 cache running
NFS over RDMA with 12 fio jobs, perf shows approximately 39% of CPU
cycles spent in native_queued_spin_lock_slowpath, nearly all from
__queue_work() contending on the single pool lock.
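A profile like the above can be gathered with an ordinary system-wide
cycles profile while the workload runs, for example (the 30-second
window is arbitrary and the fio job file is not part of this patch):

  perf record -a -g -- sleep 30
  perf report --no-children

with native_queued_spin_lock_slowpath at the top of the report and
__queue_work() as its dominant caller in the call graph.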
On such systems WQ_AFFN_CACHE, WQ_AFFN_SMT, and WQ_AFFN_NUMA scopes all
collapse to a single pod.
Add wq_effective_affn_scope() to detect when a selected affinity scope
provides only one pod despite having multiple CPUs, and automatically
fall back to a finer-grained scope. This ensures reasonable lock
distribution without requiring manual configuration via the
workqueue.default_affinity_scope parameter or per-workqueue sysfs
tuning.
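For reference, the manual workarounds being avoided look roughly like
the following (sysfs path per Documentation/core-api/workqueue.rst; the
per-workqueue knob applies to WQ_SYSFS workqueues and the workqueue name
is a placeholder):

  workqueue.default_affinity_scope=cpu                           (boot parameter)
  echo cpu > /sys/devices/virtual/workqueue/<wq>/affinity_scope  (per workqueue)

With the automatic fallback, neither step is needed on single-LLC
systems.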
The fallback is conservative: it triggers only when a scope degenerates
to exactly one pod, and respects explicitly configured (non-default)
scopes.
Also update wq_affn_scope_show() to display the effective scope when
fallback occurs, making the behavior transparent to administrators
via sysfs (e.g., "default (cache -> smt)").
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
include/linux/workqueue.h | 7 ++++-
kernel/workqueue.c | 60 +++++++++++++++++++++++++++++++++++----
2 files changed, 60 insertions(+), 7 deletions(-)
diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index dabc351cc127..130c452fcecf 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -128,10 +128,15 @@ struct rcu_work {
struct workqueue_struct *wq;
};
+/*
+ * Affinity scopes are ordered from finest to coarsest granularity. This
+ * ordering is used by the automatic fallback logic in wq_effective_affn_scope()
+ * which walks from coarse toward fine when a scope degenerates to a single pod.
+ */
enum wq_affn_scope {
WQ_AFFN_DFL, /* use system default */
WQ_AFFN_CPU, /* one pod per CPU */
- WQ_AFFN_SMT, /* one pod poer SMT */
+ WQ_AFFN_SMT, /* one pod per SMT */
WQ_AFFN_CACHE, /* one pod per LLC */
WQ_AFFN_NUMA, /* one pod per NUMA node */
WQ_AFFN_SYSTEM, /* one pod across the whole system */
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 253311af47c6..efbc10ef79fb 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -4753,6 +4753,39 @@ static void wqattrs_actualize_cpumask(struct workqueue_attrs *attrs,
cpumask_copy(attrs->cpumask, unbound_cpumask);
}
+/*
+ * Determine the effective affinity scope. If the configured scope results
+ * in a single pod (e.g., WQ_AFFN_CACHE on a system with one shared LLC),
+ * fall back to a finer-grained scope to distribute pool lock contention.
+ *
+ * The search stops at WQ_AFFN_CPU, which always provides one pod per CPU
+ * and thus cannot degenerate further.
+ *
+ * Returns the scope to actually use, which may differ from the configured
+ * scope on systems where coarser scopes degenerate.
+ */
+static enum wq_affn_scope wq_effective_affn_scope(enum wq_affn_scope scope)
+{
+ struct wq_pod_type *pt;
+
+ /*
+ * Walk from the requested scope toward finer granularity. Stop when
+ * a scope provides more than one pod, or when CPU scope is reached.
+ * CPU scope always provides nr_possible_cpus() pods.
+ */
+ while (scope > WQ_AFFN_CPU) {
+ pt = &wq_pod_types[scope];
+
+ /* Multiple pods at this scope; no fallback needed */
+ if (pt->nr_pods > 1)
+ break;
+
+ scope--;
+ }
+
+ return scope;
+}
+
/* find wq_pod_type to use for @attrs */
static const struct wq_pod_type *
wqattrs_pod_type(const struct workqueue_attrs *attrs)
@@ -4763,8 +4796,13 @@ wqattrs_pod_type(const struct workqueue_attrs *attrs)
/* to synchronize access to wq_affn_dfl */
lockdep_assert_held(&wq_pool_mutex);
+ /*
+ * For default scope, apply automatic fallback for degenerate
+ * topologies. Explicit scope selection via sysfs or per-workqueue
+ * attributes bypasses fallback, preserving administrator intent.
+ */
if (attrs->affn_scope == WQ_AFFN_DFL)
- scope = wq_affn_dfl;
+ scope = wq_effective_affn_scope(wq_affn_dfl);
else
scope = attrs->affn_scope;
@@ -7206,16 +7244,26 @@ static ssize_t wq_affn_scope_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct workqueue_struct *wq = dev_to_wq(dev);
+ enum wq_affn_scope scope, effective;
int written;
mutex_lock(&wq->mutex);
- if (wq->unbound_attrs->affn_scope == WQ_AFFN_DFL)
- written = scnprintf(buf, PAGE_SIZE, "%s (%s)\n",
- wq_affn_names[WQ_AFFN_DFL],
- wq_affn_names[wq_affn_dfl]);
- else
+ if (wq->unbound_attrs->affn_scope == WQ_AFFN_DFL) {
+ scope = wq_affn_dfl;
+ effective = wq_effective_affn_scope(scope);
+ if (effective != scope)
+ written = scnprintf(buf, PAGE_SIZE, "%s (%s -> %s)\n",
+ wq_affn_names[WQ_AFFN_DFL],
+ wq_affn_names[scope],
+ wq_affn_names[effective]);
+ else
+ written = scnprintf(buf, PAGE_SIZE, "%s (%s)\n",
+ wq_affn_names[WQ_AFFN_DFL],
+ wq_affn_names[scope]);
+ } else {
written = scnprintf(buf, PAGE_SIZE, "%s\n",
wq_affn_names[wq->unbound_attrs->affn_scope]);
+ }
mutex_unlock(&wq->mutex);
return written;
--
2.52.0
Hello,

On Tue, Feb 03, 2026 at 09:37:44AM -0500, Chuck Lever wrote:
> On such systems WQ_AFFN_CACHE, WQ_AFFN_SMT, and WQ_AFFN_NUMA scopes all
> collapse to a single pod.

WQ_AFFN_SMT should be on CPU core boundaries, right?

> Add wq_effective_affn_scope() to detect when a selected affinity scope
> provides only one pod despite having multiple CPUs, and automatically
> fall back to a finer-grained scope. This ensures reasonable lock
> distribution without requiring manual configuration via the
> workqueue.default_affinity_scope parameter or per-workqueue sysfs
> tuning.
>
> The fallback is conservative: it triggers only when a scope degenerates
> to exactly one pod, and respects explicitly configured (non-default)
> scopes.

While I understand the problem, I don't think dropping down to core
boundary for unbound workqueues by default makes sense. That may help
with some use cases but cause problem with others. Given that
WQ_AFFN_CACHE is the same as WQ_AFFN_NUMA on these machines, maybe we
can shard it automatically according to some heuristics or maybe we can
introduce another affinity level between CACHE and SMT which is sharded
on machines with too many CPUs in a single cache domain.

Thanks.

--
tejun
On 2/3/26 2:10 PM, Tejun Heo wrote:
> Hello,
>
> On Tue, Feb 03, 2026 at 09:37:44AM -0500, Chuck Lever wrote:
>> On such systems WQ_AFFN_CACHE, WQ_AFFN_SMT, and WQ_AFFN_NUMA scopes all
>> collapse to a single pod.
>
> WQ_AFFN_SMT should be on CPU core boundaries, right?
>
>> Add wq_effective_affn_scope() to detect when a selected affinity scope
>> provides only one pod despite having multiple CPUs, and automatically
>> fall back to a finer-grained scope. This ensures reasonable lock
>> distribution without requiring manual configuration via the
>> workqueue.default_affinity_scope parameter or per-workqueue sysfs
>> tuning.
>>
>> The fallback is conservative: it triggers only when a scope degenerates
>> to exactly one pod, and respects explicitly configured (non-default)
>> scopes.
>
> While I understand the problem, I don't think dropping down to core boundary
> for unbound workqueues by default makes sense. That may help with some use
> cases but cause problem with others.

I've never seen a case where it doesn't help. In order to craft an
alternative, I'll need to have some examples to avoid. Is it only the
SMT case that is concerning?

> Given that WQ_AFFN_CACHE is the same as
> WQ_AFFN_NUMA on these machines, maybe we can shard it automatically
> according to some heuristics or maybe we can introduce another affinity
> level between CACHE and SMT which is sharded on machines with too many CPUs
> in a single cache domain.

--
Chuck Lever
On Tue, Feb 03, 2026 at 03:14:46PM -0500, Chuck Lever wrote:
> > While I understand the problem, I don't think dropping down to core boundary
> > for unbound workqueues by default makes sense. That may help with some use
> > cases but cause problem with others.
>
> I've never seen a case where it doesn't help. In order to craft an
> alternative, I'll need to have some examples to avoid. Is it only the
> SMT case that is concerning?

It's just a lot of separate pools on large machines. If you have relatively
high concurrency, the number of workers can go pretty high. They'd also
migrate back and forth more depending on usage pattern and have worse cache
locality. Imagine you have a bursty workload wandering through the system,
if you have nr_cores pools, it can easily end up with kworkers > nr_cores *
max_concurrency.

Thanks.

--
tejun
On 2/3/26 3:29 PM, Tejun Heo wrote:
> On Tue, Feb 03, 2026 at 03:14:46PM -0500, Chuck Lever wrote:
>>> While I understand the problem, I don't think dropping down to core boundary
>>> for unbound workqueues by default makes sense. That may help with some use
>>> cases but cause problem with others.
>>
>> I've never seen a case where it doesn't help. In order to craft an
>> alternative, I'll need to have some examples to avoid. Is it only the
>> SMT case that is concerning?
>
> It's just a lot of separate pools on large machines. If you have relatively
> high concurrency, the number of workers can go pretty high. They'd also
> migrate back and forth more depending on usage pattern and have worse cache
> locality. Imagine you have a bursty workload wandering through the system,
> if you have nr_cores pools, it can easily end up with kworkers > nr_cores *
> max_concurrency.

The patch addresses that, I'd hope, by only switching to per-CPU on
single pod (ie, simple) systems. Larger, more complicated, topologies
should be left unchanged. I imagine that on a single pod machine with a
large number of cores, having per-CPU locking will nearly always be a
win.

--
Chuck Lever
Hello,

On Tue, Feb 03, 2026 at 03:34:22PM -0500, Chuck Lever wrote:
> The patch addresses that, I'd hope, by only switching to per-CPU on
> single pod (ie, simple) systems. Larger, more complicated, topologies
> should be left unchanged. I imagine that on a single pod machine with a
> large number of cores, having per-CPU locking will nearly always be a
> win.

Oh, I mean, unfortunately, intel produces chips with a lot of CPUs on a
single L3 cache. e.g. Recent intel chips have upto 128 cores per socket
and each socket still presents as a single L3 cache domain, so falling
back to AFFN_SMT would mean that all unbound workqueues by default would
be backed by 128 pools per socket. Create some hundreds of threads per
pool and then now you end up with hundreds of thousands of kworkers.

Thanks.

--
tejun
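To make the sharding suggestion above concrete, here is a rough
standalone sketch, not kernel code: the 16-CPU cap, the shard_pod() name,
and the 128-core example are illustrative assumptions, not a proposed
implementation.

#include <stdio.h>

/*
 * Toy model of the sharding heuristic discussed above: split a cache
 * domain with nr_cpus CPUs into ceil(nr_cpus / max_pod_cpus) pods and
 * assign each CPU to a contiguous slice of that domain.
 */
static int shard_pod(int cpu, int nr_cpus, int max_pod_cpus)
{
	int nr_pods = (nr_cpus + max_pod_cpus - 1) / max_pod_cpus;

	/* CPUs 0..15 land in pod 0, 16..31 in pod 1, and so on. */
	return cpu * nr_pods / nr_cpus;
}

int main(void)
{
	int nr_cpus = 128;	/* e.g. one big single-LLC socket */
	int cap = 16;		/* assumed per-pod CPU cap */

	for (int cpu = 0; cpu < nr_cpus; cpu++)
		printf("cpu %3d -> pod %d\n", cpu, shard_pod(cpu, nr_cpus, cap));
	return 0;
}

A heuristic along these lines would keep pod counts bounded on large
single-LLC sockets (8 pods of 16 CPUs in the 128-core example) rather
than falling all the way down to per-core or per-CPU pools.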