In a system with high real-time requirements, we have noticed that many
high-priority tasks, such as kernel threads responsible for dispatching
GPU tasks and receiving input data, often experience latency spikes
because the work items they depend on are not executed with real-time
urgency.
The kworker threads are shared globally based on the attributes of the
workqueue (wq) and the parameters of queue_work_on. This means that
regardless of whether you create a new wq or use an existing one, the
kworker thread that processes the work does not exclusively run any
specific work or work from a specific wq. While this design saves
resources, it makes it difficult, in hard real-time scenarios, to
guarantee real-time execution of a given work item by adjusting the
priority of whichever kworker happens to run it. Additionally, even if
we manually raise the kworker to a real-time priority while it executes
the work item and restore it on completion, the next time queue_work_on()
is called the kworker that picks up the work is back at its normal
priority, so timely execution still cannot be guaranteed. Moreover,
frequent priority adjustments incur additional overhead.
Perhaps we could implement all logic related to hard real-time tasks
using kernel threads, but I believe this effort is unnecessary. The
existing workqueue mechanism is well-structured and can guarantee that
work items execute in an orderly manner in concurrent scenarios by
adjusting max_active and the WQ_ORDERED attribute. We only need to
introduce a WQ_RT flag and add a small amount of code to meet the
requirements of hard real-time workqueues.
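For illustration, a driver that needs its work items handled with
real-time urgency would then only need something along the following
lines (a sketch; the workqueue and function names are made up and not
part of this patch):

#include <linux/module.h>
#include <linux/workqueue.h>

/* Sketch only, not part of this patch: the names are made up. */
static struct workqueue_struct *example_rt_wq;

static void example_work_fn(struct work_struct *work)
{
        /* time-critical handling; with WQ_RT this runs on a SCHED_FIFO kworker */
}
static DECLARE_WORK(example_work, example_work_fn);

static int __init example_init(void)
{
        /* WQ_RT combines with max_active / ordered workqueues as usual */
        example_rt_wq = alloc_workqueue("example_rt", WQ_RT, 1);
        if (!example_rt_wq)
                return -ENOMEM;

        queue_work(example_rt_wq, &example_work);
        return 0;
}
module_init(example_init);
MODULE_LICENSE("GPL");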
Signed-off-by: Xin Zhao <jackzxcui1989@163.com>
---
include/linux/workqueue.h | 6 ++++++
kernel/workqueue.c | 34 ++++++++++++++++++++++++----------
2 files changed, 30 insertions(+), 10 deletions(-)
diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 45d5dd470..973876b79 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -16,6 +16,7 @@
#include <linux/cpumask_types.h>
#include <linux/rcupdate.h>
#include <linux/workqueue_types.h>
+#include <uapi/linux/sched/types.h>
/*
* The first word is the work queue pointer and the flags rolled into
@@ -404,6 +405,8 @@ enum wq_flags {
WQ_POWER_EFFICIENT = 1 << 7,
WQ_PERCPU = 1 << 8, /* bound to a specific cpu */
+ WQ_RT = 1 << 9, /* work items are executed by SCHED_FIFO kworkers */
+
__WQ_DESTROYING = 1 << 15, /* internal: workqueue is destroying */
__WQ_DRAINING = 1 << 16, /* internal: workqueue is draining */
__WQ_ORDERED = 1 << 17, /* internal: workqueue is ordered */
@@ -460,6 +463,7 @@ enum wq_consts {
extern struct workqueue_struct *system_wq; /* use system_percpu_wq, this will be removed */
extern struct workqueue_struct *system_percpu_wq;
extern struct workqueue_struct *system_highpri_wq;
+extern struct workqueue_struct *system_rt_wq;
extern struct workqueue_struct *system_long_wq;
extern struct workqueue_struct *system_unbound_wq;
extern struct workqueue_struct *system_dfl_wq;
@@ -781,6 +785,8 @@ extern void __warn_flushing_systemwide_wq(void)
_wq == system_wq) || \
(__builtin_constant_p(_wq == system_highpri_wq) && \
_wq == system_highpri_wq) || \
+ (__builtin_constant_p(_wq == system_rt_wq) && \
+ _wq == system_rt_wq) || \
(__builtin_constant_p(_wq == system_long_wq) && \
_wq == system_long_wq) || \
(__builtin_constant_p(_wq == system_unbound_wq) && \
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index c6b79b367..ccbf19e3a 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -103,7 +103,7 @@ enum work_cancel_flags {
};
enum wq_internal_consts {
- NR_STD_WORKER_POOLS = 2, /* # standard pools per cpu */
+ NR_STD_WORKER_POOLS = 3, /* # standard pools per cpu */
UNBOUND_POOL_HASH_ORDER = 6, /* hashed by pool->attrs */
BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */
@@ -123,6 +123,7 @@ enum wq_internal_consts {
*/
RESCUER_NICE_LEVEL = MIN_NICE,
HIGHPRI_NICE_LEVEL = MIN_NICE,
+ RTPRI_LEVEL = MIN_NICE - 1, /* out-of-range nice value marking RT pools */
WQ_NAME_LEN = 32,
WORKER_ID_LEN = 10 + WQ_NAME_LEN, /* "kworker/R-" + WQ_NAME_LEN */
@@ -509,6 +510,8 @@ struct workqueue_struct *system_percpu_wq __ro_after_init;
EXPORT_SYMBOL(system_percpu_wq);
struct workqueue_struct *system_highpri_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_highpri_wq);
+struct workqueue_struct *system_rt_wq __read_mostly;
+EXPORT_SYMBOL_GPL(system_rt_wq);
struct workqueue_struct *system_long_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_long_wq);
struct workqueue_struct *system_unbound_wq __ro_after_init;
@@ -2751,7 +2754,8 @@ static int format_worker_id(char *buf, size_t size, struct worker *worker,
if (pool->cpu >= 0)
return scnprintf(buf, size, "kworker/%d:%d%s",
pool->cpu, worker->id,
- pool->attrs->nice < 0 ? "H" : "");
+ pool->attrs->nice < 0 ?
+ (pool->attrs->nice == RTPRI_LEVEL ? "F" : "H") : "");
else
return scnprintf(buf, size, "kworker/u%d:%d",
pool->id, worker->id);
@@ -2760,6 +2764,9 @@ static int format_worker_id(char *buf, size_t size, struct worker *worker,
}
}
+static int kworker_rt_prio = 1;
+module_param(kworker_rt_prio, int, 0444);
+
/**
* create_worker - create a new workqueue worker
* @pool: pool the new worker will belong to
@@ -2776,6 +2783,7 @@ static struct worker *create_worker(struct worker_pool *pool)
{
struct worker *worker;
int id;
+ struct sched_param sp;
/* ID is needed to determine kthread name */
id = ida_alloc(&pool->worker_ida, GFP_KERNEL);
@@ -2810,7 +2818,12 @@ static struct worker *create_worker(struct worker_pool *pool)
goto fail;
}
- set_user_nice(worker->task, pool->attrs->nice);
+ if (pool->attrs->nice == RTPRI_LEVEL) {
+ sp.sched_priority = kworker_rt_prio;
+ sched_setscheduler_nocheck(worker->task, SCHED_FIFO, &sp);
+ } else {
+ set_user_nice(worker->task, pool->attrs->nice);
+ }
kthread_bind_mask(worker->task, pool_allowed_cpus(pool));
}
@@ -5470,7 +5483,7 @@ static void unbound_wq_update_pwq(struct workqueue_struct *wq, int cpu)
static int alloc_and_link_pwqs(struct workqueue_struct *wq)
{
- bool highpri = wq->flags & WQ_HIGHPRI;
+ int prio = (wq->flags & WQ_RT) ? 2 : (wq->flags & WQ_HIGHPRI ? 1 : 0);
int cpu, ret;
lockdep_assert_held(&wq_pool_mutex);
@@ -5491,7 +5504,7 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq)
struct pool_workqueue **pwq_p;
struct worker_pool *pool;
- pool = &(per_cpu_ptr(pools, cpu)[highpri]);
+ pool = &(per_cpu_ptr(pools, cpu)[prio]);
pwq_p = per_cpu_ptr(wq->cpu_pwq, cpu);
*pwq_p = kmem_cache_alloc_node(pwq_cache, GFP_KERNEL,
@@ -5511,14 +5524,14 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq)
if (wq->flags & __WQ_ORDERED) {
struct pool_workqueue *dfl_pwq;
- ret = apply_workqueue_attrs_locked(wq, ordered_wq_attrs[highpri]);
+ ret = apply_workqueue_attrs_locked(wq, ordered_wq_attrs[prio]);
/* there should only be single pwq for ordering guarantee */
dfl_pwq = rcu_access_pointer(wq->dfl_pwq);
WARN(!ret && (wq->pwqs.next != &dfl_pwq->pwqs_node ||
wq->pwqs.prev != &dfl_pwq->pwqs_node),
"ordering guarantee broken for workqueue %s\n", wq->name);
} else {
- ret = apply_workqueue_attrs_locked(wq, unbound_std_wq_attrs[highpri]);
+ ret = apply_workqueue_attrs_locked(wq, unbound_std_wq_attrs[prio]);
}
return ret;
@@ -7720,7 +7733,7 @@ static void __init init_cpu_worker_pool(struct worker_pool *pool, int cpu, int n
void __init workqueue_init_early(void)
{
struct wq_pod_type *pt = &wq_pod_types[WQ_AFFN_SYSTEM];
- int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL };
+ int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL, RTPRI_LEVEL };
void (*irq_work_fns[2])(struct irq_work *) = { bh_pool_kick_normal,
bh_pool_kick_highpri };
int i, cpu;
@@ -7805,6 +7818,7 @@ void __init workqueue_init_early(void)
system_wq = alloc_workqueue("events", 0, 0);
system_percpu_wq = alloc_workqueue("events", 0, 0);
system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0);
+ system_rt_wq = alloc_workqueue("events_rt", WQ_RT, 0);
system_long_wq = alloc_workqueue("events_long", 0, 0);
system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, WQ_MAX_ACTIVE);
system_dfl_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, WQ_MAX_ACTIVE);
@@ -7818,8 +7832,8 @@ void __init workqueue_init_early(void)
system_bh_wq = alloc_workqueue("events_bh", WQ_BH, 0);
system_bh_highpri_wq = alloc_workqueue("events_bh_highpri",
WQ_BH | WQ_HIGHPRI, 0);
- BUG_ON(!system_wq || !system_percpu_wq|| !system_highpri_wq || !system_long_wq ||
- !system_unbound_wq || !system_freezable_wq || !system_dfl_wq ||
+ BUG_ON(!system_wq || !system_percpu_wq || !system_highpri_wq || !system_rt_wq ||
+ !system_long_wq || !system_unbound_wq || !system_freezable_wq || !system_dfl_wq ||
!system_power_efficient_wq ||
!system_freezable_power_efficient_wq ||
!system_bh_wq || !system_bh_highpri_wq);
--
2.34.1
This seems to miss an actual user? Did you accidentally only send patch 1 of a series?
On Thu, 16 Oct 2025 23:26:35 -0700 Christoph Hellwig <hch@infradead.org> wrote:
> This seems to miss an actual user? Did you accidentally only send
> patch 1 of a series?
It is a cross-module issue, and I'm not quite sure whether there is a better way
to handle it. My initial thought was to propose the RT workqueue patch first; if
it gets accepted, we could then use the newly created system_rt_wq in follow-up
patches to fix the issues we found.
One of the issues is that when the DMA completion work for UART data is not
processed in a timely manner, it leads to anomalies in handling IMU timestamps.
The proposed change would be to add a new function, tty_flip_buffer_push_rt(),
implemented as follows:
void tty_flip_buffer_push_rt(struct tty_port *port)
{
        struct tty_bufhead *buf = &port->buf;

        tty_flip_buffer_commit(buf->tail);
        queue_work(system_rt_wq, &buf->work);
}
EXPORT_SYMBOL(tty_flip_buffer_push_rt);
Our 8250 driver is built on top of the TTY layer, and tty_flip_buffer_push()
belongs to the TTY layer, so a possible user of the RT workqueue would be this
new tty_flip_buffer_push_rt() helper that queues onto system_rt_wq. In addition,
other code, such as the GPU driver we maintain on our platform, can also use an
RT workqueue by simply passing the WQ_RT flag when creating it.
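To make the serial-side usage concrete, a UART driver's RX DMA completion
handler could hand off to the proposed helper roughly as follows (an
illustrative sketch under the assumption that the helper above is merged;
the handler name and the way the tty_port is reached are not existing code):

#include <linux/serial_core.h>
#include <linux/tty_flip.h>

/*
 * Sketch: hand RX flip-buffer processing to the RT workqueue instead of the
 * default workqueue used by tty_flip_buffer_push(). tty_flip_buffer_push_rt()
 * is assumed to be declared in <linux/tty_flip.h> by such a patch.
 */
static void example_rx_dma_complete(struct uart_port *uport)
{
        struct tty_port *tport = &uport->state->port;

        /* ... received bytes have already been copied into the flip buffer ... */

        tty_flip_buffer_push_rt(tport);
}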
--
Xin Zhao
Hi Xin,
kernel test robot noticed the following build warnings:
[auto build test WARNING on v6.17]
[cannot apply to tj-wq/for-next v6.18-rc1 linus/master next-20251016]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Xin-Zhao/workqueue-Support-RT-workqueue/20251016-182514
base: v6.17
patch link: https://lore.kernel.org/r/20251016102345.2200815-1-jackzxcui1989%40163.com
patch subject: [PATCH] workqueue: Support RT workqueue
config: csky-randconfig-002-20251017 (https://download.01.org/0day-ci/archive/20251017/202510171328.eE4qdf84-lkp@intel.com/config)
compiler: csky-linux-gcc (GCC) 10.5.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20251017/202510171328.eE4qdf84-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202510171328.eE4qdf84-lkp@intel.com/
All warnings (new ones prefixed by >>):
kernel/workqueue.c: In function 'workqueue_init_early':
>> kernel/workqueue.c:7791:4: warning: iteration 2 invokes undefined behavior [-Waggressive-loop-optimizations]
7791 | init_irq_work(bh_pool_irq_work(pool), irq_work_fns[i]);
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
kernel/workqueue.c:552:2: note: within this loop
552 | for ((pool) = &per_cpu(bh_worker_pools, cpu)[0]; \
| ^~~
kernel/workqueue.c:7788:3: note: in expansion of macro 'for_each_bh_worker_pool'
7788 | for_each_bh_worker_pool(pool, cpu) {
| ^~~~~~~~~~~~~~~~~~~~~~~
vim +7791 kernel/workqueue.c
2fcdb1b44491e08 Tejun Heo 2024-02-04 7722
3347fa0928210d9 Tejun Heo 2016-09-16 7723 /**
3347fa0928210d9 Tejun Heo 2016-09-16 7724 * workqueue_init_early - early init for workqueue subsystem
3347fa0928210d9 Tejun Heo 2016-09-16 7725 *
2930155b2e27232 Tejun Heo 2023-08-07 7726 * This is the first step of three-staged workqueue subsystem initialization and
2930155b2e27232 Tejun Heo 2023-08-07 7727 * invoked as soon as the bare basics - memory allocation, cpumasks and idr are
2930155b2e27232 Tejun Heo 2023-08-07 7728 * up. It sets up all the data structures and system workqueues and allows early
2930155b2e27232 Tejun Heo 2023-08-07 7729 * boot code to create workqueues and queue/cancel work items. Actual work item
2930155b2e27232 Tejun Heo 2023-08-07 7730 * execution starts only after kthreads can be created and scheduled right
2930155b2e27232 Tejun Heo 2023-08-07 7731 * before early initcalls.
3347fa0928210d9 Tejun Heo 2016-09-16 7732 */
2333e829952fb43 Yu Chen 2020-02-23 7733 void __init workqueue_init_early(void)
^1da177e4c3f415 Linus Torvalds 2005-04-16 7734 {
84193c07105c62d Tejun Heo 2023-08-07 7735 struct wq_pod_type *pt = &wq_pod_types[WQ_AFFN_SYSTEM];
7456d72db76b9c5 Xin Zhao 2025-10-16 7736 int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL, RTPRI_LEVEL };
2f34d7337d98f3e Tejun Heo 2024-02-14 7737 void (*irq_work_fns[2])(struct irq_work *) = { bh_pool_kick_normal,
2f34d7337d98f3e Tejun Heo 2024-02-14 7738 bh_pool_kick_highpri };
7a4e344c5675eef Tejun Heo 2013-03-12 7739 int i, cpu;
c34056a3fdde777 Tejun Heo 2010-06-29 7740
10cdb15759540f0 Lai Jiangshan 2020-06-01 7741 BUILD_BUG_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
e904e6c2668bba7 Tejun Heo 2013-03-12 7742
8d84baf76045f5b Lai Jiangshan 2024-07-11 7743 BUG_ON(!alloc_cpumask_var(&wq_online_cpumask, GFP_KERNEL));
b05a79280b346eb Frederic Weisbecker 2015-04-27 7744 BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL));
fe28f631fa941fb Waiman Long 2023-10-25 7745 BUG_ON(!alloc_cpumask_var(&wq_requested_unbound_cpumask, GFP_KERNEL));
fe28f631fa941fb Waiman Long 2023-10-25 7746 BUG_ON(!zalloc_cpumask_var(&wq_isolated_cpumask, GFP_KERNEL));
b05a79280b346eb Frederic Weisbecker 2015-04-27 7747
8d84baf76045f5b Lai Jiangshan 2024-07-11 7748 cpumask_copy(wq_online_cpumask, cpu_online_mask);
4a6c5607d4502cc Tejun Heo 2023-11-21 7749 cpumask_copy(wq_unbound_cpumask, cpu_possible_mask);
4a6c5607d4502cc Tejun Heo 2023-11-21 7750 restrict_unbound_cpumask("HK_TYPE_WQ", housekeeping_cpumask(HK_TYPE_WQ));
4a6c5607d4502cc Tejun Heo 2023-11-21 7751 restrict_unbound_cpumask("HK_TYPE_DOMAIN", housekeeping_cpumask(HK_TYPE_DOMAIN));
ace3c5499e61ef7 tiozhang 2023-06-29 7752 if (!cpumask_empty(&wq_cmdline_cpumask))
4a6c5607d4502cc Tejun Heo 2023-11-21 7753 restrict_unbound_cpumask("workqueue.unbound_cpus", &wq_cmdline_cpumask);
ace3c5499e61ef7 tiozhang 2023-06-29 7754
fe28f631fa941fb Waiman Long 2023-10-25 7755 cpumask_copy(wq_requested_unbound_cpumask, wq_unbound_cpumask);
261dce3d64021e7 Chuyi Zhou 2025-06-17 7756 cpumask_andnot(wq_isolated_cpumask, cpu_possible_mask,
261dce3d64021e7 Chuyi Zhou 2025-06-17 7757 housekeeping_cpumask(HK_TYPE_DOMAIN));
e904e6c2668bba7 Tejun Heo 2013-03-12 7758 pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
e904e6c2668bba7 Tejun Heo 2013-03-12 7759
b2b1f9338400de0 Lai Jiangshan 2024-07-11 7760 unbound_wq_update_pwq_attrs_buf = alloc_workqueue_attrs();
b2b1f9338400de0 Lai Jiangshan 2024-07-11 7761 BUG_ON(!unbound_wq_update_pwq_attrs_buf);
2930155b2e27232 Tejun Heo 2023-08-07 7762
7bd20b6b87183db Marcelo Tosatti 2024-01-19 7763 /*
7bd20b6b87183db Marcelo Tosatti 2024-01-19 7764 * If nohz_full is enabled, set power efficient workqueue as unbound.
7bd20b6b87183db Marcelo Tosatti 2024-01-19 7765 * This allows workqueue items to be moved to HK CPUs.
7bd20b6b87183db Marcelo Tosatti 2024-01-19 7766 */
7bd20b6b87183db Marcelo Tosatti 2024-01-19 7767 if (housekeeping_enabled(HK_TYPE_TICK))
7bd20b6b87183db Marcelo Tosatti 2024-01-19 7768 wq_power_efficient = true;
7bd20b6b87183db Marcelo Tosatti 2024-01-19 7769
84193c07105c62d Tejun Heo 2023-08-07 7770 /* initialize WQ_AFFN_SYSTEM pods */
84193c07105c62d Tejun Heo 2023-08-07 7771 pt->pod_cpus = kcalloc(1, sizeof(pt->pod_cpus[0]), GFP_KERNEL);
84193c07105c62d Tejun Heo 2023-08-07 7772 pt->pod_node = kcalloc(1, sizeof(pt->pod_node[0]), GFP_KERNEL);
84193c07105c62d Tejun Heo 2023-08-07 7773 pt->cpu_pod = kcalloc(nr_cpu_ids, sizeof(pt->cpu_pod[0]), GFP_KERNEL);
84193c07105c62d Tejun Heo 2023-08-07 7774 BUG_ON(!pt->pod_cpus || !pt->pod_node || !pt->cpu_pod);
84193c07105c62d Tejun Heo 2023-08-07 7775
84193c07105c62d Tejun Heo 2023-08-07 7776 BUG_ON(!zalloc_cpumask_var_node(&pt->pod_cpus[0], GFP_KERNEL, NUMA_NO_NODE));
84193c07105c62d Tejun Heo 2023-08-07 7777
84193c07105c62d Tejun Heo 2023-08-07 7778 pt->nr_pods = 1;
84193c07105c62d Tejun Heo 2023-08-07 7779 cpumask_copy(pt->pod_cpus[0], cpu_possible_mask);
84193c07105c62d Tejun Heo 2023-08-07 7780 pt->pod_node[0] = NUMA_NO_NODE;
84193c07105c62d Tejun Heo 2023-08-07 7781 pt->cpu_pod[0] = 0;
84193c07105c62d Tejun Heo 2023-08-07 7782
4cb1ef64609f9b0 Tejun Heo 2024-02-04 7783 /* initialize BH and CPU pools */
29c91e9912bed70 Tejun Heo 2013-03-12 7784 for_each_possible_cpu(cpu) {
4ce62e9e30cacc2 Tejun Heo 2012-07-13 7785 struct worker_pool *pool;
8b03ae3cde59af9 Tejun Heo 2010-06-29 7786
4cb1ef64609f9b0 Tejun Heo 2024-02-04 7787 i = 0;
4cb1ef64609f9b0 Tejun Heo 2024-02-04 7788 for_each_bh_worker_pool(pool, cpu) {
2f34d7337d98f3e Tejun Heo 2024-02-14 7789 init_cpu_worker_pool(pool, cpu, std_nice[i]);
4cb1ef64609f9b0 Tejun Heo 2024-02-04 7790 pool->flags |= POOL_BH;
2f34d7337d98f3e Tejun Heo 2024-02-14 @7791 init_irq_work(bh_pool_irq_work(pool), irq_work_fns[i]);
2f34d7337d98f3e Tejun Heo 2024-02-14 7792 i++;
4cb1ef64609f9b0 Tejun Heo 2024-02-04 7793 }
4cb1ef64609f9b0 Tejun Heo 2024-02-04 7794
7a4e344c5675eef Tejun Heo 2013-03-12 7795 i = 0;
2fcdb1b44491e08 Tejun Heo 2024-02-04 7796 for_each_cpu_worker_pool(pool, cpu)
2fcdb1b44491e08 Tejun Heo 2024-02-04 7797 init_cpu_worker_pool(pool, cpu, std_nice[i++]);
8b03ae3cde59af9 Tejun Heo 2010-06-29 7798 }
8b03ae3cde59af9 Tejun Heo 2010-06-29 7799
8a2b75384444488 Tejun Heo 2013-09-05 7800 /* create default unbound and ordered wq attrs */
29c91e9912bed70 Tejun Heo 2013-03-12 7801 for (i = 0; i < NR_STD_WORKER_POOLS; i++) {
29c91e9912bed70 Tejun Heo 2013-03-12 7802 struct workqueue_attrs *attrs;
29c91e9912bed70 Tejun Heo 2013-03-12 7803
be69d00d9769575 Thomas Gleixner 2019-06-26 7804 BUG_ON(!(attrs = alloc_workqueue_attrs()));
29c91e9912bed70 Tejun Heo 2013-03-12 7805 attrs->nice = std_nice[i];
29c91e9912bed70 Tejun Heo 2013-03-12 7806 unbound_std_wq_attrs[i] = attrs;
8a2b75384444488 Tejun Heo 2013-09-05 7807
8a2b75384444488 Tejun Heo 2013-09-05 7808 /*
8a2b75384444488 Tejun Heo 2013-09-05 7809 * An ordered wq should have only one pwq as ordering is
8a2b75384444488 Tejun Heo 2013-09-05 7810 * guaranteed by max_active which is enforced by pwqs.
8a2b75384444488 Tejun Heo 2013-09-05 7811 */
be69d00d9769575 Thomas Gleixner 2019-06-26 7812 BUG_ON(!(attrs = alloc_workqueue_attrs()));
8a2b75384444488 Tejun Heo 2013-09-05 7813 attrs->nice = std_nice[i];
af73f5c9febe509 Tejun Heo 2023-08-07 7814 attrs->ordered = true;
8a2b75384444488 Tejun Heo 2013-09-05 7815 ordered_wq_attrs[i] = attrs;
29c91e9912bed70 Tejun Heo 2013-03-12 7816 }
29c91e9912bed70 Tejun Heo 2013-03-12 7817
d320c03830b17af Tejun Heo 2010-06-29 7818 system_wq = alloc_workqueue("events", 0, 0);
128ea9f6ccfb696 Marco Crivellari 2025-06-14 7819 system_percpu_wq = alloc_workqueue("events", 0, 0);
1aabe902ca3638d Joonsoo Kim 2012-08-15 7820 system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0);
7456d72db76b9c5 Xin Zhao 2025-10-16 7821 system_rt_wq = alloc_workqueue("events_rt", WQ_RT, 0);
d320c03830b17af Tejun Heo 2010-06-29 7822 system_long_wq = alloc_workqueue("events_long", 0, 0);
128ea9f6ccfb696 Marco Crivellari 2025-06-14 7823 system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, WQ_MAX_ACTIVE);
128ea9f6ccfb696 Marco Crivellari 2025-06-14 7824 system_dfl_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, WQ_MAX_ACTIVE);
24d51add7438f96 Tejun Heo 2011-02-21 7825 system_freezable_wq = alloc_workqueue("events_freezable",
24d51add7438f96 Tejun Heo 2011-02-21 7826 WQ_FREEZABLE, 0);
0668106ca3865ba Viresh Kumar 2013-04-24 7827 system_power_efficient_wq = alloc_workqueue("events_power_efficient",
0668106ca3865ba Viresh Kumar 2013-04-24 7828 WQ_POWER_EFFICIENT, 0);
8318d6a6362f590 Audra Mitchell 2024-01-25 7829 system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_pwr_efficient",
0668106ca3865ba Viresh Kumar 2013-04-24 7830 WQ_FREEZABLE | WQ_POWER_EFFICIENT,
0668106ca3865ba Viresh Kumar 2013-04-24 7831 0);
4cb1ef64609f9b0 Tejun Heo 2024-02-04 7832 system_bh_wq = alloc_workqueue("events_bh", WQ_BH, 0);
4cb1ef64609f9b0 Tejun Heo 2024-02-04 7833 system_bh_highpri_wq = alloc_workqueue("events_bh_highpri",
4cb1ef64609f9b0 Tejun Heo 2024-02-04 7834 WQ_BH | WQ_HIGHPRI, 0);
7456d72db76b9c5 Xin Zhao 2025-10-16 7835 BUG_ON(!system_wq || !system_percpu_wq || !system_highpri_wq || !system_rt_wq ||
7456d72db76b9c5 Xin Zhao 2025-10-16 7836 !system_long_wq || !system_unbound_wq || !system_freezable_wq || !system_dfl_wq ||
0668106ca3865ba Viresh Kumar 2013-04-24 7837 !system_power_efficient_wq ||
4cb1ef64609f9b0 Tejun Heo 2024-02-04 7838 !system_freezable_power_efficient_wq ||
4cb1ef64609f9b0 Tejun Heo 2024-02-04 7839 !system_bh_wq || !system_bh_highpri_wq);
3347fa0928210d9 Tejun Heo 2016-09-16 7840 }
3347fa0928210d9 Tejun Heo 2016-09-16 7841
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
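The warning appears to come from the BH pool loop: with NR_STD_WORKER_POOLS
raised to 3, for_each_bh_worker_pool() iterates three pools per CPU while the
local irq_work_fns[] array in workqueue_init_early() still has only two
entries, so iteration 2 indexes past the array. A possible fix, sketched under
the assumption that the BH pools are meant to grow along with the per-CPU
pools, would be:

/*
 * Possible fix (sketch, not taken from the thread): size the BH kick table
 * from NR_STD_WORKER_POOLS instead of hard-coding two entries. The patch adds
 * no dedicated RT kick, so reusing the high-priority kick for the third slot
 * is an assumption; alternatively the BH pools could stay at two levels.
 */
void (*irq_work_fns[NR_STD_WORKER_POOLS])(struct irq_work *) = {
        bh_pool_kick_normal,
        bh_pool_kick_highpri,
        bh_pool_kick_highpri,   /* assumed: RT BH pool reuses the highpri kick */
};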
Hello,

On Thu, Oct 16, 2025 at 06:23:45PM +0800, Xin Zhao wrote:
> [...]
> Perhaps we could implement all logic related to hard real-time tasks
> using kernel threads, but I believe this effort is unnecessary. The
> existing workqueue mechanism is well-structured and can guarantee that
> work items execute in an orderly manner in concurrent scenarios by
> adjusting max_active and the WQ_ORDERED attribute. We only need to
> introduce a WQ_RT flag and add a small amount of code to meet the
> requirements of hard real-time workqueues.

For things that may need RT, please use kthread_work.

Thanks.

--
tejun
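For readers following along, the kthread_work pattern referred to above looks
roughly like this; a minimal sketch with made-up names, using sched_set_fifo()
to give the dedicated worker an RT priority:

#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/sched.h>

/* Minimal sketch of the kthread_work pattern; all names are illustrative. */
static struct kthread_worker *example_worker;
static struct kthread_work example_work;

static void example_work_fn(struct kthread_work *work)
{
        /* time-critical processing */
}

static int example_setup(void)
{
        example_worker = kthread_create_worker(0, "example_rt_worker");
        if (IS_ERR(example_worker))
                return PTR_ERR(example_worker);

        /* making the dedicated worker SCHED_FIFO is an explicit, per-user choice */
        sched_set_fifo(example_worker->task);

        kthread_init_work(&example_work, example_work_fn);
        kthread_queue_work(example_worker, &example_work);
        return 0;
}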
On Thu, 16 Oct 2025 05:32:18 -1000 Tejun Heo <tj@kernel.org> wrote:
> Hello,
>
> On Thu, Oct 16, 2025 at 06:23:45PM +0800, Xin Zhao wrote:
> > [...]
>
> For things that may need RT, please use kthread_work.
>
> Thanks.

Thank you for your suggestion. We can indeed replace some simple work
usage scenarios with kthread_work. However, it is not a panacea,
especially in cases where work processing requires concurrency.

Without an RT workqueue, we would have to create additional kthreads to
get concurrency, and probably also add logic to split work items that
are currently of the same type across those threads. That kind of
restructuring is time-consuming and prone to subtle errors.

Given that the workqueue mechanism already matches the needs of
concurrent and ordered work so well, while also managing worker thread
resources efficiently, perhaps we should consider adding the RT
workqueue feature to the kernel. After all, the changes are small and
carry minimal risk, yet the benefits could be substantial for handling
concurrent demands and avoiding wasted thread resources.

If we do not pursue this and instead keep relying on kthread_work for
RT-priority work that needs concurrency, we would effectively end up
re-implementing the pwq and worker_pool logic.

--
Xin Zhao
Hello, Xin.

On Fri, Oct 17, 2025 at 05:10:41PM +0800, Xin Zhao wrote:
> [...]
> If we do not pursue this and instead keep relying on kthread_work for
> RT-priority work that needs concurrency, we would effectively end up
> re-implementing the pwq and worker_pool logic.

There are a couple of reasons why I don't think we want to go this way:

- Maybe RT is enough for your specific use case, but others may want to
  use e.g. deadline, pinning to specific CPUs, or persistent
  kthread-tied accounting and prioritization (note that kworkers carry
  arbitrary scheduling history across work item boundaries).

- Running anything RT presents a significant cost to the system overall.
  There is a significant loss in terms of the scheduler's ability to
  manage the system. Once you have too many things running in RT, you
  just don't have a working scheduler on the system.

So, I think it makes sense to keep the decision to allow / use RT a
clearly deliberate choice, something which has to be a lot more
intentional than picking a different workqueue.

This will be a pretty hard no from me. Of course, I can be wrong and you
can argue your case, but it'd probably be most effective if the
arguments are based on concrete and specific use cases.

Thanks.

--
tejun