In a system with high real-time requirements, we have noticed that many
high-priority tasks, such as kernel threads responsible for dispatching
GPU tasks and receiving input data, often experience latency spikes
because the work items they depend on are not executed with real-time
urgency.
The kworker threads are shared globally based on the attributes of the
workqueue (wq) and the parameters of queue_work_on. This means that
regardless of whether you create a new wq or use an existing one, the
kworker thread that processes the work does not exclusively run any
specific work or work from a specific wq. While this design saves
resources, it makes it difficult, in hard real-time scenarios, to
guarantee real-time execution of a given work item by adjusting the
priority of whichever kworker happens to run it. Additionally, even if
we manually raise the kworker to a real-time priority while it executes
the work item and restore it on completion, the next time queue_work_on()
is called the kworker that picks up the work is back at its normal
priority, so timely execution still cannot be guaranteed. Moreover,
frequent priority adjustments incur additional overhead.
Perhaps we could implement all logic related to hard real-time tasks
using kernel threads, but I believe this effort is unnecessary. The
existing workqueue mechanism is well-structured and can guarantee that
work items execute in an orderly manner in concurrent scenarios by
adjusting max_active and the WQ_ORDERED attribute. We only need to
introduce a WQ_RT flag and add a small amount of code to meet the
requirements of hard real-time workqueues.
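For illustration, a driver that needs its work items handled with
real-time urgency would then only need something along the following
lines (a sketch; the workqueue and function names are made up and not
part of this patch):

#include <linux/module.h>
#include <linux/workqueue.h>

/* Sketch only, not part of this patch: the names are made up. */
static struct workqueue_struct *example_rt_wq;

static void example_work_fn(struct work_struct *work)
{
        /* time-critical handling; with WQ_RT this runs on a SCHED_FIFO kworker */
}
static DECLARE_WORK(example_work, example_work_fn);

static int __init example_init(void)
{
        /* WQ_RT combines with max_active / ordered workqueues as usual */
        example_rt_wq = alloc_workqueue("example_rt", WQ_RT, 1);
        if (!example_rt_wq)
                return -ENOMEM;

        queue_work(example_rt_wq, &example_work);
        return 0;
}
module_init(example_init);
MODULE_LICENSE("GPL");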
Signed-off-by: Xin Zhao <jackzxcui1989@163.com>
---
include/linux/workqueue.h | 6 ++++++
kernel/workqueue.c | 34 ++++++++++++++++++++++++----------
2 files changed, 30 insertions(+), 10 deletions(-)
diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 45d5dd470..973876b79 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -16,6 +16,7 @@
#include <linux/cpumask_types.h>
#include <linux/rcupdate.h>
#include <linux/workqueue_types.h>
+#include <uapi/linux/sched/types.h>
/*
* The first word is the work queue pointer and the flags rolled into
@@ -404,6 +405,8 @@ enum wq_flags {
WQ_POWER_EFFICIENT = 1 << 7,
WQ_PERCPU = 1 << 8, /* bound to a specific cpu */
+ WQ_RT = 1 << 9, /* work items are executed by SCHED_FIFO kworkers */
+
__WQ_DESTROYING = 1 << 15, /* internal: workqueue is destroying */
__WQ_DRAINING = 1 << 16, /* internal: workqueue is draining */
__WQ_ORDERED = 1 << 17, /* internal: workqueue is ordered */
@@ -460,6 +463,7 @@ enum wq_consts {
extern struct workqueue_struct *system_wq; /* use system_percpu_wq, this will be removed */
extern struct workqueue_struct *system_percpu_wq;
extern struct workqueue_struct *system_highpri_wq;
+extern struct workqueue_struct *system_rt_wq;
extern struct workqueue_struct *system_long_wq;
extern struct workqueue_struct *system_unbound_wq;
extern struct workqueue_struct *system_dfl_wq;
@@ -781,6 +785,8 @@ extern void __warn_flushing_systemwide_wq(void)
_wq == system_wq) || \
(__builtin_constant_p(_wq == system_highpri_wq) && \
_wq == system_highpri_wq) || \
+ (__builtin_constant_p(_wq == system_rt_wq) && \
+ _wq == system_rt_wq) || \
(__builtin_constant_p(_wq == system_long_wq) && \
_wq == system_long_wq) || \
(__builtin_constant_p(_wq == system_unbound_wq) && \
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index c6b79b367..ccbf19e3a 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -103,7 +103,7 @@ enum work_cancel_flags {
};
enum wq_internal_consts {
- NR_STD_WORKER_POOLS = 2, /* # standard pools per cpu */
+ NR_STD_WORKER_POOLS = 3, /* # standard pools per cpu */
UNBOUND_POOL_HASH_ORDER = 6, /* hashed by pool->attrs */
BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */
@@ -123,6 +123,7 @@ enum wq_internal_consts {
*/
RESCUER_NICE_LEVEL = MIN_NICE,
HIGHPRI_NICE_LEVEL = MIN_NICE,
+ RTPRI_LEVEL = MIN_NICE - 1, /* out-of-range nice value marking RT pools */
WQ_NAME_LEN = 32,
WORKER_ID_LEN = 10 + WQ_NAME_LEN, /* "kworker/R-" + WQ_NAME_LEN */
@@ -509,6 +510,8 @@ struct workqueue_struct *system_percpu_wq __ro_after_init;
EXPORT_SYMBOL(system_percpu_wq);
struct workqueue_struct *system_highpri_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_highpri_wq);
+struct workqueue_struct *system_rt_wq __read_mostly;
+EXPORT_SYMBOL_GPL(system_rt_wq);
struct workqueue_struct *system_long_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_long_wq);
struct workqueue_struct *system_unbound_wq __ro_after_init;
@@ -2751,7 +2754,8 @@ static int format_worker_id(char *buf, size_t size, struct worker *worker,
if (pool->cpu >= 0)
return scnprintf(buf, size, "kworker/%d:%d%s",
pool->cpu, worker->id,
- pool->attrs->nice < 0 ? "H" : "");
+ pool->attrs->nice < 0 ?
+ (pool->attrs->nice == RTPRI_LEVEL ? "F" : "H") : "");
else
return scnprintf(buf, size, "kworker/u%d:%d",
pool->id, worker->id);
@@ -2760,6 +2764,9 @@ static int format_worker_id(char *buf, size_t size, struct worker *worker,
}
}
+static int kworker_rt_prio = 1;
+module_param(kworker_rt_prio, int, 0444);
+
/**
* create_worker - create a new workqueue worker
* @pool: pool the new worker will belong to
@@ -2776,6 +2783,7 @@ static struct worker *create_worker(struct worker_pool *pool)
{
struct worker *worker;
int id;
+ struct sched_param sp;
/* ID is needed to determine kthread name */
id = ida_alloc(&pool->worker_ida, GFP_KERNEL);
@@ -2810,7 +2818,12 @@ static struct worker *create_worker(struct worker_pool *pool)
goto fail;
}
- set_user_nice(worker->task, pool->attrs->nice);
+ if (pool->attrs->nice == RTPRI_LEVEL) {
+ sp.sched_priority = kworker_rt_prio;
+ sched_setscheduler_nocheck(worker->task, SCHED_FIFO, &sp);
+ } else {
+ set_user_nice(worker->task, pool->attrs->nice);
+ }
kthread_bind_mask(worker->task, pool_allowed_cpus(pool));
}
@@ -5470,7 +5483,7 @@ static void unbound_wq_update_pwq(struct workqueue_struct *wq, int cpu)
static int alloc_and_link_pwqs(struct workqueue_struct *wq)
{
- bool highpri = wq->flags & WQ_HIGHPRI;
+ int prio = (wq->flags & WQ_RT) ? 2 : (wq->flags & WQ_HIGHPRI ? 1 : 0);
int cpu, ret;
lockdep_assert_held(&wq_pool_mutex);
@@ -5491,7 +5504,7 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq)
struct pool_workqueue **pwq_p;
struct worker_pool *pool;
- pool = &(per_cpu_ptr(pools, cpu)[highpri]);
+ pool = &(per_cpu_ptr(pools, cpu)[prio]);
pwq_p = per_cpu_ptr(wq->cpu_pwq, cpu);
*pwq_p = kmem_cache_alloc_node(pwq_cache, GFP_KERNEL,
@@ -5511,14 +5524,14 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq)
if (wq->flags & __WQ_ORDERED) {
struct pool_workqueue *dfl_pwq;
- ret = apply_workqueue_attrs_locked(wq, ordered_wq_attrs[highpri]);
+ ret = apply_workqueue_attrs_locked(wq, ordered_wq_attrs[prio]);
/* there should only be single pwq for ordering guarantee */
dfl_pwq = rcu_access_pointer(wq->dfl_pwq);
WARN(!ret && (wq->pwqs.next != &dfl_pwq->pwqs_node ||
wq->pwqs.prev != &dfl_pwq->pwqs_node),
"ordering guarantee broken for workqueue %s\n", wq->name);
} else {
- ret = apply_workqueue_attrs_locked(wq, unbound_std_wq_attrs[highpri]);
+ ret = apply_workqueue_attrs_locked(wq, unbound_std_wq_attrs[prio]);
}
return ret;
@@ -7720,7 +7733,7 @@ static void __init init_cpu_worker_pool(struct worker_pool *pool, int cpu, int n
void __init workqueue_init_early(void)
{
struct wq_pod_type *pt = &wq_pod_types[WQ_AFFN_SYSTEM];
- int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL };
+ int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL, RTPRI_LEVEL };
void (*irq_work_fns[2])(struct irq_work *) = { bh_pool_kick_normal,
bh_pool_kick_highpri };
int i, cpu;
@@ -7805,6 +7818,7 @@ void __init workqueue_init_early(void)
system_wq = alloc_workqueue("events", 0, 0);
system_percpu_wq = alloc_workqueue("events", 0, 0);
system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0);
+ system_rt_wq = alloc_workqueue("events_rt", WQ_RT, 0);
system_long_wq = alloc_workqueue("events_long", 0, 0);
system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, WQ_MAX_ACTIVE);
system_dfl_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, WQ_MAX_ACTIVE);
@@ -7818,8 +7832,8 @@ void __init workqueue_init_early(void)
system_bh_wq = alloc_workqueue("events_bh", WQ_BH, 0);
system_bh_highpri_wq = alloc_workqueue("events_bh_highpri",
WQ_BH | WQ_HIGHPRI, 0);
- BUG_ON(!system_wq || !system_percpu_wq|| !system_highpri_wq || !system_long_wq ||
- !system_unbound_wq || !system_freezable_wq || !system_dfl_wq ||
+ BUG_ON(!system_wq || !system_percpu_wq || !system_highpri_wq || !system_rt_wq ||
+ !system_long_wq || !system_unbound_wq || !system_freezable_wq || !system_dfl_wq ||
!system_power_efficient_wq ||
!system_freezable_power_efficient_wq ||
!system_bh_wq || !system_bh_highpri_wq);
--
2.34.1
This seems to miss an actual user? Did you accidentally only send patch 1 of a series?
On Thu, 16 Oct 2025 23:26:35 -0700 Christoph Hellwig <hch@infradead.org> wrote:
> This seems to miss an actual user? Did you accidentally only send
> patch 1 of a series?
It is a cross-module issue, and I'm not quite sure whether there is a better way
to handle it. My initial thought was to propose the RT workqueue patch first; if
it gets accepted, we could then use the newly created system_rt_wq in follow-up
patches to fix the issues we found.
One of the issues is that when the DMA completion work for UART data is not
processed in a timely manner, it leads to anomalies in handling IMU timestamps.
The proposed change would be to add a new function, tty_flip_buffer_push_rt(),
implemented as follows:
void tty_flip_buffer_push_rt(struct tty_port *port)
{
        struct tty_bufhead *buf = &port->buf;

        tty_flip_buffer_commit(buf->tail);
        queue_work(system_rt_wq, &buf->work);
}
EXPORT_SYMBOL(tty_flip_buffer_push_rt);
Our 8250 driver is built on top of the TTY layer, and tty_flip_buffer_push()
belongs to the TTY layer, so a possible user of the RT workqueue would be this
new tty_flip_buffer_push_rt() helper that queues onto system_rt_wq. In addition,
other code, such as the GPU driver we maintain on our platform, can also use an
RT workqueue by simply passing the WQ_RT flag when creating it.
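To make the serial-side usage concrete, a UART driver's RX DMA completion
handler could hand off to the proposed helper roughly as follows (an
illustrative sketch under the assumption that the helper above is merged;
the handler name and the way the tty_port is reached are not existing code):

#include <linux/serial_core.h>
#include <linux/tty_flip.h>

/*
 * Sketch: hand RX flip-buffer processing to the RT workqueue instead of the
 * default workqueue used by tty_flip_buffer_push(). tty_flip_buffer_push_rt()
 * is assumed to be declared in <linux/tty_flip.h> by such a patch.
 */
static void example_rx_dma_complete(struct uart_port *uport)
{
        struct tty_port *tport = &uport->state->port;

        /* ... received bytes have already been copied into the flip buffer ... */

        tty_flip_buffer_push_rt(tport);
}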
--
Xin Zhao
Hi Xin,
kernel test robot noticed the following build warnings:
[auto build test WARNING on v6.17]
[cannot apply to tj-wq/for-next v6.18-rc1 linus/master next-20251016]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Xin-Zhao/workqueue-Support-RT-workqueue/20251016-182514
base: v6.17
patch link: https://lore.kernel.org/r/20251016102345.2200815-1-jackzxcui1989%40163.com
patch subject: [PATCH] workqueue: Support RT workqueue
config: csky-randconfig-002-20251017 (https://download.01.org/0day-ci/archive/20251017/202510171328.eE4qdf84-lkp@intel.com/config)
compiler: csky-linux-gcc (GCC) 10.5.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20251017/202510171328.eE4qdf84-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202510171328.eE4qdf84-lkp@intel.com/
All warnings (new ones prefixed by >>):
kernel/workqueue.c: In function 'workqueue_init_early':
>> kernel/workqueue.c:7791:4: warning: iteration 2 invokes undefined behavior [-Waggressive-loop-optimizations]
7791 | init_irq_work(bh_pool_irq_work(pool), irq_work_fns[i]);
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
kernel/workqueue.c:552:2: note: within this loop
552 | for ((pool) = &per_cpu(bh_worker_pools, cpu)[0]; \
| ^~~
kernel/workqueue.c:7788:3: note: in expansion of macro 'for_each_bh_worker_pool'
7788 | for_each_bh_worker_pool(pool, cpu) {
| ^~~~~~~~~~~~~~~~~~~~~~~
vim +7791 kernel/workqueue.c
2fcdb1b44491e08 Tejun Heo 2024-02-04 7722
3347fa0928210d9 Tejun Heo 2016-09-16 7723 /**
3347fa0928210d9 Tejun Heo 2016-09-16 7724 * workqueue_init_early - early init for workqueue subsystem
3347fa0928210d9 Tejun Heo 2016-09-16 7725 *
2930155b2e27232 Tejun Heo 2023-08-07 7726 * This is the first step of three-staged workqueue subsystem initialization and
2930155b2e27232 Tejun Heo 2023-08-07 7727 * invoked as soon as the bare basics - memory allocation, cpumasks and idr are
2930155b2e27232 Tejun Heo 2023-08-07 7728 * up. It sets up all the data structures and system workqueues and allows early
2930155b2e27232 Tejun Heo 2023-08-07 7729 * boot code to create workqueues and queue/cancel work items. Actual work item
2930155b2e27232 Tejun Heo 2023-08-07 7730 * execution starts only after kthreads can be created and scheduled right
2930155b2e27232 Tejun Heo 2023-08-07 7731 * before early initcalls.
3347fa0928210d9 Tejun Heo 2016-09-16 7732 */
2333e829952fb43 Yu Chen 2020-02-23 7733 void __init workqueue_init_early(void)
^1da177e4c3f415 Linus Torvalds 2005-04-16 7734 {
84193c07105c62d Tejun Heo 2023-08-07 7735 struct wq_pod_type *pt = &wq_pod_types[WQ_AFFN_SYSTEM];
7456d72db76b9c5 Xin Zhao 2025-10-16 7736 int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL, RTPRI_LEVEL };
2f34d7337d98f3e Tejun Heo 2024-02-14 7737 void (*irq_work_fns[2])(struct irq_work *) = { bh_pool_kick_normal,
2f34d7337d98f3e Tejun Heo 2024-02-14 7738 bh_pool_kick_highpri };
7a4e344c5675eef Tejun Heo 2013-03-12 7739 int i, cpu;
c34056a3fdde777 Tejun Heo 2010-06-29 7740
10cdb15759540f0 Lai Jiangshan 2020-06-01 7741 BUILD_BUG_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
e904e6c2668bba7 Tejun Heo 2013-03-12 7742
8d84baf76045f5b Lai Jiangshan 2024-07-11 7743 BUG_ON(!alloc_cpumask_var(&wq_online_cpumask, GFP_KERNEL));
b05a79280b346eb Frederic Weisbecker 2015-04-27 7744 BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL));
fe28f631fa941fb Waiman Long 2023-10-25 7745 BUG_ON(!alloc_cpumask_var(&wq_requested_unbound_cpumask, GFP_KERNEL));
fe28f631fa941fb Waiman Long 2023-10-25 7746 BUG_ON(!zalloc_cpumask_var(&wq_isolated_cpumask, GFP_KERNEL));
b05a79280b346eb Frederic Weisbecker 2015-04-27 7747
8d84baf76045f5b Lai Jiangshan 2024-07-11 7748 cpumask_copy(wq_online_cpumask, cpu_online_mask);
4a6c5607d4502cc Tejun Heo 2023-11-21 7749 cpumask_copy(wq_unbound_cpumask, cpu_possible_mask);
4a6c5607d4502cc Tejun Heo 2023-11-21 7750 restrict_unbound_cpumask("HK_TYPE_WQ", housekeeping_cpumask(HK_TYPE_WQ));
4a6c5607d4502cc Tejun Heo 2023-11-21 7751 restrict_unbound_cpumask("HK_TYPE_DOMAIN", housekeeping_cpumask(HK_TYPE_DOMAIN));
ace3c5499e61ef7 tiozhang 2023-06-29 7752 if (!cpumask_empty(&wq_cmdline_cpumask))
4a6c5607d4502cc Tejun Heo 2023-11-21 7753 restrict_unbound_cpumask("workqueue.unbound_cpus", &wq_cmdline_cpumask);
ace3c5499e61ef7 tiozhang 2023-06-29 7754
fe28f631fa941fb Waiman Long 2023-10-25 7755 cpumask_copy(wq_requested_unbound_cpumask, wq_unbound_cpumask);
261dce3d64021e7 Chuyi Zhou 2025-06-17 7756 cpumask_andnot(wq_isolated_cpumask, cpu_possible_mask,
261dce3d64021e7 Chuyi Zhou 2025-06-17 7757 housekeeping_cpumask(HK_TYPE_DOMAIN));
e904e6c2668bba7 Tejun Heo 2013-03-12 7758 pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
e904e6c2668bba7 Tejun Heo 2013-03-12 7759
b2b1f9338400de0 Lai Jiangshan 2024-07-11 7760 unbound_wq_update_pwq_attrs_buf = alloc_workqueue_attrs();
b2b1f9338400de0 Lai Jiangshan 2024-07-11 7761 BUG_ON(!unbound_wq_update_pwq_attrs_buf);
2930155b2e27232 Tejun Heo 2023-08-07 7762
7bd20b6b87183db Marcelo Tosatti 2024-01-19 7763 /*
7bd20b6b87183db Marcelo Tosatti 2024-01-19 7764 * If nohz_full is enabled, set power efficient workqueue as unbound.
7bd20b6b87183db Marcelo Tosatti 2024-01-19 7765 * This allows workqueue items to be moved to HK CPUs.
7bd20b6b87183db Marcelo Tosatti 2024-01-19 7766 */
7bd20b6b87183db Marcelo Tosatti 2024-01-19 7767 if (housekeeping_enabled(HK_TYPE_TICK))
7bd20b6b87183db Marcelo Tosatti 2024-01-19 7768 wq_power_efficient = true;
7bd20b6b87183db Marcelo Tosatti 2024-01-19 7769
84193c07105c62d Tejun Heo 2023-08-07 7770 /* initialize WQ_AFFN_SYSTEM pods */
84193c07105c62d Tejun Heo 2023-08-07 7771 pt->pod_cpus = kcalloc(1, sizeof(pt->pod_cpus[0]), GFP_KERNEL);
84193c07105c62d Tejun Heo 2023-08-07 7772 pt->pod_node = kcalloc(1, sizeof(pt->pod_node[0]), GFP_KERNEL);
84193c07105c62d Tejun Heo 2023-08-07 7773 pt->cpu_pod = kcalloc(nr_cpu_ids, sizeof(pt->cpu_pod[0]), GFP_KERNEL);
84193c07105c62d Tejun Heo 2023-08-07 7774 BUG_ON(!pt->pod_cpus || !pt->pod_node || !pt->cpu_pod);
84193c07105c62d Tejun Heo 2023-08-07 7775
84193c07105c62d Tejun Heo 2023-08-07 7776 BUG_ON(!zalloc_cpumask_var_node(&pt->pod_cpus[0], GFP_KERNEL, NUMA_NO_NODE));
84193c07105c62d Tejun Heo 2023-08-07 7777
84193c07105c62d Tejun Heo 2023-08-07 7778 pt->nr_pods = 1;
84193c07105c62d Tejun Heo 2023-08-07 7779 cpumask_copy(pt->pod_cpus[0], cpu_possible_mask);
84193c07105c62d Tejun Heo 2023-08-07 7780 pt->pod_node[0] = NUMA_NO_NODE;
84193c07105c62d Tejun Heo 2023-08-07 7781 pt->cpu_pod[0] = 0;
84193c07105c62d Tejun Heo 2023-08-07 7782
4cb1ef64609f9b0 Tejun Heo 2024-02-04 7783 /* initialize BH and CPU pools */
29c91e9912bed70 Tejun Heo 2013-03-12 7784 for_each_possible_cpu(cpu) {
4ce62e9e30cacc2 Tejun Heo 2012-07-13 7785 struct worker_pool *pool;
8b03ae3cde59af9 Tejun Heo 2010-06-29 7786
4cb1ef64609f9b0 Tejun Heo 2024-02-04 7787 i = 0;
4cb1ef64609f9b0 Tejun Heo 2024-02-04 7788 for_each_bh_worker_pool(pool, cpu) {
2f34d7337d98f3e Tejun Heo 2024-02-14 7789 init_cpu_worker_pool(pool, cpu, std_nice[i]);
4cb1ef64609f9b0 Tejun Heo 2024-02-04 7790 pool->flags |= POOL_BH;
2f34d7337d98f3e Tejun Heo 2024-02-14 @7791 init_irq_work(bh_pool_irq_work(pool), irq_work_fns[i]);
2f34d7337d98f3e Tejun Heo 2024-02-14 7792 i++;
4cb1ef64609f9b0 Tejun Heo 2024-02-04 7793 }
4cb1ef64609f9b0 Tejun Heo 2024-02-04 7794
7a4e344c5675eef Tejun Heo 2013-03-12 7795 i = 0;
2fcdb1b44491e08 Tejun Heo 2024-02-04 7796 for_each_cpu_worker_pool(pool, cpu)
2fcdb1b44491e08 Tejun Heo 2024-02-04 7797 init_cpu_worker_pool(pool, cpu, std_nice[i++]);
8b03ae3cde59af9 Tejun Heo 2010-06-29 7798 }
8b03ae3cde59af9 Tejun Heo 2010-06-29 7799
8a2b75384444488 Tejun Heo 2013-09-05 7800 /* create default unbound and ordered wq attrs */
29c91e9912bed70 Tejun Heo 2013-03-12 7801 for (i = 0; i < NR_STD_WORKER_POOLS; i++) {
29c91e9912bed70 Tejun Heo 2013-03-12 7802 struct workqueue_attrs *attrs;
29c91e9912bed70 Tejun Heo 2013-03-12 7803
be69d00d9769575 Thomas Gleixner 2019-06-26 7804 BUG_ON(!(attrs = alloc_workqueue_attrs()));
29c91e9912bed70 Tejun Heo 2013-03-12 7805 attrs->nice = std_nice[i];
29c91e9912bed70 Tejun Heo 2013-03-12 7806 unbound_std_wq_attrs[i] = attrs;
8a2b75384444488 Tejun Heo 2013-09-05 7807
8a2b75384444488 Tejun Heo 2013-09-05 7808 /*
8a2b75384444488 Tejun Heo 2013-09-05 7809 * An ordered wq should have only one pwq as ordering is
8a2b75384444488 Tejun Heo 2013-09-05 7810 * guaranteed by max_active which is enforced by pwqs.
8a2b75384444488 Tejun Heo 2013-09-05 7811 */
be69d00d9769575 Thomas Gleixner 2019-06-26 7812 BUG_ON(!(attrs = alloc_workqueue_attrs()));
8a2b75384444488 Tejun Heo 2013-09-05 7813 attrs->nice = std_nice[i];
af73f5c9febe509 Tejun Heo 2023-08-07 7814 attrs->ordered = true;
8a2b75384444488 Tejun Heo 2013-09-05 7815 ordered_wq_attrs[i] = attrs;
29c91e9912bed70 Tejun Heo 2013-03-12 7816 }
29c91e9912bed70 Tejun Heo 2013-03-12 7817
d320c03830b17af Tejun Heo 2010-06-29 7818 system_wq = alloc_workqueue("events", 0, 0);
128ea9f6ccfb696 Marco Crivellari 2025-06-14 7819 system_percpu_wq = alloc_workqueue("events", 0, 0);
1aabe902ca3638d Joonsoo Kim 2012-08-15 7820 system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0);
7456d72db76b9c5 Xin Zhao 2025-10-16 7821 system_rt_wq = alloc_workqueue("events_rt", WQ_RT, 0);
d320c03830b17af Tejun Heo 2010-06-29 7822 system_long_wq = alloc_workqueue("events_long", 0, 0);
128ea9f6ccfb696 Marco Crivellari 2025-06-14 7823 system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, WQ_MAX_ACTIVE);
128ea9f6ccfb696 Marco Crivellari 2025-06-14 7824 system_dfl_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, WQ_MAX_ACTIVE);
24d51add7438f96 Tejun Heo 2011-02-21 7825 system_freezable_wq = alloc_workqueue("events_freezable",
24d51add7438f96 Tejun Heo 2011-02-21 7826 WQ_FREEZABLE, 0);
0668106ca3865ba Viresh Kumar 2013-04-24 7827 system_power_efficient_wq = alloc_workqueue("events_power_efficient",
0668106ca3865ba Viresh Kumar 2013-04-24 7828 WQ_POWER_EFFICIENT, 0);
8318d6a6362f590 Audra Mitchell 2024-01-25 7829 system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_pwr_efficient",
0668106ca3865ba Viresh Kumar 2013-04-24 7830 WQ_FREEZABLE | WQ_POWER_EFFICIENT,
0668106ca3865ba Viresh Kumar 2013-04-24 7831 0);
4cb1ef64609f9b0 Tejun Heo 2024-02-04 7832 system_bh_wq = alloc_workqueue("events_bh", WQ_BH, 0);
4cb1ef64609f9b0 Tejun Heo 2024-02-04 7833 system_bh_highpri_wq = alloc_workqueue("events_bh_highpri",
4cb1ef64609f9b0 Tejun Heo 2024-02-04 7834 WQ_BH | WQ_HIGHPRI, 0);
7456d72db76b9c5 Xin Zhao 2025-10-16 7835 BUG_ON(!system_wq || !system_percpu_wq || !system_highpri_wq || !system_rt_wq ||
7456d72db76b9c5 Xin Zhao 2025-10-16 7836 !system_long_wq || !system_unbound_wq || !system_freezable_wq || !system_dfl_wq ||
0668106ca3865ba Viresh Kumar 2013-04-24 7837 !system_power_efficient_wq ||
4cb1ef64609f9b0 Tejun Heo 2024-02-04 7838 !system_freezable_power_efficient_wq ||
4cb1ef64609f9b0 Tejun Heo 2024-02-04 7839 !system_bh_wq || !system_bh_highpri_wq);
3347fa0928210d9 Tejun Heo 2016-09-16 7840 }
3347fa0928210d9 Tejun Heo 2016-09-16 7841
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
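The warning appears to come from the BH pool loop: with NR_STD_WORKER_POOLS
raised to 3, for_each_bh_worker_pool() iterates three pools per CPU while the
local irq_work_fns[] array in workqueue_init_early() still has only two
entries, so iteration 2 indexes past the array. A possible fix, sketched under
the assumption that the BH pools are meant to grow along with the per-CPU
pools, would be:

/*
 * Possible fix (sketch, not taken from the thread): size the BH kick table
 * from NR_STD_WORKER_POOLS instead of hard-coding two entries. The patch adds
 * no dedicated RT kick, so reusing the high-priority kick for the third slot
 * is an assumption; alternatively the BH pools could stay at two levels.
 */
void (*irq_work_fns[NR_STD_WORKER_POOLS])(struct irq_work *) = {
        bh_pool_kick_normal,
        bh_pool_kick_highpri,
        bh_pool_kick_highpri,   /* assumed: RT BH pool reuses the highpri kick */
};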
Hello,

On Thu, Oct 16, 2025 at 06:23:45PM +0800, Xin Zhao wrote:
> [...]
> Perhaps we could implement all logic related to hard real-time tasks
> using kernel threads, but I believe this effort is unnecessary. The
> existing workqueue mechanism is well-structured and can guarantee that
> work items execute in an orderly manner in concurrent scenarios by
> adjusting max_active and the WQ_ORDERED attribute. We only need to
> introduce a WQ_RT flag and add a small amount of code to meet the
> requirements of hard real-time workqueues.

For things that may need RT, please use kthread_work.

Thanks.

--
tejun
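For readers following along, the kthread_work pattern referred to above looks
roughly like this; a minimal sketch with made-up names, using sched_set_fifo()
to give the dedicated worker an RT priority:

#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/sched.h>

/* Minimal sketch of the kthread_work pattern; all names are illustrative. */
static struct kthread_worker *example_worker;
static struct kthread_work example_work;

static void example_work_fn(struct kthread_work *work)
{
        /* time-critical processing */
}

static int example_setup(void)
{
        example_worker = kthread_create_worker(0, "example_rt_worker");
        if (IS_ERR(example_worker))
                return PTR_ERR(example_worker);

        /* making the dedicated worker SCHED_FIFO is an explicit, per-user choice */
        sched_set_fifo(example_worker->task);

        kthread_init_work(&example_work, example_work_fn);
        kthread_queue_work(example_worker, &example_work);
        return 0;
}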
On Thu, 16 Oct 2025 05:32:18 -1000 Tejun Heo <tj@kernel.org> wrote:
> Hello,
>
> On Thu, Oct 16, 2025 at 06:23:45PM +0800, Xin Zhao wrote:
> > [...]
>
> For things that may need RT, please use kthread_work.
>
> Thanks.

Thank you for your suggestion. We can indeed replace some simple work
usage scenarios with kthread_work. However, it is not a panacea,
especially in cases where work processing requires concurrency.

Without an RT workqueue, we would have to create additional kthreads to
get concurrency, and probably also add logic to split work items that
are currently of the same type across those threads. That kind of
restructuring is time-consuming and prone to subtle errors.

Given that the workqueue mechanism already matches the needs of
concurrent and ordered work so well, while also managing worker thread
resources efficiently, perhaps we should consider adding the RT
workqueue feature to the kernel. After all, the changes are small and
carry minimal risk, yet the benefits could be substantial for handling
concurrent demands and avoiding wasted thread resources.

If we do not pursue this and instead keep relying on kthread_work for
RT-priority work that needs concurrency, we would effectively end up
re-implementing the pwq and worker_pool logic.

--
Xin Zhao
Hello, Xin.

On Fri, Oct 17, 2025 at 05:10:41PM +0800, Xin Zhao wrote:
> [...]
> If we do not pursue this and instead keep relying on kthread_work for
> RT-priority work that needs concurrency, we would effectively end up
> re-implementing the pwq and worker_pool logic.

There are a couple of reasons why I don't think we want to go this way:

- Maybe RT is enough for your specific use case, but others may want to
  use e.g. deadline, pinning to specific CPUs, or persistent
  kthread-tied accounting and prioritization (note that kworkers carry
  arbitrary scheduling history across work item boundaries).

- Running anything RT presents a significant cost to the system overall.
  There is a significant loss in terms of the scheduler's ability to
  manage the system. Once you have too many things running in RT, you
  just don't have a working scheduler on the system.

So, I think it makes sense to keep the decision to allow / use RT a
clearly deliberate choice, something which has to be a lot more
intentional than picking a different workqueue.

This will be a pretty hard no from me. Of course, I can be wrong and you
can argue your case, but it'd probably be most effective if the
arguments are based on concrete and specific use cases.

Thanks.

--
tejun