[PATCH 2/7] workqueue: Share the same PWQ for the CPUs of a pod

Lai Jiangshan posted 7 patches 1 year, 12 months ago
[PATCH 2/7] workqueue: Share the same PWQ for the CPUs of a pod
Posted by Lai Jiangshan 1 year, 12 months ago
From: Lai Jiangshan <jiangshan.ljs@antgroup.com>

PWQs with the same attrs share the same pool. So just share the same
PWQ for all the CPUs of a pod instead of duplicating them.

Signed-off-by: Lai Jiangshan <jiangshan.ljs@antgroup.com>
---
 kernel/workqueue.c | 78 +++++++++++++++++++++++-----------------------
 1 file changed, 39 insertions(+), 39 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index e734625fc8ce..1f52685498f1 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -4360,15 +4360,29 @@ apply_wqattrs_prepare(struct workqueue_struct *wq,
 		goto out_free;
 
 	for_each_possible_cpu(cpu) {
+		struct pool_workqueue *pwq;
+		int tcpu;
+
+		if (ctx->pwq_tbl[cpu])
+			continue;
 		wq_calc_pod_cpumask(new_attrs, cpu, -1);
 		if (cpumask_equal(new_attrs->cpumask, new_attrs->__pod_cpumask)) {
 			ctx->dfl_pwq->refcnt++;
 			ctx->pwq_tbl[cpu] = ctx->dfl_pwq;
 			continue;
 		}
-		ctx->pwq_tbl[cpu] = alloc_unbound_pwq(wq, new_attrs);
-		if (!ctx->pwq_tbl[cpu])
+		pwq = alloc_unbound_pwq(wq, new_attrs);
+		if (!pwq)
 			goto out_free;
+		/*
+		 * Reinitialize pwq->refcnt and prepare the new pwq for
+		 * all the CPUs of the pod.
+		 */
+		pwq->refcnt = 0;
+		for_each_cpu(tcpu, new_attrs->__pod_cpumask) {
+			pwq->refcnt++;
+			ctx->pwq_tbl[tcpu] = pwq;
+		}
 	}
 
 	/* save the user configured attrs and sanitize it. */
@@ -4483,15 +4497,13 @@ int apply_workqueue_attrs(struct workqueue_struct *wq,
 /**
  * wq_update_pod - update pod affinity of a wq for CPU hot[un]plug
  * @wq: the target workqueue
- * @cpu: the CPU to update pool association for
- * @hotplug_cpu: the CPU coming up or going down
+ * @cpu: the CPU coming up or going down
  * @online: whether @cpu is coming up or going down
  *
  * This function is to be called from %CPU_DOWN_PREPARE, %CPU_ONLINE and
  * %CPU_DOWN_FAILED.  @cpu is being hot[un]plugged, update pod affinity of
  * @wq accordingly.
  *
- *
  * If pod affinity can't be adjusted due to memory allocation failure, it falls
  * back to @wq->dfl_pwq which may not be optimal but is always correct.
  *
@@ -4502,11 +4514,11 @@ int apply_workqueue_attrs(struct workqueue_struct *wq,
  * CPU_DOWN. If a workqueue user wants strict affinity, it's the user's
  * responsibility to flush the work item from CPU_DOWN_PREPARE.
  */
-static void wq_update_pod(struct workqueue_struct *wq, int cpu,
-			  int hotplug_cpu, bool online)
+static void wq_update_pod(struct workqueue_struct *wq, int cpu, bool online)
 {
-	int off_cpu = online ? -1 : hotplug_cpu;
-	struct pool_workqueue *old_pwq = NULL, *pwq;
+	int off_cpu = online ? -1 : cpu;
+	int tcpu;
+	struct pool_workqueue *pwq;
 	struct workqueue_attrs *target_attrs;
 
 	lockdep_assert_held(&wq_pool_mutex);
@@ -4541,20 +4553,24 @@ static void wq_update_pod(struct workqueue_struct *wq, int cpu,
 		goto use_dfl_pwq;
 	}
 
-	/* Install the new pwq. */
+	/* Install the new pwq for all the CPUs of the pod */
 	mutex_lock(&wq->mutex);
-	old_pwq = install_unbound_pwq(wq, cpu, pwq);
-	goto out_unlock;
+	/* reinitialize pwq->refcnt before installing */
+	pwq->refcnt = 0;
+	for_each_cpu(tcpu, target_attrs->__pod_cpumask)
+		pwq->refcnt++;
+	for_each_cpu(tcpu, target_attrs->__pod_cpumask)
+		put_pwq_unlocked(install_unbound_pwq(wq, tcpu, pwq));
+	mutex_unlock(&wq->mutex);
+	return;
 
 use_dfl_pwq:
 	mutex_lock(&wq->mutex);
 	raw_spin_lock_irq(&wq->dfl_pwq->pool->lock);
 	get_pwq(wq->dfl_pwq);
 	raw_spin_unlock_irq(&wq->dfl_pwq->pool->lock);
-	old_pwq = install_unbound_pwq(wq, cpu, wq->dfl_pwq);
-out_unlock:
+	put_pwq_unlocked(install_unbound_pwq(wq, cpu, wq->dfl_pwq));
 	mutex_unlock(&wq->mutex);
-	put_pwq_unlocked(old_pwq);
 }
 
 static int alloc_and_link_pwqs(struct workqueue_struct *wq)
@@ -5563,15 +5579,8 @@ int workqueue_online_cpu(unsigned int cpu)
 
 	/* update pod affinity of unbound workqueues */
 	list_for_each_entry(wq, &workqueues, list) {
-		struct workqueue_attrs *attrs = wq->unbound_attrs;
-
-		if (attrs) {
-			const struct wq_pod_type *pt = wqattrs_pod_type(attrs);
-			int tcpu;
-
-			for_each_cpu(tcpu, pt->pod_cpus[pt->cpu_pod[cpu]])
-				wq_update_pod(wq, tcpu, cpu, true);
-		}
+		if (wq->unbound_attrs)
+			wq_update_pod(wq, cpu, true);
 	}
 
 	mutex_unlock(&wq_pool_mutex);
@@ -5591,15 +5600,8 @@ int workqueue_offline_cpu(unsigned int cpu)
 	/* update pod affinity of unbound workqueues */
 	mutex_lock(&wq_pool_mutex);
 	list_for_each_entry(wq, &workqueues, list) {
-		struct workqueue_attrs *attrs = wq->unbound_attrs;
-
-		if (attrs) {
-			const struct wq_pod_type *pt = wqattrs_pod_type(attrs);
-			int tcpu;
-
-			for_each_cpu(tcpu, pt->pod_cpus[pt->cpu_pod[cpu]])
-				wq_update_pod(wq, tcpu, cpu, false);
-		}
+		if (wq->unbound_attrs)
+			wq_update_pod(wq, cpu, false);
 	}
 	mutex_unlock(&wq_pool_mutex);
 
@@ -5891,9 +5893,8 @@ static int wq_affn_dfl_set(const char *val, const struct kernel_param *kp)
 	wq_affn_dfl = affn;
 
 	list_for_each_entry(wq, &workqueues, list) {
-		for_each_online_cpu(cpu) {
-			wq_update_pod(wq, cpu, cpu, true);
-		}
+		for_each_online_cpu(cpu)
+			wq_update_pod(wq, cpu, true);
 	}
 
 	mutex_unlock(&wq_pool_mutex);
@@ -6803,9 +6804,8 @@ void __init workqueue_init_topology(void)
 	 * combinations to apply per-pod sharing.
 	 */
 	list_for_each_entry(wq, &workqueues, list) {
-		for_each_online_cpu(cpu) {
-			wq_update_pod(wq, cpu, cpu, true);
-		}
+		for_each_online_cpu(cpu)
+			wq_update_pod(wq, cpu, true);
 	}
 
 	mutex_unlock(&wq_pool_mutex);
-- 
2.19.1.6.gb485710b
Re: [PATCH 2/7] workqueue: Share the same PWQ for the CPUs of a pod
Posted by kernel test robot 1 year, 11 months ago

Hello,

kernel test robot noticed "WARNING:at_kernel/workqueue.c:#destroy_workqueue" on:

commit: 3f033de3cf87ef6c769b2d55ee1df715a982d650 ("[PATCH 2/7] workqueue: Share the same PWQ for the CPUs of a pod")
url: https://github.com/intel-lab-lkp/linux/commits/Lai-Jiangshan/workqueue-Reuse-the-default-PWQ-as-much-as-possible/20231227-225337
base: https://git.kernel.org/cgit/linux/kernel/git/tj/wq.git for-next
patch link: https://lore.kernel.org/all/20231227145143.2399-3-jiangshanlai@gmail.com/
patch subject: [PATCH 2/7] workqueue: Share the same PWQ for the CPUs of a pod

in testcase: hackbench
version: hackbench-x86_64-2.3-1_20220518
with following parameters:

	nr_threads: 800%
	iterations: 4
	mode: threads
	ipc: pipe
	cpufreq_governor: performance



compiler: gcc-12
test machine: 224 threads 4 sockets Intel(R) Xeon(R) Platinum 8380H CPU @ 2.90GHz (Cooper Lake) with 192G memory

(please refer to attached dmesg/kmsg for entire log/backtrace)



If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <oliver.sang@intel.com>
| Closes: https://lore.kernel.org/oe-lkp/202401031025.95761451-oliver.sang@intel.com


[   30.471685][    T1] ------------[ cut here ]------------
[ 30.476998][ T1] WARNING: CPU: 111 PID: 1 at kernel/workqueue.c:4842 destroy_workqueue (kernel/workqueue.c:4842 (discriminator 1)) 
[   30.486210][    T1] Modules linked in:
[   30.489964][    T1] CPU: 111 PID: 1 Comm: swapper/0 Not tainted 6.6.0-15761-g3f033de3cf87 #1
[   30.498396][    T1] Hardware name: Inspur NF8260M6/NF8260M6, BIOS 06.00.01 04/22/2022
[ 30.506220][ T1] RIP: 0010:destroy_workqueue (kernel/workqueue.c:4842 (discriminator 1)) 
[ 30.511794][ T1] Code: c2 75 f1 48 8b 43 08 48 39 98 a0 00 00 00 74 06 83 7b 18 01 7f 14 8b 43 5c 85 c0 75 0d 48 8b 53 68 48 8d 43 68 48 39 c2 74 4e <0f> 0b 48 c7 c6 e0 1d 42 82 48 8d 95 b0 00 00 00 48 c7 c7 68 a9 93
All code
========
   0:	c2 75 f1             	retq   $0xf175
   3:	48 8b 43 08          	mov    0x8(%rbx),%rax
   7:	48 39 98 a0 00 00 00 	cmp    %rbx,0xa0(%rax)
   e:	74 06                	je     0x16
  10:	83 7b 18 01          	cmpl   $0x1,0x18(%rbx)
  14:	7f 14                	jg     0x2a
  16:	8b 43 5c             	mov    0x5c(%rbx),%eax
  19:	85 c0                	test   %eax,%eax
  1b:	75 0d                	jne    0x2a
  1d:	48 8b 53 68          	mov    0x68(%rbx),%rdx
  21:	48 8d 43 68          	lea    0x68(%rbx),%rax
  25:	48 39 c2             	cmp    %rax,%rdx
  28:	74 4e                	je     0x78
  2a:*	0f 0b                	ud2    		<-- trapping instruction
  2c:	48 c7 c6 e0 1d 42 82 	mov    $0xffffffff82421de0,%rsi
  33:	48 8d 95 b0 00 00 00 	lea    0xb0(%rbp),%rdx
  3a:	48                   	rex.W
  3b:	c7                   	.byte 0xc7
  3c:	c7                   	(bad)  
  3d:	68                   	.byte 0x68
  3e:	a9                   	.byte 0xa9
  3f:	93                   	xchg   %eax,%ebx

Code starting with the faulting instruction
===========================================
   0:	0f 0b                	ud2    
   2:	48 c7 c6 e0 1d 42 82 	mov    $0xffffffff82421de0,%rsi
   9:	48 8d 95 b0 00 00 00 	lea    0xb0(%rbp),%rdx
  10:	48                   	rex.W
  11:	c7                   	.byte 0xc7
  12:	c7                   	(bad)  
  13:	68                   	.byte 0x68
  14:	a9                   	.byte 0xa9
  15:	93                   	xchg   %eax,%ebx
[   30.531233][    T1] RSP: 0000:ffffc90000073dd8 EFLAGS: 00010002
[   30.537151][    T1] RAX: ffff88a444cd1000 RBX: ffff88a444ce6600 RCX: 0000000000000000
[   30.544968][    T1] RDX: ffff88a444ce665c RSI: 0000000000000286 RDI: ffff88a4444c4000
[   30.552785][    T1] RBP: ffff88a444cd1000 R08: 0004afcaac775f46 R09: 0004afcaac775f46
[   30.560605][    T1] R10: ffff88984f050840 R11: 0000000000008070 R12: ffff88a444cd1020
[   30.568430][    T1] R13: ffffc90000073e00 R14: 0000000000000462 R15: 0000000000000000
[   30.576246][    T1] FS:  0000000000000000(0000) GS:ffff88afcf8c0000(0000) knlGS:0000000000000000
[   30.585017][    T1] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   30.591447][    T1] CR2: 0000000000000000 CR3: 000000303e01c001 CR4: 00000000007706f0
[   30.599266][    T1] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[   30.607085][    T1] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[   30.614910][    T1] PKRU: 55555554
[   30.618314][    T1] Call Trace:
[   30.621453][    T1]  <TASK>
[ 30.624242][ T1] ? destroy_workqueue (kernel/workqueue.c:4842 (discriminator 1)) 
[ 30.629201][ T1] ? __warn (kernel/panic.c:677) 
[ 30.633129][ T1] ? destroy_workqueue (kernel/workqueue.c:4842 (discriminator 1)) 
[ 30.638091][ T1] ? report_bug (lib/bug.c:180 lib/bug.c:219) 
[ 30.642454][ T1] ? handle_bug (arch/x86/kernel/traps.c:237) 
[ 30.646639][ T1] ? exc_invalid_op (arch/x86/kernel/traps.c:258 (discriminator 1)) 
[ 30.651171][ T1] ? asm_exc_invalid_op (arch/x86/include/asm/idtentry.h:568) 
[ 30.656049][ T1] ? destroy_workqueue (kernel/workqueue.c:4842 (discriminator 1)) 
[ 30.661009][ T1] ? destroy_workqueue (kernel/workqueue.c:4783 kernel/workqueue.c:4842) 
[ 30.665888][ T1] ? __pfx_ftrace_check_sync (kernel/trace/ftrace.c:3803) 
[ 30.671200][ T1] ftrace_check_sync (kernel/trace/ftrace.c:3808) 
[ 30.675820][ T1] do_one_initcall (init/main.c:1236) 
[ 30.680354][ T1] do_initcalls (init/main.c:1297 init/main.c:1314) 
[ 30.684625][ T1] kernel_init_freeable (init/main.c:1555) 
[ 30.689678][ T1] ? __pfx_kernel_init (init/main.c:1433) 
[ 30.694471][ T1] kernel_init (init/main.c:1443) 
[ 30.698658][ T1] ret_from_fork (arch/x86/kernel/process.c:147) 
[ 30.702927][ T1] ? __pfx_kernel_init (init/main.c:1433) 
[ 30.707713][ T1] ret_from_fork_asm (arch/x86/entry/entry_64.S:250) 
[   30.712333][    T1]  </TASK>
[   30.715217][    T1] ---[ end trace 0000000000000000 ]---
[   30.720522][    T1] destroy_workqueue: ftrace_check_wq has the following busy pwq
[   30.728002][    T1]   pwq 452: cpus=0-223 node=3 flags=0x4 nice=0 active=0/256 refcnt=56


The kernel config and materials to reproduce are available at:
https://download.01.org/0day-ci/archive/20240103/202401031025.95761451-oliver.sang@intel.com



-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
Re: [PATCH 2/7] workqueue: Share the same PWQ for the CPUs of a pod
Posted by Lai Jiangshan 1 year, 11 months ago
On Wed, Jan 3, 2024 at 10:55 AM kernel test robot <oliver.sang@intel.com> wrote:


Hello 0-DAY CI Kernel Test Team

> If you fix the issue in a separate patch/commit (i.e. not just a new version of
> the same patch/commit), kindly add following tags
> | Reported-by: kernel test robot <oliver.sang@intel.com>
> | Closes: https://lore.kernel.org/oe-lkp/202401031025.95761451-oliver.sang@intel.com
>
>
> [   30.471685][    T1] ------------[ cut here ]------------
> [ 30.476998][ T1] WARNING: CPU: 111 PID: 1 at kernel/workqueue.c:4842 destroy_workqueue (kernel/workqueue.c:4842 (discriminator 1))

It hits the check here

        if ((pwq != pwq->wq->dfl_pwq) && (pwq->refcnt > 1))
                return true;

With this patch, not only the default pwq but also other pwqs
end up installed (and referenced) multiple times.

Maybe pwq->installed_refcnt needs to be introduced to fix it.

Thanks
Lai