[PATCH 1/4] workqueue: Reap workers via kthread_stop() and remove detach_completion

Posted by Lai Jiangshan 1 year, 5 months ago
From: Lai Jiangshan <jiangshan.ljs@antgroup.com>

The code to kick off the destruction of workers is now in a process
context (idle_cull_fn()), so kthread_stop() can be used in the process
context to replace the work of pool->detach_completion.

The wakeup in wake_dying_workers() is unneeded after this change, but it
is harmless; just keep it here until the next patch renames wake_dying_workers(),
rather than renaming it again and again.
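
For reference, kthread_stop_put(), used below to stop and release the
dying workers, is roughly equivalent to the following (a sketch based on
kernel/kthread.c, not part of this patch):

	int kthread_stop_put(struct task_struct *k)
	{
		int ret;

		/* wake the thread up (if needed) and wait until it exits */
		ret = kthread_stop(k);
		/* drop the task struct reference, here the one taken in set_worker_dying() */
		put_task_struct(k);
		return ret;
	}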

Cc: Valentin Schneider <vschneid@redhat.com>
Signed-off-by: Lai Jiangshan <jiangshan.ljs@antgroup.com>
---
 kernel/workqueue.c | 35 +++++++++++++++++++----------------
 1 file changed, 19 insertions(+), 16 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 25a7d9a1a7ae..a0fb2f60e938 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -216,7 +216,6 @@ struct worker_pool {
 	struct worker		*manager;	/* L: purely informational */
 	struct list_head	workers;	/* A: attached workers */
 	struct list_head        dying_workers;  /* A: workers about to die */
-	struct completion	*detach_completion; /* all workers detached */
 
 	struct ida		worker_ida;	/* worker IDs for task name */
 
@@ -2696,7 +2695,6 @@ static void worker_attach_to_pool(struct worker *worker,
 static void worker_detach_from_pool(struct worker *worker)
 {
 	struct worker_pool *pool = worker->pool;
-	struct completion *detach_completion = NULL;
 
 	/* there is one permanent BH worker per CPU which should never detach */
 	WARN_ON_ONCE(pool->flags & POOL_BH);
@@ -2707,15 +2705,10 @@ static void worker_detach_from_pool(struct worker *worker)
 	list_del(&worker->node);
 	worker->pool = NULL;
 
-	if (list_empty(&pool->workers) && list_empty(&pool->dying_workers))
-		detach_completion = pool->detach_completion;
 	mutex_unlock(&wq_pool_attach_mutex);
 
 	/* clear leftover flags without pool->lock after it is detached */
 	worker->flags &= ~(WORKER_UNBOUND | WORKER_REBOUND);
-
-	if (detach_completion)
-		complete(detach_completion);
 }
 
 /**
@@ -2816,10 +2809,9 @@ static void unbind_worker(struct worker *worker)
 
 static void wake_dying_workers(struct list_head *cull_list)
 {
-	struct worker *worker, *tmp;
+	struct worker *worker;
 
-	list_for_each_entry_safe(worker, tmp, cull_list, entry) {
-		list_del_init(&worker->entry);
+	list_for_each_entry(worker, cull_list, entry) {
 		unbind_worker(worker);
 		/*
 		 * If the worker was somehow already running, then it had to be
@@ -2835,6 +2827,17 @@ static void wake_dying_workers(struct list_head *cull_list)
 	}
 }
 
+static void reap_dying_workers(struct list_head *cull_list)
+{
+	struct worker *worker, *tmp;
+
+	list_for_each_entry_safe(worker, tmp, cull_list, entry) {
+		list_del_init(&worker->entry);
+		kthread_stop_put(worker->task);
+		kfree(worker);
+	}
+}
+
 /**
  * set_worker_dying - Tag a worker for destruction
  * @worker: worker to be destroyed
@@ -2866,6 +2869,9 @@ static void set_worker_dying(struct worker *worker, struct list_head *list)
 
 	list_move(&worker->entry, list);
 	list_move(&worker->node, &pool->dying_workers);
+
+	/* get an extra task struct reference for later kthread_stop_put() */
+	get_task_struct(worker->task);
 }
 
 /**
@@ -2949,6 +2955,8 @@ static void idle_cull_fn(struct work_struct *work)
 	raw_spin_unlock_irq(&pool->lock);
 	wake_dying_workers(&cull_list);
 	mutex_unlock(&wq_pool_attach_mutex);
+
+	reap_dying_workers(&cull_list);
 }
 
 static void send_mayday(struct work_struct *work)
@@ -3330,7 +3338,6 @@ static int worker_thread(void *__worker)
 		ida_free(&pool->worker_ida, worker->id);
 		worker_detach_from_pool(worker);
 		WARN_ON_ONCE(!list_empty(&worker->entry));
-		kfree(worker);
 		return 0;
 	}
 
@@ -4863,7 +4870,6 @@ static void rcu_free_pool(struct rcu_head *rcu)
  */
 static void put_unbound_pool(struct worker_pool *pool)
 {
-	DECLARE_COMPLETION_ONSTACK(detach_completion);
 	struct worker *worker;
 	LIST_HEAD(cull_list);
 
@@ -4917,12 +4923,9 @@ static void put_unbound_pool(struct worker_pool *pool)
 
 	wake_dying_workers(&cull_list);
 
-	if (!list_empty(&pool->workers) || !list_empty(&pool->dying_workers))
-		pool->detach_completion = &detach_completion;
 	mutex_unlock(&wq_pool_attach_mutex);
 
-	if (pool->detach_completion)
-		wait_for_completion(pool->detach_completion);
+	reap_dying_workers(&cull_list);
 
 	/* shut down the timers */
 	del_timer_sync(&pool->idle_timer);
-- 
2.19.1.6.gb485710b
Re: [PATCH 1/4] workqueue: Reap workers via kthread_stop() and remove detach_completion
Posted by Marc Hartmayer 1 year, 4 months ago
On Fri, Jun 21, 2024 at 03:32 PM +0800, Lai Jiangshan <jiangshanlai@gmail.com> wrote:
> From: Lai Jiangshan <jiangshan.ljs@antgroup.com>
>
> The code to kick off the destruction of workers is now in a process
> context (idle_cull_fn()), so kthread_stop() can be used in the process
> context to replace the work of pool->detach_completion.
>
> The wakeup in wake_dying_workers() is unneeded after this change, but it
> is harmless; just keep it here until the next patch renames wake_dying_workers(),
> rather than renaming it again and again.
>
> Cc: Valentin Schneider <vschneid@redhat.com>
> Signed-off-by: Lai Jiangshan <jiangshan.ljs@antgroup.com>
> ---
>  kernel/workqueue.c | 35 +++++++++++++++++++----------------
>  1 file changed, 19 insertions(+), 16 deletions(-)
>

Hi Lai,

A bisect of a regression in our CI on s390x led to this patch. The bug
is pretty easy to reproduce (currently I have only tested it on s390x; I will
try to test it on x86 as well):

1. Start a Linux QEMU/KVM guest with 2 cores using this patch and enable
   `panic_on_warn=1` for the guest kernel.
2. Run the following command in the KVM guest:

  $  dd if=/dev/zero of=/dev/null & while : ; do chcpu -d 1; chcpu -e 1; done

3. Wait for the crash. e.g.:

2024/07/23 18:01:21 [M83LP63]: [  157.267727] ------------[ cut here ]------------
2024/07/23 18:01:21 [M83LP63]: [  157.267735] WARNING: CPU: 21 PID: 725 at kernel/workqueue.c:3340 worker_thread+0x54e/0x558
2024/07/23 18:01:21 [M83LP63]: [  157.267746] Modules linked in: binfmt_misc nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct nft_chain_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 ip_set nf_tables sunrpc dm_service_time s390_trng vfio_ccw mdev vfio_iommu_type1 vfio sch_fq_codel
2024/07/23 18:01:21 [M83LP63]: loop dm_multipath configfs nfnetlink lcs ctcm fsm zfcp scsi_transport_fc ghash_s390 prng chacha_s390 libchacha aes_s390 des_s390 libdes sha3_512_s390 sha3_256_s390 sha512_s390 sha256_s390 sha1_s390 sha_common scm_block eadm_sch scsi_dh_rdac scsi_dh_emc scsi_dh_alua pkey zcrypt rng_core autofs4
2024/07/23 18:01:21 [M83LP63]: [  157.267792] CPU: 21 PID: 725 Comm: kworker/dying Not tainted 6.10.0-rc2-00239-g68f83057b913 #95
2024/07/23 18:01:21 [M83LP63]: [  157.267796] Hardware name: IBM 3906 M04 704 (LPAR)
2024/07/23 18:01:21 [M83LP63]: [  157.267802]            R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:1 PM:0 RI:0 EA:3
2024/07/23 18:01:21 [M83LP63]: [  157.267797] Krnl PSW : 0704d00180000000 000003d600fcd9fa (worker_thread+0x552/0x558)
2024/07/23 18:01:21 [M83LP63]: [  157.267806] Krnl GPRS: 6479696e6700776f 000002c901b62780 000003d602493ec8 000002c914954600
2024/07/23 18:01:21 [M83LP63]: [  157.267809]            0000000000000000 0000000000000008 000002c901a85400 000002c90719e840
2024/07/23 18:01:21 [M83LP63]: [  157.267811]            000002c90719e880 000002c901a85420 000002c91127adf0 000002c901a85400
2024/07/23 18:01:21 [M83LP63]: [  157.267813]            000002c914954600 0000000000000000 000003d600fcd772 000003560452bd98
2024/07/23 18:01:21 [M83LP63]: [  157.267822] Krnl Code: 000003d600fcd9ec: c0e500674262        brasl   %r14,000003d601cb5eb0
2024/07/23 18:01:21 [M83LP63]: [  157.267822]            000003d600fcd9f2: a7f4ffc8            brc     15,000003d600fcd982
2024/07/23 18:01:21 [M83LP63]: [  157.267822]           #000003d600fcd9f6: af000000            mc      0,0
2024/07/23 18:01:21 [M83LP63]: [  157.267822]           >000003d600fcd9fa: a7f4fec2            brc     15,000003d600fcd77e
2024/07/23 18:01:21 [M83LP63]: [  157.267822]            000003d600fcd9fe: 0707                bcr     0,%r7
2024/07/23 18:01:21 [M83LP63]: [  157.267822]            000003d600fcda00: c00400682e10        brcl    0,000003d601cd3620
2024/07/23 18:01:21 [M83LP63]: [  157.267822]            000003d600fcda06: eb7ff0500024        stmg    %r7,%r15,80(%r15)
2024/07/23 18:01:21 [M83LP63]: [  157.267822]            000003d600fcda0c: b90400ef            lgr     %r14,%r15
2024/07/23 18:01:21 [M83LP63]: [  157.267853] Call Trace:
2024/07/23 18:01:21 [M83LP63]: [  157.267855]  [<000003d600fcd9fa>] worker_thread+0x552/0x558
2024/07/23 18:01:21 [M83LP63]: [  157.267859] ([<000003d600fcd772>] worker_thread+0x2ca/0x558)
2024/07/23 18:01:21 [M83LP63]: [  157.267862]  [<000003d600fd6c80>] kthread+0x120/0x128
2024/07/23 18:01:21 [M83LP63]: [  157.267865]  [<000003d600f5305c>] __ret_from_fork+0x3c/0x58
2024/07/23 18:01:21 [M83LP63]: [  157.267868]  [<000003d601cc746a>] ret_from_fork+0xa/0x30
2024/07/23 18:01:21 [M83LP63]: [  157.267873] Last Breaking-Event-Address:
2024/07/23 18:01:21 [M83LP63]: [  157.267874]  [<000003d600fcd778>] worker_thread+0x2d0/0x558
2024/07/23 18:01:21 [M83LP63]: [  157.267879] Kernel panic - not syncing: kernel: panic_on_warn set ...
2024/07/23 18:01:22 [M83LP63]: [  157.267881] CPU: 21 PID: 725 Comm: kworker/dying Not tainted 6.10.0-rc2-00239-g68f83057b913 #95
2024/07/23 18:01:22 [M83LP63]: [  157.267884] Hardware name: IBM 3906 M04 704 (LPAR)
2024/07/23 18:01:22 [M83LP63]: [  157.267885] Call Trace:
2024/07/23 18:01:22 [M83LP63]: [  157.267886]  [<000003d600fa7f8c>] panic+0x1ec/0x308
2024/07/23 18:01:22 [M83LP63]: [  157.267892]  [<000003d600fa822c>] check_panic_on_warn+0x84/0x88
2024/07/23 18:01:22 [M83LP63]: [  157.267896]  [<000003d600fa846e>] __warn+0xa6/0x160
2024/07/23 18:01:22 [M83LP63]: [  157.267899]  [<000003d601c8ac7e>] report_bug+0x146/0x1c0
2024/07/23 18:01:22 [M83LP63]: [  157.267903]  [<000003d600f50e64>] monitor_event_exception+0x44/0x80
2024/07/23 18:01:22 [M83LP63]: [  157.267905]  [<000003d601cb67e0>] __do_pgm_check+0xf0/0x1b0
2024/07/23 18:01:22 [M83LP63]: [  157.267911]  [<000003d601cc75a8>] pgm_check_handler+0x118/0x168
2024/07/23 18:01:22 [M83LP63]: [  157.267914]  [<000003d600fcd9fa>] worker_thread+0x552/0x558
2024/07/23 18:01:22 [M83LP63]: [  157.267917] ([<000003d600fcd772>] worker_thread+0x2ca/0x558)
2024/07/23 18:01:22 [M83LP63]: [  157.267920]  [<000003d600fd6c80>] kthread+0x120/0x128
2024/07/23 18:01:22 [M83LP63]: [  157.267923]  [<000003d600f5305c>] __ret_from_fork+0x3c/0x58
2024/07/23 18:01:22 [M83LP63]: [  157.267926]  [<000003d601cc746a>] ret_from_fork+0xa/0x30

-- 
Kind regards / Beste Grüße
   Marc Hartmayer

IBM Deutschland Research & Development GmbH
Vorsitzender des Aufsichtsrats: Wolfgang Wendt
Geschäftsführung: David Faller
Sitz der Gesellschaft: Böblingen
Registergericht: Amtsgericht Stuttgart, HRB 243294
Re: [PATCH 1/4] workqueue: Reap workers via kthread_stop() and remove detach_completion
Posted by Marc Hartmayer 1 year, 3 months ago
On Tue, Jul 23, 2024 at 06:19 PM +0200, "Marc Hartmayer" <mhartmay@linux.ibm.com> wrote:
> On Fri, Jun 21, 2024 at 03:32 PM +0800, Lai Jiangshan <jiangshanlai@gmail.com> wrote:
>> From: Lai Jiangshan <jiangshan.ljs@antgroup.com>
>>
>> The code to kick off the destruction of workers is now in a process
>> context (idle_cull_fn()), so kthread_stop() can be used in the process
>> context to replace the work of pool->detach_completion.
>>
>> The wakeup in wake_dying_workers() is unneeded after this change, but it
>> is harmless; just keep it here until the next patch renames wake_dying_workers(),
>> rather than renaming it again and again.
>>
>> Cc: Valentin Schneider <vschneid@redhat.com>
>> Signed-off-by: Lai Jiangshan <jiangshan.ljs@antgroup.com>
>> ---
>>  kernel/workqueue.c | 35 +++++++++++++++++++----------------
>>  1 file changed, 19 insertions(+), 16 deletions(-)
>>
>

Hi Lai,

I’m not sure whether this NULL-pointer crash is related to this patch
series, but it is triggered by the same test that also triggered the
other problem that I reported.

        [   23.133876] Unable to handle kernel pointer dereference in virtual kernel address space
        [   23.133950] Failing address: 0000000000000000 TEID: 0000000000000483
        [   23.133954] Fault in home space mode while using kernel ASCE.
        [   23.133957] AS:000000001b8f0007 R3:0000000056cf4007 S:0000000056cf3800 P:000000000000003d 
        [   23.134207] Oops: 0004 ilc:2 [#1] SMP 
        [   23.134273] Modules linked in: essiv authenc dm_crypt encrypted_keys loop pkey zcrypt s390_trng rng_core ghash_s390 prng chacha_s390 libchacha aes_s390 des_s390 virtio_console libdes vmw_vsock_virtio_transport vmw_vsock_virtio_transport_common sha3_512_s390 vsock sha3_256_s390 sha512_s390 sha256_s390 sha1_s390 sha_common vfio_ccw mdev vfio_iommu_type1 vfio sch_fq_codel drm i2c_core drm_panel_orientation_quirks configfs autofs4
        [   23.134386] CPU: 0 UID: 0 PID: 376 Comm: kworker/u10:2 Not tainted 6.11.0-20240902.rc6.git1.67784a74e258.300.fc40.s390x+git #1
        [   23.134394] Hardware name: IBM 8561 T01 703 (KVM/Linux)
        [   23.134406] Workqueue:  0x0 ()
        [   23.134440] Krnl PSW : 0404c00180000000 0000024e326caf28 (worker_thread+0x48/0x430)
        [   23.134471]            R:0 T:1 IO:0 EX:0 Key:0 M:1 W:0 P:0 AS:3 CC:0 PM:0 RI:0 EA:3
        [   23.134474] Krnl GPRS: 0000000058778000 0000000000000000 0000024e00000001 0000000058778000
        [   23.134476]            0000000000000000 0000000058778000 0000000057b8e240 0000000000000002
        [   23.134480]            0000000000000000 0000000000000028 0000000000000000 0000000057b8e240
        [   23.134481]            0000000058778000 0000000058778000 0000024e326caf18 000001ce32953d88
        [   23.134499] Krnl Code: 0000024e326caf1c: acfcf0c8		stnsm	200(%r15),252
        [   23.134499]            0000024e326caf20: a7180000		lhi	%r1,0
        [   23.134499]           #0000024e326caf24: 582083ac		l	%r2,940(%r8)
        [   23.134499]           >0000024e326caf28: ba12a000		cs	%r1,%r2,0(%r10)
        [   23.134499]            0000024e326caf2c: a77400cf		brc	7,0000024e326cb0ca
        [   23.134499]            0000024e326caf30: 5800b078		l	%r0,120(%r11)
        [   23.134499]            0000024e326caf34: a7010002		tmll	%r0,2
        [   23.134499]            0000024e326caf38: a77400d4		brc	7,0000024e326cb0e0
        [   23.134516] Call Trace:
        [   23.134520]  [<0000024e326caf28>] worker_thread+0x48/0x430 
        [   23.134525] ([<0000024e326caf18>] worker_thread+0x38/0x430)
        [   23.134528]  [<0000024e326d3a3e>] kthread+0x11e/0x130 
        [   23.134533]  [<0000024e3264b0dc>] __ret_from_fork+0x3c/0x60 
        [   23.134536]  [<0000024e333fb37a>] ret_from_fork+0xa/0x38 
        [   23.134552] Last Breaking-Event-Address:
        [   23.134553]  [<0000024e333f4c04>] mutex_unlock+0x24/0x30
        [   23.134562] Kernel panic - not syncing: Fatal exception: panic_on_oops

This happened with Linux
6.11.0-20240902.rc6.git1.67784a74e258.300.fc40.s390x (using defconfig),
but also with an older commit
6.11.0-20240719.rc0.git15.720261cfc732.300.fc40.s390x on s390x (both
kernels contain your patches). I have not bisected/debugged the
problem yet, but you may have an idea already. Will try to reproduce the
problem and give you more debug information.

Thanks!

[…snip]

-- 
Kind regards / Beste Grüße
   Marc Hartmayer

IBM Deutschland Research & Development GmbH
Vorsitzender des Aufsichtsrats: Wolfgang Wendt
Geschäftsführung: David Faller
Sitz der Gesellschaft: Böblingen
Registergericht: Amtsgericht Stuttgart, HRB 243294
Re: [PATCH 1/4] workqueue: Reap workers via kthread_stop() and remove detach_completion
Posted by Marc Hartmayer 1 year, 3 months ago
On Tue, Sep 10, 2024 at 11:45 AM +0200, "Marc Hartmayer" <mhartmay@linux.ibm.com> wrote:
> On Tue, Jul 23, 2024 at 06:19 PM +0200, "Marc Hartmayer" <mhartmay@linux.ibm.com> wrote:
>> On Fri, Jun 21, 2024 at 03:32 PM +0800, Lai Jiangshan <jiangshanlai@gmail.com> wrote:
>>> From: Lai Jiangshan <jiangshan.ljs@antgroup.com>
>>>
>>> The code to kick off the destruction of workers is now in a process
>>> context (idle_cull_fn()), so kthread_stop() can be used in the process
>>> context to replace the work of pool->detach_completion.
>>>
>>> The wakeup in wake_dying_workers() is unneeded after this change, but it
>>> is harmless; just keep it here until the next patch renames wake_dying_workers(),
>>> rather than renaming it again and again.
>>>
>>> Cc: Valentin Schneider <vschneid@redhat.com>
>>> Signed-off-by: Lai Jiangshan <jiangshan.ljs@antgroup.com>
>>> ---
>>>  kernel/workqueue.c | 35 +++++++++++++++++++----------------
>>>  1 file changed, 19 insertions(+), 16 deletions(-)
>>>
>>

[…snip…]

Hi Lai,

I’ve reproduced the issue using the Linux commit bc83b4d1f086. Here is
the “beautified” stacktrace (output of
`$LINUX/scripts/decode_stacktrace.sh vmlinux auto < dmesg.txt`).

...
[   14.271265] Unable to handle kernel pointer dereference in virtual kernel address space
[   14.271314] Failing address: 0000000000000000 TEID: 0000000000000483
[   14.271317] Fault in home space mode while using kernel ASCE.
[   14.271320] AS:000000001df84007 R3:0000000064888007 S:0000000064887800 P:000000000000003d
[   14.271519] Oops: 0004 ilc:2 [#1] SMP
[   14.271570] Modules linked in: essiv authenc dm_crypt encrypted_keys loop vmw_vsock_virtio_transport vmw_vsock_virtio_transport_common pkey vsock zcrypt s390_trng rng_core ghash_s390 prng chacha_s390 libchacha virtio_console aes_s390 des_s390 libdes sha3_512_s390 sha3_256_s390 sha512_s390 sha256_s390 sha1_s390 sha_common vfio_ccw mdev vfio_iommu_type1 vfio sch_fq_codel drm i2c_core drm_panel_orientation_quirks configfs autofs4
[   14.271661] CPU: 0 UID: 0 PID: 324 Comm: kworker/u10:2 Not tainted 6.11.0-20240909.rc7.git8.bc83b4d1f086.300.fc40.s390x+git #1
[   14.271667] Hardware name: IBM 8561 T01 701 (KVM/Linux)
[   14.271677] Workqueue: . 0x0 ()
[   14.271702] Krnl PSW : 0404c00180000000 000002d8c205ef28 worker_thread (./arch/s390/include/asm/atomic_ops.h:198 ./arch/s390/include/asm/spinlock.h:61 ./arch/s390/include/asm/spinlock.h:66 ./include/linux/spinlock.h:187 ./include/linux/spinlock_api_smp.h:120 kernel/workqueue.c:3346)
[   14.271728]            R:0 T:1 IO:0 EX:0 Key:0 M:1 W:0 P:0 AS:3 CC:0 PM:0 RI:0 EA:3
[   14.271730] Krnl GPRS: 00000000673f0000 0000000000000000 000002d800000001 00000000673f0000
[   14.271732]            0000000000000000 00000000673f0000 0000000067ac5b40 0000000000000002
[   14.271735]            0000000000000000 0000000000000028 0000000000000000 0000000067ac5b40
[   14.271736]            00000000673f0000 00000000673f0000 000002d8c205ef18 00000258c22a7d88
[ 14.271752] Krnl Code: 000002d8c205ef1c: acfcf0c8 stnsm 200(%r15),252
objdump: '/tmp/tmp.yRzOQQynJL.o': No such file
objdump: '/tmp/tmp.yRzOQQynJL.o': No such file
All code
========

Code starting with the faulting instruction
===========================================
000002d8c205ef20: a7180000            lhi     %r1,0
#000002d8c205ef24: 582083ac            l       %r2,940(%r8)
>000002d8c205ef28: ba12a000            cs      %r1,%r2,0(%r10)
000002d8c205ef2c: a77400cf            brc     7,000002d8c205f0ca
000002d8c205ef30: 5800b078            l       %r0,120(%r11)
000002d8c205ef34: a7010002            tmll    %r0,2
000002d8c205ef38: a77400d4            brc     7,000002d8c205f0e0
[   14.271766] Call Trace:
[   14.271769] worker_thread (./arch/s390/include/asm/atomic_ops.h:198 ./arch/s390/include/asm/spinlock.h:61 ./arch/s390/include/asm/spinlock.h:66 ./include/linux/spinlock.h:187 ./include/linux/spinlock_api_smp.h:120 kernel/workqueue.c:3346)
[   14.271774] worker_thread (./arch/s390/include/asm/lowcore.h:226 ./arch/s390/include/asm/spinlock.h:61 ./arch/s390/include/asm/spinlock.h:66 ./include/linux/spinlock.h:187 ./include/linux/spinlock_api_smp.h:120 kernel/workqueue.c:3346)
[   14.271777] kthread (kernel/kthread.c:389)
[   14.271781] __ret_from_fork (arch/s390/kernel/process.c:62)
[   14.271784] ret_from_fork (arch/s390/kernel/entry.S:309)
[   14.271806] Last Breaking-Event-Address:
[   14.271807] mutex_unlock (kernel/locking/mutex.c:549)

So it seems to me that `worker->pool` is NULL in the
`workqueue.c:worker_thread` function and this leads to the crash.
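
For context, the faulting compare-and-swap matches the first pool->lock
acquisition in worker_thread(); a sketch of the relevant lines around
kernel/workqueue.c:3346, assuming worker->pool is indeed NULL at that
point:

	static int worker_thread(void *__worker)
	{
		struct worker *worker = __worker;
		struct worker_pool *pool = worker->pool;	/* NULL if already detached */

		/* tell the scheduler that this is a workqueue worker */
		set_pf_worker(true);
	woke_up:
		raw_spin_lock_irq(&pool->lock);	/* the cs on 0(%r10) above: pool == NULL */
		...
	}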

My next step is to try to bisect the bug.

-- 
Kind regards / Beste Grüße
   Marc Hartmayer

IBM Deutschland Research & Development GmbH
Vorsitzender des Aufsichtsrats: Wolfgang Wendt
Geschäftsführung: David Faller
Sitz der Gesellschaft: Böblingen
Registergericht: Amtsgericht Stuttgart, HRB 243294
Re: [PATCH 1/4] workqueue: Reap workers via kthread_stop() and remove detach_completion
Posted by Lai Jiangshan 1 year, 3 months ago
Hello, Marc

On Wed, Sep 11, 2024 at 12:29 AM Marc Hartmayer <mhartmay@linux.ibm.com> wrote:
> Code starting with the faulting instruction
> ===========================================
> 000002d8c205ef20: a7180000            lhi     %r1,0
> #000002d8c205ef24: 582083ac            l       %r2,940(%r8)
> >000002d8c205ef28: ba12a000            cs      %r1,%r2,0(%r10)
> 000002d8c205ef2c: a77400cf            brc     7,000002d8c205f0ca
> 000002d8c205ef30: 5800b078            l       %r0,120(%r11)
> 000002d8c205ef34: a7010002            tmll    %r0,2
> 000002d8c205ef38: a77400d4            brc     7,000002d8c205f0e0
> [   14.271766] Call Trace:
> [   14.271769] worker_thread (./arch/s390/include/asm/atomic_ops.h:198 ./arch/s390/include/asm/spinlock.h:61 ./arch/s390/include/asm/spinlock.h:66 ./include/linux/spinlock.h:187 ./include/linux/spinlock_api_smp.h:120 kernel/workqueue.c:3346)
> [   14.271774] worker_thread (./arch/s390/include/asm/lowcore.h:226 ./arch/s390/include/asm/spinlock.h:61 ./arch/s390/include/asm/spinlock.h:66 ./include/linux/spinlock.h:187 ./include/linux/spinlock_api_smp.h:120 kernel/workqueue.c:3346)
> [   14.271777] kthread (kernel/kthread.c:389)
> [   14.271781] __ret_from_fork (arch/s390/kernel/process.c:62)
> [   14.271784] ret_from_fork (arch/s390/kernel/entry.S:309)
> [   14.271806] Last Breaking-Event-Address:
> [   14.271807] mutex_unlock (kernel/locking/mutex.c:549)
>
> So it seems to me that `worker->pool` is NULL in the
> `workqueue.c:worker_thread` function and this leads to the crash.
>

I'm not familiar with s390 asm code, but it might be the case that
`worker->pool` is NULL in worker_thread(), since detach_worker()
resets worker->pool to NULL.

If that is the case, READ_ONCE(worker->pool) should be used in worker_thread()
to fix the problem.

(It would be weird to me if worker->pool were read from memory multiple
times in worker_thread(), since it is used many times; but since
READ_ONCE() is not used, that is possible.)
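
Roughly, the hazard would look like this (a sketch, not the actual code):

	/* worker_thread(), as written today: */
	struct worker_pool *pool = worker->pool;
	...
	raw_spin_lock_irq(&pool->lock);	/* compiler is allowed to re-read worker->pool */

	/* detach_worker(), running concurrently: */
	worker->pool = NULL;

	/* with READ_ONCE(), worker->pool would be loaded exactly once: */
	struct worker_pool *pool = READ_ONCE(worker->pool);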

Thanks
Lai
Re: [PATCH 1/4] workqueue: Reap workers via kthread_stop() and remove detach_completion
Posted by Lai Jiangshan 1 year, 3 months ago
On Wed, Sep 11, 2024 at 11:23 AM Lai Jiangshan <jiangshanlai@gmail.com> wrote:
>
> Hello, Marc
>
> On Wed, Sep 11, 2024 at 12:29 AM Marc Hartmayer <mhartmay@linux.ibm.com> wrote:
> > Code starting with the faulting instruction
> > ===========================================
> > 000002d8c205ef20: a7180000            lhi     %r1,0
> > #000002d8c205ef24: 582083ac            l       %r2,940(%r8)
> > >000002d8c205ef28: ba12a000            cs      %r1,%r2,0(%r10)
> > 000002d8c205ef2c: a77400cf            brc     7,000002d8c205f0ca
> > 000002d8c205ef30: 5800b078            l       %r0,120(%r11)
> > 000002d8c205ef34: a7010002            tmll    %r0,2
> > 000002d8c205ef38: a77400d4            brc     7,000002d8c205f0e0
> > [   14.271766] Call Trace:
> > [   14.271769] worker_thread (./arch/s390/include/asm/atomic_ops.h:198 ./arch/s390/include/asm/spinlock.h:61 ./arch/s390/include/asm/spinlock.h:66 ./include/linux/spinlock.h:187 ./include/linux/spinlock_api_smp.h:120 kernel/workqueue.c:3346)
> > [   14.271774] worker_thread (./arch/s390/include/asm/lowcore.h:226 ./arch/s390/include/asm/spinlock.h:61 ./arch/s390/include/asm/spinlock.h:66 ./include/linux/spinlock.h:187 ./include/linux/spinlock_api_smp.h:120 kernel/workqueue.c:3346)
> > [   14.271777] kthread (kernel/kthread.c:389)
> > [   14.271781] __ret_from_fork (arch/s390/kernel/process.c:62)
> > [   14.271784] ret_from_fork (arch/s390/kernel/entry.S:309)
> > [   14.271806] Last Breaking-Event-Address:
> > [   14.271807] mutex_unlock (kernel/locking/mutex.c:549)
> >
> > So it seems to me that `worker->pool` is NULL in the
> > `workqueue.c:worker_thread` function and this leads to the crash.
> >
>
> I'm not familiar with s390 asm code, but it might be the case that
> `worker->pool` is NULL in worker_thread(), since detach_worker()
> resets worker->pool to NULL.
>
> If that is the case, READ_ONCE(worker->pool) should be used in worker_thread()
> to fix the problem.
>
> (It would be weird to me if worker->pool were read from memory multiple
> times in worker_thread(), since it is used many times; but since
> READ_ONCE() is not used, that is possible.)

Oh, it is possible that the worker is created and then destroyed before
ever being woken up. If that is the case, READ_ONCE() won't help. I'm going
to explore whether "worker->pool = NULL;" can be moved out of detach_worker().
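
The ordering I am worried about, roughly (a sketch):

	/*
	 * create_worker()
	 *   kthread_create(worker_thread, ...)   task created, never run yet
	 * idle culling / pool destruction
	 *   detach_worker(worker)                worker->pool = NULL
	 *   kthread_stop_put(worker->task)       first wake-up of the task
	 * worker_thread()                        first run, destruction already started
	 *   pool = worker->pool;                 loads NULL
	 *   raw_spin_lock_irq(&pool->lock);      NULL pointer dereference
	 */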

Thanks
Lai
Re: [PATCH 1/4] workqueue: Reap workers via kthread_stop() and remove detach_completion
Posted by Marc Hartmayer 1 year, 3 months ago
On Wed, Sep 11, 2024 at 11:32 AM +0800, Lai Jiangshan <jiangshanlai@gmail.com> wrote:
> On Wed, Sep 11, 2024 at 11:23 AM Lai Jiangshan <jiangshanlai@gmail.com> wrote:
>>
>> Hello, Marc
>>

Hi Lai,

[…snip…]

>>
>> I'm not familiar with s390 asm code, but it might be the case that
>> `worker->pool` is NULL in worker_thread(), since detach_worker()
>> resets worker->pool to NULL.
>>
>> If that is the case, READ_ONCE(worker->pool) should be used in worker_thread()
>> to fix the problem.
>>
>> (It would be weird to me if worker->pool were read from memory multiple
>> times in worker_thread(), since it is used many times; but since
>> READ_ONCE() is not used, that is possible.)
>
> Oh, it is possible that the worker is created and then destroyed before
> ever being woken up. If that is the case, READ_ONCE() won't help. I'm going
> to explore whether "worker->pool = NULL;" can be moved out of
> detach_worker().

I’ll double-check whether my assumption (worker->pool == NULL) is true
or not. It may well be that my assumption is wrong.

Thanks for having a look!

>
> Thanks
> Lai
-- 
Kind regards / Beste Grüße
   Marc Hartmayer

IBM Deutschland Research & Development GmbH
Vorsitzender des Aufsichtsrats: Wolfgang Wendt
Geschäftsführung: David Faller
Sitz der Gesellschaft: Böblingen
Registergericht: Amtsgericht Stuttgart, HRB 243294
Re: [PATCH 1/4] workqueue: Reap workers via kthread_stop() and remove detach_completion
Posted by Marc Hartmayer 1 year, 3 months ago
On Wed, Sep 11, 2024 at 10:27 AM +0200, "Marc Hartmayer" <mhartmay@linux.ibm.com> wrote:
> On Wed, Sep 11, 2024 at 11:32 AM +0800, Lai Jiangshan <jiangshanlai@gmail.com> wrote:
>> On Wed, Sep 11, 2024 at 11:23 AM Lai Jiangshan <jiangshanlai@gmail.com> wrote:
>>>
>>> Hello, Marc
>>>
>
> Hi Lai,
>
> […snip…]
>
>>>
>>> I'm not familiar with s390 asm code, but it might be the case that
>>> `worker->pool` is NULL in worker_thread(), since detach_worker()
>>> resets worker->pool to NULL.
>>>
>>> If that is the case, READ_ONCE(worker->pool) should be used in worker_thread()
>>> to fix the problem.
>>>
>>> (It would be weird to me if worker->pool were read from memory multiple
>>> times in worker_thread(), since it is used many times; but since
>>> READ_ONCE() is not used, that is possible.)
>>
>> Oh, it is possible that the worker is created and then destroyed before
>> ever being woken up. If that is the case, READ_ONCE() won't help. I'm going
>> to explore whether "worker->pool = NULL;" can be moved out of
>> detach_worker().
>
> I’ll double-check whether my assumption (worker->pool == NULL) is true
> or not. It may well be that my assumption is wrong.

I applied the following patch on top of commit bc83b4d1f086 ("Merge tag
'bcachefs-2024-09-09' of git://evilpiepirate.org/bcachefs"):

From 9cd804f8e3183422b05a1b36e2544d1175736519 Mon Sep 17 00:00:00 2001
From: Marc Hartmayer <mhartmay@linux.ibm.com>
Date: Wed, 11 Sep 2024 09:11:41 +0000
Subject: [PATCH] Add printk-debug statements

Signed-off-by: Marc Hartmayer <mhartmay@linux.ibm.com>
---
 kernel/workqueue.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index e7b005ff3750..d4c5c68457f7 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -3338,11 +3338,16 @@ static void set_pf_worker(bool val)
 static int worker_thread(void *__worker)
 {
        struct worker *worker = __worker;
-       struct worker_pool *pool = worker->pool;
+       if (unlikely(!worker))
+               printk(KERN_ERR "OOOOOHHHH NOOOOOOO, WE DO NOT HAVE A WORKER.\n");
+
+       struct worker_pool *pool = READ_ONCE(worker->pool);

        /* tell the scheduler that this is a workqueue worker */
        set_pf_worker(true);
 woke_up:
+       if (unlikely(!pool))
+               printk(KERN_ERR "OOOOOHHHH NOOOOOOO, WE DO NOT HAVE A POOL.\n");
        raw_spin_lock_irq(&pool->lock);

        /* am I supposed to die? */
--
2.43.0

And it shows that pool is NULL when the crash occurs. Hope this helps.

>
> Thanks for having a look!
>
>>
>> Thanks
>> Lai
> -- 
> Kind regards / Beste Grüße
>    Marc Hartmayer
>
> IBM Deutschland Research & Development GmbH
> Vorsitzender des Aufsichtsrats: Wolfgang Wendt
> Geschäftsführung: David Faller
> Sitz der Gesellschaft: Böblingen
> Registergericht: Amtsgericht Stuttgart, HRB 243294
-- 
Kind regards / Beste Grüße
   Marc Hartmayer

IBM Deutschland Research & Development GmbH
Vorsitzender des Aufsichtsrats: Wolfgang Wendt
Geschäftsführung: David Faller
Sitz der Gesellschaft: Böblingen
Registergericht: Amtsgericht Stuttgart, HRB 243294
Re: [PATCH 1/4] workqueue: Reap workers via kthread_stop() and remove detach_completion
Posted by Lai Jiangshan 1 year, 4 months ago
Hello Marc

Thank you for the report.

On Wed, Jul 24, 2024 at 12:19 AM Marc Hartmayer <mhartmay@linux.ibm.com> wrote:

> Hi Lai,
>
> A bisect of a regression in our CI on s390x led to this patch. The bug
> is pretty easy to reproduce (currently I have only tested it on s390x; I will
> try to test it on x86 as well):

I can't reproduce it on x86 after testing for only 30 minutes, but it
can definitely happen on x86 in theory.

>
> 1. Start a Linux QEMU/KVM guest with 2 cores using this patch and enable
>    `panic_on_warn=1` for the guest kernel.
> 2. Run the following command in the KVM guest:
>
>   $  dd if=/dev/zero of=/dev/null & while : ; do chcpu -d 1; chcpu -e 1; done
>
> 3. Wait for the crash. e.g.:
>
> 2024/07/23 18:01:21 [M83LP63]: [  157.267727] ------------[ cut here ]------------
> 2024/07/23 18:01:21 [M83LP63]: [  157.267735] WARNING: CPU: 21 PID: 725 at kernel/workqueue.c:3340 worker_thread+0x54e/0x558


> @@ -3330,7 +3338,6 @@ static int worker_thread(void *__worker)
>                 ida_free(&pool->worker_ida, worker->id);
>                 worker_detach_from_pool(worker);
>                 WARN_ON_ONCE(!list_empty(&worker->entry));
> -               kfree(worker);
>                 return 0;
>         }

The condition "!list_empty(&worker->entry)" can be true when the
worker is still in the cull_list awaiting being reaped by
reap_dying_workers() after
this change.
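
The interleaving would be, roughly (a sketch):

	/*
	 * idle_cull_fn()
	 *   set_worker_dying(worker, &cull_list)       worker->entry moved onto cull_list
	 *   wake_dying_workers(&cull_list)
	 * worker_thread()                               woken dying worker exits
	 *   worker_detach_from_pool(worker)
	 *   WARN_ON_ONCE(!list_empty(&worker->entry))   can fire: entry still on cull_list
	 * idle_cull_fn()
	 *   reap_dying_workers(&cull_list)              list_del_init() only happens here
	 */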

I will remove the WARN_ON_ONCE().

Thanks
Lai
Re: [PATCH 1/4] workqueue: Reap workers via kthread_stop() and remove detach_completion
Posted by Lai Jiangshan 1 year, 4 months ago
Hello Marc

On Thu, Jul 25, 2024 at 8:11 AM Lai Jiangshan <jiangshanlai@gmail.com> wrote:
> > 2024/07/23 18:01:21 [M83LP63]: [  157.267727] ------------[ cut here ]------------
> > 2024/07/23 18:01:21 [M83LP63]: [  157.267735] WARNING: CPU: 21 PID: 725 at kernel/workqueue.c:3340 worker_thread+0x54e/0x558
>
>
> > @@ -3330,7 +3338,6 @@ static int worker_thread(void *__worker)
> >                 ida_free(&pool->worker_ida, worker->id);
> >                 worker_detach_from_pool(worker);
> >                 WARN_ON_ONCE(!list_empty(&worker->entry));
> > -               kfree(worker);
> >                 return 0;
> >         }
>
> The condition "!list_empty(&worker->entry)" can be true when the
> worker is still in the cull_list awaiting being reaped by
> reap_dying_workers() after
> this change.
>
> I will remove the WARN_ON_ONCE().
>

Here is the fix:
https://lore.kernel.org/lkml/20240725010437.694727-1-jiangshanlai@gmail.com/

Could you take a look, please?

Thank you for the report,
Lai