Forwarded: [PATCH] The hung_task report shows multiple io_ring_exit_work

syzbot posted 1 patch 1 week, 4 days ago
io_uring/sqpoll.c | 28 ++++++++++++++++++++++++----
1 file changed, 24 insertions(+), 4 deletions(-)
Forwarded: [PATCH] The hung_task report shows multiple io_ring_exit_work
Posted by syzbot 1 week, 4 days ago
For archival purposes, forwarding an incoming command email to
linux-kernel@vger.kernel.org, syzkaller-bugs@googlegroups.com.

***

Subject: [PATCH] The hung_task report shows multiple io_ring_exit_work
Author: kartikey406@gmail.com

#syz test: git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git master


This debug patch instruments io_sq_thread() to identify which segment
of the main loop is failing to release sqd->lock. It adds rate-limited
pr_warn() calls at five points:

  - around __io_sq_thread() to catch a wedge inside io_submit_sqes()
    under ctx->uring_lock
  - around io_sq_tw() to catch a task_work callback that doesn't return
  - on the full loop iteration to catch any other slow path
  - on the SHOULD_PARK-set-but-still-looping condition, in case the
    park flag is being missed
  - at io_sqd_handle_event() entry to confirm which sqds reach the
    park handler

Thresholds are 10s (per-call) and 30s (per-iteration), well above any
legitimate SQPOLL work, so a healthy ring will produce no output.

Not-Signed-off-by: Deepanshu Kartikey <kartikey406@gmail.com>
---
 io_uring/sqpoll.c | 28 ++++++++++++++++++++++++----
 1 file changed, 24 insertions(+), 4 deletions(-)

diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c
index 46c12afec73e..26f88b485cfd 100644
--- a/io_uring/sqpoll.c
+++ b/io_uring/sqpoll.c
@@ -246,7 +246,8 @@ static bool io_sqd_handle_event(struct io_sq_data *sqd)
 {
 	bool did_sig = false;
 	struct ksignal ksig;
-
+	pr_warn_ratelimited("sqpoll-dbg: handle_event entered sqd=%p park_pending=%d state=0x%lx\n",
+			    sqd, atomic_read(&sqd->park_pending), sqd->state);
 	if (test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state) ||
 	    signal_pending(current)) {
 		mutex_unlock(&sqd->lock);
@@ -333,6 +334,8 @@ static int io_sq_thread(void *data)
 	while (1) {
 		bool cap_entries, sqt_spin = false;
 		struct io_sq_time ist = { };
+		unsigned long iter_start = jiffies;
+		unsigned long t0;
 
 		if (io_sqd_events_pending(sqd) || signal_pending(current)) {
 			if (io_sqd_handle_event(sqd))
@@ -342,14 +345,22 @@ static int io_sq_thread(void *data)
 
 		cap_entries = !list_is_singular(&sqd->ctx_list);
 		list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
-			int ret = __io_sq_thread(ctx, sqd, cap_entries, &ist);
-
+			int ret;
+			t0 = jiffies;
+			ret = __io_sq_thread(ctx, sqd, cap_entries, &ist);
+			if (time_after(jiffies, t0 + 10 * HZ))
+				pr_warn("sqpoll-dbg: __io_sq_thread stuck sqd=%p ctx=%p ret=%d for %ums\n",
+					sqd, ctx, ret,
+					jiffies_to_msecs(jiffies - t0));
 			if (!sqt_spin && (ret > 0 || !list_empty(&ctx->iopoll_list)))
 				sqt_spin = true;
 		}
+		t0 = jiffies;
 		if (io_sq_tw(&retry_list, IORING_TW_CAP_ENTRIES_VALUE))
 			sqt_spin = true;
-
+		if (time_after(jiffies, t0 + 10 * HZ))
+			pr_warn("sqpoll-dbg: io_sq_tw stuck sqd=%p for %ums\n",
+				sqd, jiffies_to_msecs(jiffies - t0));
 		list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
 			if (io_napi(ctx)) {
 				io_sq_start_worktime(&ist);
@@ -358,6 +369,15 @@ static int io_sq_thread(void *data)
 		}
 
 		io_sq_update_worktime(sqd, &ist);
+		if (time_after(jiffies, iter_start + 30 * HZ))
+			pr_warn("sqpoll-dbg: loop iter took %ums sqd=%p park_pending=%d should_park=%d sqt_spin=%d\n",
+				jiffies_to_msecs(jiffies - iter_start), sqd,
+				atomic_read(&sqd->park_pending),
+				test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state),
+				sqt_spin);
+		if (test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state))
+			pr_warn_ratelimited("sqpoll-dbg: SHOULD_PARK set but looping sqd=%p sqt_spin=%d\n",
+						sqd, sqt_spin);
 
 		if (sqt_spin || !time_after(jiffies, timeout)) {
 			if (sqt_spin)
-- 
2.43.0