We observed the following hung task during our test:
[ 3987.095999] INFO: task "kworker/u32:7":239 blocked for more than 188 seconds.
[ 3987.096017] task:kworker/u32:7 state:D stack:0 pid:239 tgid:239 ppid:2 flags:0x00000408
[ 3987.096042] Workqueue: writeback wb_workfn (flush-254:59)
[ 3987.096069] Call trace:
[ 3987.096073] __switch_to+0x1a0/0x318
[ 3987.096089] __schedule+0xa38/0xf9c
[ 3987.096104] schedule+0x74/0x10c
[ 3987.096118] __bio_queue_enter+0xb8/0x178
[ 3987.096132] blk_mq_submit_bio+0x104/0x728
[ 3987.096145] __submit_bio+0xa0/0x23c
[ 3987.096159] submit_bio_noacct_nocheck+0x164/0x330
[ 3987.096173] submit_bio_noacct+0x348/0x468
[ 3987.096186] submit_bio+0x17c/0x198
[ 3987.096199] f2fs_submit_write_bio+0x44/0xe8
[ 3987.096211] __submit_merged_bio+0x40/0x11c
[ 3987.096222] __submit_merged_write_cond+0xcc/0x1f8
[ 3987.096233] f2fs_write_data_pages+0xbb8/0xd0c
[ 3987.096246] do_writepages+0xe0/0x2f4
[ 3987.096255] __writeback_single_inode+0x44/0x4ac
[ 3987.096272] writeback_sb_inodes+0x30c/0x538
[ 3987.096289] __writeback_inodes_wb+0x9c/0xec
[ 3987.096305] wb_writeback+0x158/0x440
[ 3987.096321] wb_workfn+0x388/0x5d4
[ 3987.096335] process_scheduled_works+0x1c4/0x45c
[ 3987.096346] worker_thread+0x32c/0x3e8
[ 3987.096356] kthread+0x11c/0x1b0
[ 3987.096372] ret_from_fork+0x10/0x20
T1:                                          T2:
blk_queue_enter
  blk_pm_resume_queue
    pm_request_resume
      __pm_runtime_resume(dev, RPM_ASYNC)
        rpm_resume                           __pm_runtime_disable
          dev->power.request_pending = true    dev->power.disable_depth++
          queue_work(pm_wq, &dev->power.work)  __pm_runtime_barrier
  wait_event                                     cancel_work_sync(&dev->power.work)
T1 queues the work item, which is then cancelled by T2 before it starts
execution. As a result, q->dev cannot be resumed, and T1 waits here for
a long time.
Signed-off-by: Yang Yang <yang.yang@vivo.com>
---
 drivers/base/power/runtime.c | 3 ++-
 include/linux/pm.h           | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)
diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c
index 1b11a3cd4acc..fc9bf3fb3bb7 100644
--- a/drivers/base/power/runtime.c
+++ b/drivers/base/power/runtime.c
@@ -1533,7 +1533,8 @@ void __pm_runtime_disable(struct device *dev, bool check_resume)
	 * means there probably is some I/O to process and disabling runtime PM
	 * shouldn't prevent the device from processing the I/O.
	 */
-	if (check_resume && dev->power.request_pending &&
+	if ((check_resume || dev->power.force_check_resume) &&
+	    dev->power.request_pending &&
	    dev->power.request == RPM_REQ_RESUME) {
		/*
		 * Prevent suspends and idle notifications from being carried
diff --git a/include/linux/pm.h b/include/linux/pm.h
index cc7b2dc28574..4eb20569cdbc 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -708,6 +708,7 @@ struct dev_pm_info {
	bool			use_autosuspend:1;
	bool			timer_autosuspends:1;
	bool			memalloc_noio:1;
+	bool			force_check_resume:1;
	unsigned int		links_count;
	enum rpm_request	request;
	enum rpm_status		runtime_status;
--
2.34.1
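To make the race above easier to follow, here is a condensed paraphrase of the two interacting paths in drivers/base/power/runtime.c. The function and field names are the real ones, but the bodies are abbreviated sketches rather than verbatim kernel code:

/* T1: rpm_resume(dev, RPM_ASYNC) only queues the resume and returns. */
	if (rpmflags & RPM_ASYNC) {
		dev->power.request = RPM_REQ_RESUME;
		if (!dev->power.request_pending) {
			dev->power.request_pending = true;
			queue_work(pm_wq, &dev->power.work);
		}
		retval = 0;
		goto out;
	}

/* T2: __pm_runtime_barrier() cancels that work item before pm_wq runs it. */
	if (dev->power.request_pending) {
		dev->power.request = RPM_REQ_NONE;
		spin_unlock_irq(&dev->power.lock);

		cancel_work_sync(&dev->power.work);

		spin_lock_irq(&dev->power.lock);
		dev->power.request_pending = false;
	}

Once T2 has disabled runtime PM, subsequent resume requests fail because dev->power.disable_depth is nonzero, so q->dev stays suspended and nothing wakes the wait_event() that T1 is blocked in.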
On Wed, Nov 26, 2025 at 11:17 AM Yang Yang <yang.yang@vivo.com> wrote:
>
> We observed the following hung task during our test:
>
> [ 3987.095999] INFO: task "kworker/u32:7":239 blocked for more than 188 seconds.
> [ 3987.096017] task:kworker/u32:7 state:D stack:0 pid:239 tgid:239 ppid:2 flags:0x00000408
> [ 3987.096042] Workqueue: writeback wb_workfn (flush-254:59)
> [ 3987.096069] Call trace:
> [ 3987.096073] __switch_to+0x1a0/0x318
> [ 3987.096089] __schedule+0xa38/0xf9c
> [ 3987.096104] schedule+0x74/0x10c
> [ 3987.096118] __bio_queue_enter+0xb8/0x178
> [ 3987.096132] blk_mq_submit_bio+0x104/0x728
> [ 3987.096145] __submit_bio+0xa0/0x23c
> [ 3987.096159] submit_bio_noacct_nocheck+0x164/0x330
> [ 3987.096173] submit_bio_noacct+0x348/0x468
> [ 3987.096186] submit_bio+0x17c/0x198
> [ 3987.096199] f2fs_submit_write_bio+0x44/0xe8
> [ 3987.096211] __submit_merged_bio+0x40/0x11c
> [ 3987.096222] __submit_merged_write_cond+0xcc/0x1f8
> [ 3987.096233] f2fs_write_data_pages+0xbb8/0xd0c
> [ 3987.096246] do_writepages+0xe0/0x2f4
> [ 3987.096255] __writeback_single_inode+0x44/0x4ac
> [ 3987.096272] writeback_sb_inodes+0x30c/0x538
> [ 3987.096289] __writeback_inodes_wb+0x9c/0xec
> [ 3987.096305] wb_writeback+0x158/0x440
> [ 3987.096321] wb_workfn+0x388/0x5d4
> [ 3987.096335] process_scheduled_works+0x1c4/0x45c
> [ 3987.096346] worker_thread+0x32c/0x3e8
> [ 3987.096356] kthread+0x11c/0x1b0
> [ 3987.096372] ret_from_fork+0x10/0x20
>
> T1:                                          T2:
> blk_queue_enter
>   blk_pm_resume_queue
>     pm_request_resume

Shouldn't this be pm_runtime_resume() rather?

>       __pm_runtime_resume(dev, RPM_ASYNC)
>         rpm_resume                           __pm_runtime_disable
>           dev->power.request_pending = true    dev->power.disable_depth++
>           queue_work(pm_wq, &dev->power.work)  __pm_runtime_barrier
>   wait_event                                     cancel_work_sync(&dev->power.work)
>
> T1 queues the work item, which is then cancelled by T2 before it starts
> execution. As a result, q->dev cannot be resumed, and T1 waits here for
> a long time.
>
> Signed-off-by: Yang Yang <yang.yang@vivo.com>
> ---
> drivers/base/power/runtime.c | 3 ++-
> include/linux/pm.h | 1 +
> 2 files changed, 3 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c
> index 1b11a3cd4acc..fc9bf3fb3bb7 100644
> --- a/drivers/base/power/runtime.c
> +++ b/drivers/base/power/runtime.c
> @@ -1533,7 +1533,8 @@ void __pm_runtime_disable(struct device *dev, bool check_resume)
> * means there probably is some I/O to process and disabling runtime PM
> * shouldn't prevent the device from processing the I/O.
> */
> - if (check_resume && dev->power.request_pending &&
> + if ((check_resume || dev->power.force_check_resume) &&
> + dev->power.request_pending &&
> dev->power.request == RPM_REQ_RESUME) {
> /*
> * Prevent suspends and idle notifications from being carried

There are only two cases in which false is passed to
__pm_runtime_disable(): one is in device_suspend_late(), and I don't
think that's relevant here; the other is in pm_runtime_remove(), which
gets called when the device is going away.
So apparently, blk_pm_resume_queue() races with the device going away.
Is this even expected to happen?
If so, wouldn't it be better to modify pm_runtime_remove() to pass
true to __pm_runtime_disable() instead of making these ad hoc changes?
> diff --git a/include/linux/pm.h b/include/linux/pm.h
> index cc7b2dc28574..4eb20569cdbc 100644
> --- a/include/linux/pm.h
> +++ b/include/linux/pm.h
> @@ -708,6 +708,7 @@ struct dev_pm_info {
> bool use_autosuspend:1;
> bool timer_autosuspends:1;
> bool memalloc_noio:1;
> + bool force_check_resume:1;
> unsigned int links_count;
> enum rpm_request request;
> enum rpm_status runtime_status;
> --
On 11/26/25 3:30 AM, Rafael J. Wysocki wrote:
> On Wed, Nov 26, 2025 at 11:17 AM Yang Yang <yang.yang@vivo.com> wrote:
>> T1:                                          T2:
>> blk_queue_enter
>>   blk_pm_resume_queue
>>     pm_request_resume
>
> Shouldn't this be pm_runtime_resume() rather?

I tried to make that change on an Android device. As a result, the
kernel complaint shown below appeared. My understanding is that sleeping
in atomic context can trigger a deadlock and hence is not allowed.

[   13.728890][    T1] WARNING: CPU: 6 PID: 1 at kernel/sched/core.c:9714 __might_sleep+0x78/0x84
[   13.758800][    T1] Call trace:
[   13.759027][    T1]  __might_sleep+0x78/0x84
[   13.759340][    T1]  __pm_runtime_resume+0x40/0xb8
[   13.759781][    T1]  __bio_queue_enter+0xc0/0x1cc
[   13.760153][    T1]  blk_mq_submit_bio+0x884/0xadc
[   13.760548][    T1]  __submit_bio+0x2c8/0x49c
[   13.760879][    T1]  __submit_bio_noacct_mq+0x38/0x88
[   13.761242][    T1]  submit_bio_noacct_nocheck+0x4fc/0x7b8
[   13.761631][    T1]  submit_bio+0x214/0x4c0
[   13.761941][    T1]  mpage_readahead+0x1b8/0x1fc
[   13.762284][    T1]  blkdev_readahead+0x18/0x28
[   13.762660][    T1]  page_cache_ra_unbounded+0x310/0x4d8
[   13.763072][    T1]  page_cache_ra_order+0xc0/0x5b0
[   13.763434][    T1]  page_cache_sync_ra+0x17c/0x268
[   13.763782][    T1]  filemap_read+0x4c4/0x12f4
[   13.764125][    T1]  blkdev_read_iter+0x100/0x164
[   13.764475][    T1]  vfs_read+0x188/0x348
[   13.764789][    T1]  __se_sys_pread64+0x84/0xc8
[   13.765180][    T1]  __arm64_sys_pread64+0x1c/0x2c
[   13.765556][    T1]  invoke_syscall+0x58/0xf0
[   13.765876][    T1]  do_el0_svc+0x8c/0xe0
[   13.766172][    T1]  el0_svc+0x50/0xd4
[   13.766583][    T1]  el0t_64_sync_handler+0x20/0xf4
[   13.766932][    T1]  el0t_64_sync+0x1bc/0x1c0
[   13.767294][    T1] irq event stamp: 2589614
[   13.767592][    T1] hardirqs last enabled at (2589613): [<ffffffc0800eaf24>] finish_lock_switch+0x70/0x108
[   13.768283][    T1] hardirqs last disabled at (2589614): [<ffffffc0814b66f4>] el1_dbg+0x24/0x80
[   13.768875][    T1] softirqs last enabled at (2589370): [<ffffffc080082a7c>] ____do_softirq+0x10/0x20
[   13.769529][    T1] softirqs last disabled at (2589349): [<ffffffc080082a7c>] ____do_softirq+0x10/0x20

I think that the filemap_invalidate_lock_shared() call in
page_cache_ra_unbounded() forbids sleeping in submit_bio().

Thanks,

Bart.
On Wed, Nov 26, 2025 at 7:06 PM Bart Van Assche <bvanassche@acm.org> wrote:
>
> On 11/26/25 3:30 AM, Rafael J. Wysocki wrote:
> > On Wed, Nov 26, 2025 at 11:17 AM Yang Yang <yang.yang@vivo.com> wrote:
> >> T1:                                          T2:
> >> blk_queue_enter
> >>   blk_pm_resume_queue
> >>     pm_request_resume
> >
> > Shouldn't this be pm_runtime_resume() rather?
>
> I tried to make that change on an Android device. As a result, the
> kernel complaint shown below appeared. My understanding is that sleeping
> in atomic context can trigger a deadlock and hence is not allowed.
>
> [   13.728890][    T1] WARNING: CPU: 6 PID: 1 at kernel/sched/core.c:9714 __might_sleep+0x78/0x84
> [   13.758800][    T1] Call trace:
> [   13.759027][    T1]  __might_sleep+0x78/0x84
> [   13.759340][    T1]  __pm_runtime_resume+0x40/0xb8
> [   13.759781][    T1]  __bio_queue_enter+0xc0/0x1cc
> [   13.760153][    T1]  blk_mq_submit_bio+0x884/0xadc
> [   13.760548][    T1]  __submit_bio+0x2c8/0x49c
> [   13.760879][    T1]  __submit_bio_noacct_mq+0x38/0x88
> [   13.761242][    T1]  submit_bio_noacct_nocheck+0x4fc/0x7b8
> [   13.761631][    T1]  submit_bio+0x214/0x4c0
> [   13.761941][    T1]  mpage_readahead+0x1b8/0x1fc
> [   13.762284][    T1]  blkdev_readahead+0x18/0x28
> [   13.762660][    T1]  page_cache_ra_unbounded+0x310/0x4d8
> [   13.763072][    T1]  page_cache_ra_order+0xc0/0x5b0
> [   13.763434][    T1]  page_cache_sync_ra+0x17c/0x268
> [   13.763782][    T1]  filemap_read+0x4c4/0x12f4
> [   13.764125][    T1]  blkdev_read_iter+0x100/0x164
> [   13.764475][    T1]  vfs_read+0x188/0x348
> [   13.764789][    T1]  __se_sys_pread64+0x84/0xc8
> [   13.765180][    T1]  __arm64_sys_pread64+0x1c/0x2c
> [   13.765556][    T1]  invoke_syscall+0x58/0xf0
> [   13.765876][    T1]  do_el0_svc+0x8c/0xe0
> [   13.766172][    T1]  el0_svc+0x50/0xd4
> [   13.766583][    T1]  el0t_64_sync_handler+0x20/0xf4
> [   13.766932][    T1]  el0t_64_sync+0x1bc/0x1c0
> [   13.767294][    T1] irq event stamp: 2589614
> [   13.767592][    T1] hardirqs last enabled at (2589613): [<ffffffc0800eaf24>] finish_lock_switch+0x70/0x108
> [   13.768283][    T1] hardirqs last disabled at (2589614): [<ffffffc0814b66f4>] el1_dbg+0x24/0x80
> [   13.768875][    T1] softirqs last enabled at (2589370): [<ffffffc080082a7c>] ____do_softirq+0x10/0x20
> [   13.769529][    T1] softirqs last disabled at (2589349): [<ffffffc080082a7c>] ____do_softirq+0x10/0x20
>
> I think that the filemap_invalidate_lock_shared() call in
> page_cache_ra_unbounded() forbids sleeping in submit_bio().

The wait_event() macro in __bio_queue_enter() calls might_sleep() at
the very beginning, so why would it not complain?

IIUC, this is the WARN_ONCE() in __might_sleep() about the task state
being different from TASK_RUNNING, which triggers because
prepare_to_wait_event() changes the task state to TASK_UNINTERRUPTIBLE.

This means that calling pm_runtime_resume() cannot be part of the
wait_event() condition, so blk_pm_resume_queue() and the wait_event()
macros involving it would need some rewriting.
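To see why the warning fires where it does, it helps to look at what wait_event() boils down to. The sketch below is a heavy simplification (the mainline macros in include/linux/wait.h go through ___wait_event() and prepare_to_wait_event()), but it shows that the condition is re-evaluated after the task state has already been changed:

	/* Roughly what wait_event(wq_head, condition) does, simplified. */
	DEFINE_WAIT(__wait);

	might_sleep();			/* task state is still TASK_RUNNING here */
	while (!(condition)) {
		prepare_to_wait(&wq_head, &__wait, TASK_UNINTERRUPTIBLE);
		/*
		 * The condition is re-checked here with the task state already
		 * set to TASK_UNINTERRUPTIBLE; any sleeping call inside it,
		 * such as pm_runtime_resume(), trips __might_sleep() now.
		 */
		if (condition)
			break;
		schedule();
	}
	finish_wait(&wq_head, &__wait);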
On Wed, Nov 26, 2025 at 8:16 PM Rafael J. Wysocki <rafael@kernel.org> wrote:
>
> On Wed, Nov 26, 2025 at 7:06 PM Bart Van Assche <bvanassche@acm.org> wrote:
> >
> > On 11/26/25 3:30 AM, Rafael J. Wysocki wrote:
> > > On Wed, Nov 26, 2025 at 11:17 AM Yang Yang <yang.yang@vivo.com> wrote:
> > >> T1:                                          T2:
> > >> blk_queue_enter
> > >>   blk_pm_resume_queue
> > >>     pm_request_resume
> > >
> > > Shouldn't this be pm_runtime_resume() rather?
> >
> > I tried to make that change on an Android device. As a result, the
> > kernel complaint shown below appeared. My understanding is that sleeping
> > in atomic context can trigger a deadlock and hence is not allowed.
> >
> > [   13.728890][    T1] WARNING: CPU: 6 PID: 1 at kernel/sched/core.c:9714 __might_sleep+0x78/0x84
> > [   13.758800][    T1] Call trace:
> > [   13.759027][    T1]  __might_sleep+0x78/0x84
> > [   13.759340][    T1]  __pm_runtime_resume+0x40/0xb8
> > [   13.759781][    T1]  __bio_queue_enter+0xc0/0x1cc
> > [   13.760153][    T1]  blk_mq_submit_bio+0x884/0xadc
> > [   13.760548][    T1]  __submit_bio+0x2c8/0x49c
> > [   13.760879][    T1]  __submit_bio_noacct_mq+0x38/0x88
> > [   13.761242][    T1]  submit_bio_noacct_nocheck+0x4fc/0x7b8
> > [   13.761631][    T1]  submit_bio+0x214/0x4c0
> > [   13.761941][    T1]  mpage_readahead+0x1b8/0x1fc
> > [   13.762284][    T1]  blkdev_readahead+0x18/0x28
> > [   13.762660][    T1]  page_cache_ra_unbounded+0x310/0x4d8
> > [   13.763072][    T1]  page_cache_ra_order+0xc0/0x5b0
> > [   13.763434][    T1]  page_cache_sync_ra+0x17c/0x268
> > [   13.763782][    T1]  filemap_read+0x4c4/0x12f4
> > [   13.764125][    T1]  blkdev_read_iter+0x100/0x164
> > [   13.764475][    T1]  vfs_read+0x188/0x348
> > [   13.764789][    T1]  __se_sys_pread64+0x84/0xc8
> > [   13.765180][    T1]  __arm64_sys_pread64+0x1c/0x2c
> > [   13.765556][    T1]  invoke_syscall+0x58/0xf0
> > [   13.765876][    T1]  do_el0_svc+0x8c/0xe0
> > [   13.766172][    T1]  el0_svc+0x50/0xd4
> > [   13.766583][    T1]  el0t_64_sync_handler+0x20/0xf4
> > [   13.766932][    T1]  el0t_64_sync+0x1bc/0x1c0
> > [   13.767294][    T1] irq event stamp: 2589614
> > [   13.767592][    T1] hardirqs last enabled at (2589613): [<ffffffc0800eaf24>] finish_lock_switch+0x70/0x108
> > [   13.768283][    T1] hardirqs last disabled at (2589614): [<ffffffc0814b66f4>] el1_dbg+0x24/0x80
> > [   13.768875][    T1] softirqs last enabled at (2589370): [<ffffffc080082a7c>] ____do_softirq+0x10/0x20
> > [   13.769529][    T1] softirqs last disabled at (2589349): [<ffffffc080082a7c>] ____do_softirq+0x10/0x20
> >
> > I think that the filemap_invalidate_lock_shared() call in
> > page_cache_ra_unbounded() forbids sleeping in submit_bio().
>
> The wait_event() macro in __bio_queue_enter() calls might_sleep() at
> the very beginning, so why would it not complain?
>
> IIUC, this is the WARN_ONCE() in __might_sleep() about the task state
> being different from TASK_RUNNING, which triggers because
> prepare_to_wait_event() changes the task state to TASK_UNINTERRUPTIBLE.
>
> This means that calling pm_runtime_resume() cannot be part of the
> wait_event() condition, so blk_pm_resume_queue() and the wait_event()
> macros involving it would need some rewriting.

Interestingly enough, the pm_request_resume() call in
blk_pm_resume_queue() is not even necessary in the __bio_queue_enter()
case, because pm is false there and it doesn't even check q->rpm_status.

So in fact, the resume is only necessary in blk_queue_enter() if pm is
nonzero.
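For context, the helper under discussion looks roughly like this before any of the changes proposed in this thread (paraphrased from block/blk-pm.h; treat the exact guard conditions as approximate):

static inline int blk_pm_resume_queue(const bool pm, struct request_queue *q)
{
	if (!q->dev || !blk_queue_pm_only(q))
		return 1;	/* Nothing to do */
	if (pm && q->rpm_status != RPM_SUSPENDED)
		return 1;	/* Request allowed */
	pm_request_resume(q->dev);	/* only queues an async resume */
	return 0;			/* caller keeps waiting */
}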
On 2025/11/26 19:30, Rafael J. Wysocki wrote:
> On Wed, Nov 26, 2025 at 11:17 AM Yang Yang <yang.yang@vivo.com> wrote:
>>
>> We observed the following hung task during our test:
>>
>> [ 3987.095999] INFO: task "kworker/u32:7":239 blocked for more than 188 seconds.
>> [ 3987.096017] task:kworker/u32:7 state:D stack:0 pid:239 tgid:239 ppid:2 flags:0x00000408
>> [ 3987.096042] Workqueue: writeback wb_workfn (flush-254:59)
>> [ 3987.096069] Call trace:
>> [ 3987.096073] __switch_to+0x1a0/0x318
>> [ 3987.096089] __schedule+0xa38/0xf9c
>> [ 3987.096104] schedule+0x74/0x10c
>> [ 3987.096118] __bio_queue_enter+0xb8/0x178
>> [ 3987.096132] blk_mq_submit_bio+0x104/0x728
>> [ 3987.096145] __submit_bio+0xa0/0x23c
>> [ 3987.096159] submit_bio_noacct_nocheck+0x164/0x330
>> [ 3987.096173] submit_bio_noacct+0x348/0x468
>> [ 3987.096186] submit_bio+0x17c/0x198
>> [ 3987.096199] f2fs_submit_write_bio+0x44/0xe8
>> [ 3987.096211] __submit_merged_bio+0x40/0x11c
>> [ 3987.096222] __submit_merged_write_cond+0xcc/0x1f8
>> [ 3987.096233] f2fs_write_data_pages+0xbb8/0xd0c
>> [ 3987.096246] do_writepages+0xe0/0x2f4
>> [ 3987.096255] __writeback_single_inode+0x44/0x4ac
>> [ 3987.096272] writeback_sb_inodes+0x30c/0x538
>> [ 3987.096289] __writeback_inodes_wb+0x9c/0xec
>> [ 3987.096305] wb_writeback+0x158/0x440
>> [ 3987.096321] wb_workfn+0x388/0x5d4
>> [ 3987.096335] process_scheduled_works+0x1c4/0x45c
>> [ 3987.096346] worker_thread+0x32c/0x3e8
>> [ 3987.096356] kthread+0x11c/0x1b0
>> [ 3987.096372] ret_from_fork+0x10/0x20
>>
>> T1:                                          T2:
>> blk_queue_enter
>>   blk_pm_resume_queue
>>     pm_request_resume
>
> Shouldn't this be pm_runtime_resume() rather?

I'm not sure about that; I'll check whether pm_runtime_resume() should
be used here instead.

>
>>       __pm_runtime_resume(dev, RPM_ASYNC)
>>         rpm_resume                           __pm_runtime_disable
>>           dev->power.request_pending = true    dev->power.disable_depth++
>>           queue_work(pm_wq, &dev->power.work)  __pm_runtime_barrier
>>   wait_event                                     cancel_work_sync(&dev->power.work)
>>
>> T1 queues the work item, which is then cancelled by T2 before it starts
>> execution. As a result, q->dev cannot be resumed, and T1 waits here for
>> a long time.
>>
>> Signed-off-by: Yang Yang <yang.yang@vivo.com>
>> ---
>> drivers/base/power/runtime.c | 3 ++-
>> include/linux/pm.h | 1 +
>> 2 files changed, 3 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c
>> index 1b11a3cd4acc..fc9bf3fb3bb7 100644
>> --- a/drivers/base/power/runtime.c
>> +++ b/drivers/base/power/runtime.c
>> @@ -1533,7 +1533,8 @@ void __pm_runtime_disable(struct device *dev, bool check_resume)
>> * means there probably is some I/O to process and disabling runtime PM
>> * shouldn't prevent the device from processing the I/O.
>> */
>> - if (check_resume && dev->power.request_pending &&
>> + if ((check_resume || dev->power.force_check_resume) &&
>> + dev->power.request_pending &&
>> dev->power.request == RPM_REQ_RESUME) {
>> /*
>> * Prevent suspends and idle notifications from being carried
>
> There are only two cases in which false is passed to
> __pm_runtime_disable(): one is in device_suspend_late(), and I don't
> think that's relevant here; the other is in pm_runtime_remove(), which
> gets called when the device is going away.
>
> So apparently, blk_pm_resume_queue() races with the device going away.
> Is this even expected to happen?
>
> If so, wouldn't it be better to modify pm_runtime_remove() to pass
> true to __pm_runtime_disable() instead of making these ad hoc changes?
>
Sorry, I didn't make it clear in my previous message.
I can confirm that __pm_runtime_disable() is called from
device_suspend_late(), and this issue occurs during system suspend.
>> diff --git a/include/linux/pm.h b/include/linux/pm.h
>> index cc7b2dc28574..4eb20569cdbc 100644
>> --- a/include/linux/pm.h
>> +++ b/include/linux/pm.h
>> @@ -708,6 +708,7 @@ struct dev_pm_info {
>> bool use_autosuspend:1;
>> bool timer_autosuspends:1;
>> bool memalloc_noio:1;
>> + bool force_check_resume:1;
>> unsigned int links_count;
>> enum rpm_request request;
>> enum rpm_status runtime_status;
>> --
On Wed, Nov 26, 2025 at 12:59 PM YangYang <yang.yang@vivo.com> wrote:
>
> On 2025/11/26 19:30, Rafael J. Wysocki wrote:
> > On Wed, Nov 26, 2025 at 11:17 AM Yang Yang <yang.yang@vivo.com> wrote:
> >>
> >> We observed the following hung task during our test:
> >>
> >> [ 3987.095999] INFO: task "kworker/u32:7":239 blocked for more than 188 seconds.
> >> [ 3987.096017] task:kworker/u32:7 state:D stack:0 pid:239 tgid:239 ppid:2 flags:0x00000408
> >> [ 3987.096042] Workqueue: writeback wb_workfn (flush-254:59)
> >> [ 3987.096069] Call trace:
> >> [ 3987.096073] __switch_to+0x1a0/0x318
> >> [ 3987.096089] __schedule+0xa38/0xf9c
> >> [ 3987.096104] schedule+0x74/0x10c
> >> [ 3987.096118] __bio_queue_enter+0xb8/0x178
> >> [ 3987.096132] blk_mq_submit_bio+0x104/0x728
> >> [ 3987.096145] __submit_bio+0xa0/0x23c
> >> [ 3987.096159] submit_bio_noacct_nocheck+0x164/0x330
> >> [ 3987.096173] submit_bio_noacct+0x348/0x468
> >> [ 3987.096186] submit_bio+0x17c/0x198
> >> [ 3987.096199] f2fs_submit_write_bio+0x44/0xe8
> >> [ 3987.096211] __submit_merged_bio+0x40/0x11c
> >> [ 3987.096222] __submit_merged_write_cond+0xcc/0x1f8
> >> [ 3987.096233] f2fs_write_data_pages+0xbb8/0xd0c
> >> [ 3987.096246] do_writepages+0xe0/0x2f4
> >> [ 3987.096255] __writeback_single_inode+0x44/0x4ac
> >> [ 3987.096272] writeback_sb_inodes+0x30c/0x538
> >> [ 3987.096289] __writeback_inodes_wb+0x9c/0xec
> >> [ 3987.096305] wb_writeback+0x158/0x440
> >> [ 3987.096321] wb_workfn+0x388/0x5d4
> >> [ 3987.096335] process_scheduled_works+0x1c4/0x45c
> >> [ 3987.096346] worker_thread+0x32c/0x3e8
> >> [ 3987.096356] kthread+0x11c/0x1b0
> >> [ 3987.096372] ret_from_fork+0x10/0x20
> >>
> >> T1:                                          T2:
> >> blk_queue_enter
> >>   blk_pm_resume_queue
> >>     pm_request_resume
> >
> > Shouldn't this be pm_runtime_resume() rather?
>
> I'm not sure about that; I'll check whether pm_runtime_resume() should
> be used here instead.
Well, the code as is now schedules an async resume of the device and
then waits for it to complete. It would be more straightforward to
resume the device synchronously IMV.
> >
> >>       __pm_runtime_resume(dev, RPM_ASYNC)
> >>         rpm_resume                           __pm_runtime_disable
> >>           dev->power.request_pending = true    dev->power.disable_depth++
> >>           queue_work(pm_wq, &dev->power.work)  __pm_runtime_barrier
> >>   wait_event                                     cancel_work_sync(&dev->power.work)
> >>
> >> T1 queues the work item, which is then cancelled by T2 before it starts
> >> execution. As a result, q->dev cannot be resumed, and T1 waits here for
> >> a long time.
> >>
> >> Signed-off-by: Yang Yang <yang.yang@vivo.com>
> >> ---
> >> drivers/base/power/runtime.c | 3 ++-
> >> include/linux/pm.h | 1 +
> >> 2 files changed, 3 insertions(+), 1 deletion(-)
> >>
> >> diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c
> >> index 1b11a3cd4acc..fc9bf3fb3bb7 100644
> >> --- a/drivers/base/power/runtime.c
> >> +++ b/drivers/base/power/runtime.c
> >> @@ -1533,7 +1533,8 @@ void __pm_runtime_disable(struct device *dev, bool check_resume)
> >> * means there probably is some I/O to process and disabling runtime PM
> >> * shouldn't prevent the device from processing the I/O.
> >> */
> >> - if (check_resume && dev->power.request_pending &&
> >> + if ((check_resume || dev->power.force_check_resume) &&
> >> + dev->power.request_pending &&
> >> dev->power.request == RPM_REQ_RESUME) {
> >> /*
> >> * Prevent suspends and idle notifications from being carried
> >
> > There are only two cases in which false is passed to
> > __pm_runtime_disable(): one is in device_suspend_late(), and I don't
> > think that's relevant here; the other is in pm_runtime_remove(), which
> > gets called when the device is going away.
> >
> > So apparently, blk_pm_resume_queue() races with the device going away.
> > Is this even expected to happen?
> >
> > If so, wouldn't it be better to modify pm_runtime_remove() to pass
> > true to __pm_runtime_disable() instead of making these ad hoc changes?
> >
>
> Sorry, I didn't make it clear in my previous message.
> I can confirm that __pm_runtime_disable() is called from
> device_suspend_late(), and this issue occurs during system suspend.
Interesting, because the runtime PM workqueue is frozen at this point,
so waiting for a work item in it to complete is pointless.

What the patch does is declare that the device can be runtime-resumed
in device_suspend_late(), but this is kind of a hack IMV, as it
potentially affects the device's parent etc.

If the device cannot stay in runtime suspend across the entire system
suspend transition, it should be resumed (synchronously) earlier, in
device_suspend() or even in device_prepare().
On 11/26/25 4:36 AM, Rafael J. Wysocki wrote:
> Well, the code as is now schedules an async resume of the device and
> then waits for it to complete. It would be more straightforward to
> resume the device synchronously IMV.

That would increase the depth of the call stack significantly. I'm not
sure that's safe in this context.

Thanks,

Bart.
On Wed, Nov 26, 2025 at 4:34 PM Bart Van Assche <bvanassche@acm.org> wrote:
>
> On 11/26/25 4:36 AM, Rafael J. Wysocki wrote:
> > Well, the code as is now schedules an async resume of the device and
> > then waits for it to complete. It would be more straightforward to
> > resume the device synchronously IMV.
>
> That would increase the depth of the call stack significantly. I'm not
> sure that's safe in this context.

As it stands, you have a basic problem with respect to system
suspend/hibernation. As I said before, the PM workqueue is frozen
during system suspend/hibernation transitions, so waiting for an async
resume request to complete then is pointless.
On 11/26/25 7:41 AM, Rafael J. Wysocki wrote:
> As it stands, you have a basic problem with respect to system
> suspend/hibernation. As I said before, the PM workqueue is frozen
> during system suspend/hibernation transitions, so waiting for an async
> resume request to complete then is pointless.

Agreed. I noticed that any attempt to call request_firmware() from
driver system resume callback functions causes a deadlock if these
calls happen before the block device has been resumed.

Thanks,

Bart.
On 2025/11/27 2:40, Bart Van Assche wrote:
> On 11/26/25 7:41 AM, Rafael J. Wysocki wrote:
>> As it stands, you have a basic problem with respect to system
>> suspend/hibernation. As I said before, the PM workqueue is frozen
>> during system suspend/hibernation transitions, so waiting for an async
>> resume request to complete then is pointless.
>
> Agreed. I noticed that any attempt to call request_firmware() from
> driver system resume callback functions causes a deadlock if these
> calls happen before the block device has been resumed.
>
> Thanks,
>
> Bart.
Does this patch look reasonable to you? It hasn't been fully tested
yet, but the resume is now performed synchronously.
diff --git a/block/blk-core.c b/block/blk-core.c
index 66fb2071d..041d29ba4 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -323,12 +323,15 @@ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags)
 		 * reordered.
 		 */
 		smp_rmb();
-		wait_event(q->mq_freeze_wq,
-			   (!q->mq_freeze_depth &&
-			    blk_pm_resume_queue(pm, q)) ||
-			   blk_queue_dying(q));
+check:
+		wait_event(q->mq_freeze_wq, !q->mq_freeze_depth);
+
 		if (blk_queue_dying(q))
 			return -ENODEV;
+		if (!blk_pm_resume_queue(pm, q)) {
+			pm_runtime_resume(q->dev);
+			goto check;
+		}
 	}

 	rwsem_acquire_read(&q->q_lockdep_map, 0, 0, _RET_IP_);
@@ -356,12 +359,15 @@ int __bio_queue_enter(struct request_queue *q, struct bio *bio)
 		 * reordered.
 		 */
 		smp_rmb();
-		wait_event(q->mq_freeze_wq,
-			   (!q->mq_freeze_depth &&
-			    blk_pm_resume_queue(false, q)) ||
-			   test_bit(GD_DEAD, &disk->state));
+check:
+		wait_event(q->mq_freeze_wq, !q->mq_freeze_depth);
+
 		if (test_bit(GD_DEAD, &disk->state))
 			goto dead;
+		if (!blk_pm_resume_queue(false, q)) {
+			pm_runtime_resume(q->dev);
+			goto check;
+		}
 	}

 	rwsem_acquire_read(&q->io_lockdep_map, 0, 0, _RET_IP_);
diff --git a/block/blk-pm.h b/block/blk-pm.h
index 8a5a0d4b3..c28fad105 100644
--- a/block/blk-pm.h
+++ b/block/blk-pm.h
@@ -12,7 +12,6 @@ static inline int blk_pm_resume_queue(const bool pm, struct request_queue *q)
 		return 1;	/* Nothing to do */
 	if (pm && q->rpm_status != RPM_SUSPENDED)
 		return 1;	/* Request allowed */
-	pm_request_resume(q->dev);
 	return 0;
 }
On 11/27/25 3:29 AM, YangYang wrote:
> diff --git a/block/blk-core.c b/block/blk-core.c
> index 66fb2071d..041d29ba4 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -323,12 +323,15 @@ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags)
>  		 * reordered.
>  		 */
>  		smp_rmb();
> -		wait_event(q->mq_freeze_wq,
> -			   (!q->mq_freeze_depth &&
> -			    blk_pm_resume_queue(pm, q)) ||
> -			   blk_queue_dying(q));
> +check:
> +		wait_event(q->mq_freeze_wq, !q->mq_freeze_depth);
> +
>  		if (blk_queue_dying(q))
>  			return -ENODEV;

This can't work. blk_mq_destroy_queue() freezes a request queue without
unfreezing it, so the above code will introduce a deadlock and/or a
use-after-free if it executes concurrently with blk_mq_destroy_queue().

Bart.
On Thu, Nov 27, 2025 at 12:29 PM YangYang <yang.yang@vivo.com> wrote:
>
> On 2025/11/27 2:40, Bart Van Assche wrote:
> > On 11/26/25 7:41 AM, Rafael J. Wysocki wrote:
> >> As it stands, you have a basic problem with respect to system
> >> suspend/hibernation. As I said before, the PM workqueue is frozen
> >> during system suspend/hibernation transitions, so waiting for an async
> >> resume request to complete then is pointless.
> >
> > Agreed. I noticed that any attempt to call request_firmware() from
> > driver system resume callback functions causes a deadlock if these
> > calls happen before the block device has been resumed.
> >
> > Thanks,
> >
> > Bart.
>
> Does this patch look reasonable to you? It hasn't been fully tested
> yet, but the resume is now performed synchronously.
>
> diff --git a/block/blk-core.c b/block/blk-core.c
> index 66fb2071d..041d29ba4 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -323,12 +323,15 @@ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags)
> * reordered.
> */
> smp_rmb();
> - wait_event(q->mq_freeze_wq,
> - (!q->mq_freeze_depth &&
> - blk_pm_resume_queue(pm, q)) ||
> - blk_queue_dying(q));
> +check:
> + wait_event(q->mq_freeze_wq, !q->mq_freeze_depth);
I think that you still need to check blk_queue_dying(q) under
wait_even() or you may not stop waiting when this happens.
> +
> if (blk_queue_dying(q))
> return -ENODEV;
> + if (!blk_pm_resume_queue(pm, q)) {
> + pm_runtime_resume(q->dev);
> + goto check;
> + }
> }
>
> rwsem_acquire_read(&q->q_lockdep_map, 0, 0, _RET_IP_);
> @@ -356,12 +359,15 @@ int __bio_queue_enter(struct request_queue *q, struct bio *bio)
> * reordered.
> */
> smp_rmb();
> - wait_event(q->mq_freeze_wq,
> - (!q->mq_freeze_depth &&
> - blk_pm_resume_queue(false, q)) ||
> - test_bit(GD_DEAD, &disk->state));
> +check:
> + wait_event(q->mq_freeze_wq, !q->mq_freeze_depth);
Analogously here, you may not stop waiting when test_bit(GD_DEAD,
&disk->state) is true.
> +
> if (test_bit(GD_DEAD, &disk->state))
> goto dead;
> + if (!blk_pm_resume_queue(false, q)) {
> + pm_runtime_resume(q->dev);
> + goto check;
> + }
> }
>
> rwsem_acquire_read(&q->io_lockdep_map, 0, 0, _RET_IP_);
> diff --git a/block/blk-pm.h b/block/blk-pm.h
> index 8a5a0d4b3..c28fad105 100644
> --- a/block/blk-pm.h
> +++ b/block/blk-pm.h
> @@ -12,7 +12,6 @@ static inline int blk_pm_resume_queue(const bool pm, struct request_queue *q)
> return 1; /* Nothing to do */
> if (pm && q->rpm_status != RPM_SUSPENDED)
> return 1; /* Request allowed */
> - pm_request_resume(q->dev);
> return 0;
> }
And I would rename blk_pm_resume_queue() to something like
blk_pm_queue_active() because it is a bit confusing as it stands.
Apart from the above remarks this makes sense to me FWIW.
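For illustration, the renamed helper could look like the sketch below. The name blk_pm_queue_active() comes from the suggestion above; the switch to a bool return type and the comments are additional assumptions, not something agreed in this thread:

/* Hypothetical shape of the helper after the suggested rename. */
static inline bool blk_pm_queue_active(const bool pm, struct request_queue *q)
{
	if (!q->dev || !blk_queue_pm_only(q))
		return true;	/* no runtime PM on this queue */
	if (pm && q->rpm_status != RPM_SUSPENDED)
		return true;	/* request allowed as-is */
	return false;		/* caller must resume q->dev and retry */
}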
On 2025/11/27 20:44, Rafael J. Wysocki wrote:
> On Thu, Nov 27, 2025 at 12:29 PM YangYang <yang.yang@vivo.com> wrote:
>>
>> On 2025/11/27 2:40, Bart Van Assche wrote:
>>> On 11/26/25 7:41 AM, Rafael J. Wysocki wrote:
>>>> As it stands, you have a basic problem with respect to system
>>>> suspend/hibernation. As I said before, the PM workqueue is frozen
>>>> during system suspend/hibernation transitions, so waiting for an async
>>>> resume request to complete then is pointless.
>>>
>>> Agreed. I noticed that any attempt to call request_firmware() from
>>> driver system resume callback functions causes a deadlock if these
>>> calls happen before the block device has been resumed.
>>>
>>> Thanks,
>>>
>>> Bart.
>>
>> Does this patch look reasonable to you? It hasn't been fully tested
>> yet, but the resume is now performed synchronously.
>>
>> diff --git a/block/blk-core.c b/block/blk-core.c
>> index 66fb2071d..041d29ba4 100644
>> --- a/block/blk-core.c
>> +++ b/block/blk-core.c
>> @@ -323,12 +323,15 @@ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags)
>> * reordered.
>> */
>> smp_rmb();
>> - wait_event(q->mq_freeze_wq,
>> - (!q->mq_freeze_depth &&
>> - blk_pm_resume_queue(pm, q)) ||
>> - blk_queue_dying(q));
>> +check:
>> + wait_event(q->mq_freeze_wq, !q->mq_freeze_depth);
>
> I think that you still need to check blk_queue_dying(q) under
> wait_even() or you may not stop waiting when this happens.
>
Got it.
>> +
>> if (blk_queue_dying(q))
>> return -ENODEV;
>> + if (!blk_pm_resume_queue(pm, q)) {
>> + pm_runtime_resume(q->dev);
>> + goto check;
>> + }
>> }
>>
>> rwsem_acquire_read(&q->q_lockdep_map, 0, 0, _RET_IP_);
>> @@ -356,12 +359,15 @@ int __bio_queue_enter(struct request_queue *q, struct bio *bio)
>> * reordered.
>> */
>> smp_rmb();
>> - wait_event(q->mq_freeze_wq,
>> - (!q->mq_freeze_depth &&
>> - blk_pm_resume_queue(false, q)) ||
>> - test_bit(GD_DEAD, &disk->state));
>> +check:
>> + wait_event(q->mq_freeze_wq, !q->mq_freeze_depth);
>
> Analogously here, you may not stop waiting when test_bit(GD_DEAD,
> &disk->state) is true.
>
Got it.
>> +
>> if (test_bit(GD_DEAD, &disk->state))
>> goto dead;
>> + if (!blk_pm_resume_queue(false, q)) {
>> + pm_runtime_resume(q->dev);
>> + goto check;
>> + }
>> }
>>
>> rwsem_acquire_read(&q->io_lockdep_map, 0, 0, _RET_IP_);
>> diff --git a/block/blk-pm.h b/block/blk-pm.h
>> index 8a5a0d4b3..c28fad105 100644
>> --- a/block/blk-pm.h
>> +++ b/block/blk-pm.h
>> @@ -12,7 +12,6 @@ static inline int blk_pm_resume_queue(const bool pm, struct request_queue *q)
>> return 1; /* Nothing to do */
>> if (pm && q->rpm_status != RPM_SUSPENDED)
>> return 1; /* Request allowed */
>> - pm_request_resume(q->dev);
>> return 0;
>> }
>
> And I would rename blk_pm_resume_queue() to something like
> blk_pm_queue_active() because it is a bit confusing as it stands.
>
> Apart from the above remarks this makes sense to me FWIW.
Got it. I'll fix these in the next version and run some tests before
sending it out. Thanks for the review.
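Folding the review feedback together, the reworked __bio_queue_enter() loop in the next version might look roughly like this (an untested sketch that moves the GD_DEAD check back into the wait condition as requested; blk_pm_queue_active() is the hypothetical renamed helper from above):

check:
	wait_event(q->mq_freeze_wq,
		   !q->mq_freeze_depth || test_bit(GD_DEAD, &disk->state));

	if (test_bit(GD_DEAD, &disk->state))
		goto dead;

	/* Resume synchronously, then re-check the freeze/dead state. */
	if (!blk_pm_queue_active(false, q)) {
		pm_runtime_resume(q->dev);
		goto check;
	}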