We sometimes get into a situation where the GPU hangcheck handler fails
to recover the GPU:
[..]
msm_dpu ae01000.display-controller: [drm:hangcheck_handler] *ERROR* (IPv4: 1): hangcheck detected gpu lockup rb 0!
msm_dpu ae01000.display-controller: [drm:hangcheck_handler] *ERROR* (IPv4: 1): completed fence: 7840161
msm_dpu ae01000.display-controller: [drm:hangcheck_handler] *ERROR* (IPv4: 1): submitted fence: 7840162
msm_dpu ae01000.display-controller: [drm:hangcheck_handler] *ERROR* (IPv4: 1): hangcheck detected gpu lockup rb 0!
msm_dpu ae01000.display-controller: [drm:hangcheck_handler] *ERROR* (IPv4: 1): completed fence: 7840162
msm_dpu ae01000.display-controller: [drm:hangcheck_handler] *ERROR* (IPv4: 1): submitted fence: 7840163
[..]
The problem is that the msm_job worker is blocked on gpu->lock:
INFO: task ring0:155 blocked for more than 122 seconds.
Not tainted 6.6.99-08727-gaac38b365d2c #1
task:ring0 state:D stack:0 pid:155 ppid:2 flags:0x00000008
Call trace:
__switch_to+0x108/0x208
schedule+0x544/0x11f0
schedule_preempt_disabled+0x30/0x50
__mutex_lock_common+0x410/0x850
__mutex_lock_slowpath+0x28/0x40
mutex_lock+0x5c/0x90
msm_job_run+0x9c/0x140
drm_sched_main+0x514/0x938
kthread+0x114/0x138
ret_from_fork+0x10/0x20
The lock is owned by the recover worker, which is waiting for DMA fences
from the memory reclaim path while holding the very same gpu->lock:
INFO: task ring0:155 is blocked on a mutex likely owned by task gpu-worker:154.
task:gpu-worker state:D stack:0 pid:154 ppid:2 flags:0x00000008
Call trace:
__switch_to+0x108/0x208
schedule+0x544/0x11f0
schedule_timeout+0x1f8/0x770
dma_fence_default_wait+0x108/0x218
dma_fence_wait_timeout+0x6c/0x1c0
dma_resv_wait_timeout+0xe4/0x118
active_purge+0x34/0x98
drm_gem_lru_scan+0x1d0/0x388
msm_gem_shrinker_scan+0x1cc/0x2e8
shrink_slab+0x228/0x478
shrink_node+0x380/0x730
try_to_free_pages+0x204/0x510
__alloc_pages_direct_reclaim+0x90/0x158
__alloc_pages_slowpath+0x1d4/0x4a0
__alloc_pages+0x9f0/0xc88
vm_area_alloc_pages+0x17c/0x260
__vmalloc_node_range+0x1c0/0x420
kvmalloc_node+0xe8/0x108
msm_gpu_crashstate_capture+0x1e4/0x280
recover_worker+0x1c0/0x638
kthread_worker_fn+0x150/0x2d8
kthread+0x114/0x138
So neither task can make any further progress.
Forbid the recover/fault workers from entering memory reclaim (while
holding gpu->lock) to address this deadlock scenario.
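For illustration, here is a minimal sketch of the pattern the patch
applies (the helper and its arguments are made up, this is not the
actual driver code): memalloc_noreclaim_save() sets PF_MEMALLOC for the
current task, so GFP_KERNEL allocations made inside the bracketed
section skip direct reclaim and therefore can never re-enter the GEM
shrinker while gpu->lock is held; memalloc_noreclaim_restore() then
drops the flag again.

  #include <linux/mm.h>
  #include <linux/mutex.h>
  #include <linux/sched/mm.h>
  #include <linux/slab.h>

  /* Hypothetical helper, for illustration only. */
  static void capture_state_under_lock(struct mutex *gpu_lock, size_t len)
  {
          unsigned int noreclaim_flag;
          void *buf;

          mutex_lock(gpu_lock);

          /* Allocations below must not recurse into the shrinker. */
          noreclaim_flag = memalloc_noreclaim_save();

          /* Does not enter direct reclaim; may fail instead. */
          buf = kvzalloc(len, GFP_KERNEL);
          if (buf) {
                  /* ... snapshot GPU state into buf ... */
                  kvfree(buf);
          }

          memalloc_noreclaim_restore(noreclaim_flag);
          mutex_unlock(gpu_lock);
  }

Note that PF_MEMALLOC also allows such allocations to dip into memory
reserves, so a large crash-state allocation may still fail and has to be
handled gracefully.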
Cc: Tomasz Figa <tfiga@chromium.org>
Signed-off-by: Sergey Senozhatsky <senozhatsky@chromium.org>
---
drivers/gpu/drm/msm/msm_gpu.c | 11 +++++++++++
1 file changed, 11 insertions(+)
diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c
index 995549d0bbbc..ddcd9e1c217a 100644
--- a/drivers/gpu/drm/msm/msm_gpu.c
+++ b/drivers/gpu/drm/msm/msm_gpu.c
@@ -17,6 +17,7 @@
#include <linux/string_helpers.h>
#include <linux/devcoredump.h>
#include <linux/sched/task.h>
+#include <linux/sched/mm.h>
/*
* Power Management:
@@ -469,6 +470,7 @@ static void recover_worker(struct kthread_work *work)
struct msm_gem_submit *submit;
struct msm_ringbuffer *cur_ring = gpu->funcs->active_ring(gpu);
char *comm = NULL, *cmd = NULL;
+ unsigned int noreclaim_flag;
struct task_struct *task;
int i;
@@ -506,6 +508,8 @@ static void recover_worker(struct kthread_work *work)
msm_gem_vm_unusable(submit->vm);
}
+ noreclaim_flag = memalloc_noreclaim_save();
+
get_comm_cmdline(submit, &comm, &cmd);
if (comm && cmd) {
@@ -524,6 +528,8 @@ static void recover_worker(struct kthread_work *work)
pm_runtime_get_sync(&gpu->pdev->dev);
msm_gpu_crashstate_capture(gpu, submit, NULL, comm, cmd);
+ memalloc_noreclaim_restore(noreclaim_flag);
+
kfree(cmd);
kfree(comm);
@@ -588,6 +594,7 @@ void msm_gpu_fault_crashstate_capture(struct msm_gpu *gpu, struct msm_gpu_fault_
struct msm_gem_submit *submit;
struct msm_ringbuffer *cur_ring = gpu->funcs->active_ring(gpu);
char *comm = NULL, *cmd = NULL;
+ unsigned int noreclaim_flag;
mutex_lock(&gpu->lock);
@@ -595,6 +602,8 @@ void msm_gpu_fault_crashstate_capture(struct msm_gpu *gpu, struct msm_gpu_fault_
if (submit && submit->fault_dumped)
goto resume_smmu;
+ noreclaim_flag = memalloc_noreclaim_save();
+
if (submit) {
get_comm_cmdline(submit, &comm, &cmd);
@@ -610,6 +619,8 @@ void msm_gpu_fault_crashstate_capture(struct msm_gpu *gpu, struct msm_gpu_fault_
msm_gpu_crashstate_capture(gpu, submit, fault_info, comm, cmd);
pm_runtime_put_sync(&gpu->pdev->dev);
+ memalloc_noreclaim_restore(noreclaim_flag);
+
kfree(cmd);
kfree(comm);
--
2.53.0.rc1.217.geba53bf80e-goog
On (26/01/27 16:33), Sergey Senozhatsky wrote:
> [..]

Gentle ping.