[PATCH] net: wwan: t7xx: fix race between TX thread and system PM suspend

Tim JH Chen posted 1 patch an hour ago
drivers/net/wwan/t7xx/t7xx_hif_dpmaif.c    |  6 ++++++
drivers/net/wwan/t7xx/t7xx_hif_dpmaif.h    |  3 +++
drivers/net/wwan/t7xx/t7xx_hif_dpmaif_tx.c | 16 +++++++++++++---
3 files changed, 22 insertions(+), 3 deletions(-)
[PATCH] net: wwan: t7xx: fix race between TX thread and system PM suspend
Posted by Tim JH Chen an hour ago
v2: Address two concerns raised in AI-assisted code review of v1:

1. [High] t7xx_dpmaif_resume() was unconditionally restoring state to
   DPMAIF_STATE_PWRON regardless of the state before suspend.  If the
   modem had already been moved to DPMAIF_STATE_PWROFF by
   t7xx_dpmaif_md_state_callback() (MD_STATE_EXCEPTION or
   MD_STATE_STOPPED) prior to system suspend, resume would incorrectly
   re-arm the TX kthread guard, allowing TX HW writes against a modem
   the MD state machine considers stopped or in exception.

   Fix: save dpmaif_ctrl->state into pre_suspend_state at the start of
   t7xx_dpmaif_suspend() and restore that saved value in
   t7xx_dpmaif_resume(), so a pre-suspend PWROFF is preserved across
   the suspend/resume cycle.

2. [Medium] The v1 second state check before pm_runtime_resume_and_get()
   only narrowed the TOCTOU window -- it did not close it.  The state
   field was a plain enum read and written without any lock or
   READ_ONCE/WRITE_ONCE annotation.  After the check passed on one CPU,
   the suspend path on another CPU could still set state=PWROFF and
   begin PM teardown before the kthread reached pm_runtime_resume_and_get(),
   reproducing the deadlock.

   Fix: introduce tx_pm_lock (struct mutex) held by the kthread across
   the [state check -> pm_runtime_resume_and_get -> pm_runtime_put]
   sequence.  t7xx_dpmaif_suspend() acquires this lock before setting
   DPMAIF_STATE_PWROFF, which serialises with any in-progress kthread
   PM section and guarantees the kthread cannot enter
   pm_runtime_resume_and_get() after the state flag is set.
   READ_ONCE/WRITE_ONCE are added at every access point of the state
   flag that crosses the suspend/resume boundary to prevent
   compiler-visible tearing.

The original v1 description of the root cause and tested fix still
applies (deadlock between t7xx_dpmaif_tx_hw_push_thread calling
pm_runtime_resume_and_get() and the system PM suspend path, triggered
with ASPM L1 enabled after repeated suspend/resume cycles).

Tested: no soft lockup over 500+ suspend/resume cycles with SIM
registered and ASPM L1 enabled (previously triggered in < 300).

Fixes: 05f7e89ab ("Linux 6.19")
Signed-off-by: Tim JH Chen <tim.jh.chen@wnc.com.tw>
---
 drivers/net/wwan/t7xx/t7xx_hif_dpmaif.c    |  6 ++++++
 drivers/net/wwan/t7xx/t7xx_hif_dpmaif.h    |  3 +++
 drivers/net/wwan/t7xx/t7xx_hif_dpmaif_tx.c | 16 +++++++++++++---
 3 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/drivers/net/wwan/t7xx/t7xx_hif_dpmaif.c b/drivers/net/wwan/t7xx/t7xx_hif_dpmaif.c
index 7ff33c1d6ac7..e1f3eeb2c947 100644
--- a/drivers/net/wwan/t7xx/t7xx_hif_dpmaif.c
+++ b/drivers/net/wwan/t7xx/t7xx_hif_dpmaif.c
@@ -412,6 +412,11 @@ static int t7xx_dpmaif_suspend(struct t7xx_pci_dev *t7xx_dev, void *param)
 	struct dpmaif_ctrl *dpmaif_ctrl = param;
 
 	t7xx_dpmaif_tx_stop(dpmaif_ctrl);
+	dpmaif_ctrl->pre_suspend_state = READ_ONCE(dpmaif_ctrl->state);
+	mutex_lock(&dpmaif_ctrl->tx_pm_lock);
+	WRITE_ONCE(dpmaif_ctrl->state, DPMAIF_STATE_PWROFF);
+	mutex_unlock(&dpmaif_ctrl->tx_pm_lock);
+	wake_up(&dpmaif_ctrl->tx_wq);
 	t7xx_dpmaif_hw_stop_all_txq(&dpmaif_ctrl->hw_info);
 	t7xx_dpmaif_hw_stop_all_rxq(&dpmaif_ctrl->hw_info);
 	t7xx_dpmaif_disable_irq(dpmaif_ctrl);
@@ -451,6 +456,7 @@ static int t7xx_dpmaif_resume(struct t7xx_pci_dev *t7xx_dev, void *param)
 	if (!dpmaif_ctrl)
 		return 0;
 
+	WRITE_ONCE(dpmaif_ctrl->state, dpmaif_ctrl->pre_suspend_state);
 	t7xx_dpmaif_start_txrx_qs(dpmaif_ctrl);
 	t7xx_dpmaif_enable_irq(dpmaif_ctrl);
 	t7xx_dpmaif_unmask_dlq_intr(dpmaif_ctrl);
diff --git a/drivers/net/wwan/t7xx/t7xx_hif_dpmaif.h b/drivers/net/wwan/t7xx/t7xx_hif_dpmaif.h
index 0ce4505e813d..670ed2cca761 100644
--- a/drivers/net/wwan/t7xx/t7xx_hif_dpmaif.h
+++ b/drivers/net/wwan/t7xx/t7xx_hif_dpmaif.h
@@ -20,6 +20,7 @@
 
 #include <linux/bitmap.h>
 #include <linux/mm_types.h>
+#include <linux/mutex.h>
 #include <linux/netdevice.h>
 #include <linux/sched.h>
 #include <linux/skbuff.h>
@@ -172,6 +173,8 @@ struct dpmaif_ctrl {
 	struct t7xx_pci_dev		*t7xx_dev;
 	struct md_pm_entity		dpmaif_pm_entity;
 	enum dpmaif_state		state;
+	enum dpmaif_state		pre_suspend_state;
+	struct mutex			tx_pm_lock;
 	bool				dpmaif_sw_init_done;
 	struct dpmaif_hw_info		hw_info;
 	struct dpmaif_tx_queue		txq[DPMAIF_TXQ_NUM];
diff --git a/drivers/net/wwan/t7xx/t7xx_hif_dpmaif_tx.c b/drivers/net/wwan/t7xx/t7xx_hif_dpmaif_tx.c
index 236d632cf591..197c0ab3fd39 100644
--- a/drivers/net/wwan/t7xx/t7xx_hif_dpmaif_tx.c
+++ b/drivers/net/wwan/t7xx/t7xx_hif_dpmaif_tx.c
@@ -449,10 +449,10 @@ static int t7xx_dpmaif_tx_hw_push_thread(void *arg)
 
 	while (!kthread_should_stop()) {
 		if (t7xx_tx_lists_are_all_empty(dpmaif_ctrl) ||
-		    dpmaif_ctrl->state != DPMAIF_STATE_PWRON) {
+		    READ_ONCE(dpmaif_ctrl->state) != DPMAIF_STATE_PWRON) {
 			if (wait_event_interruptible(dpmaif_ctrl->tx_wq,
 						     (!t7xx_tx_lists_are_all_empty(dpmaif_ctrl) &&
-						     dpmaif_ctrl->state == DPMAIF_STATE_PWRON) ||
+						     READ_ONCE(dpmaif_ctrl->state) == DPMAIF_STATE_PWRON) ||
 						     kthread_should_stop()))
 				continue;
 
@@ -460,14 +460,23 @@ static int t7xx_dpmaif_tx_hw_push_thread(void *arg)
 				break;
 		}
 
+		mutex_lock(&dpmaif_ctrl->tx_pm_lock);
+		if (READ_ONCE(dpmaif_ctrl->state) != DPMAIF_STATE_PWRON) {
+			mutex_unlock(&dpmaif_ctrl->tx_pm_lock);
+			continue;
+		}
+
 		ret = pm_runtime_resume_and_get(dpmaif_ctrl->dev);
-		if (ret < 0 && ret != -EACCES)
+		if (ret < 0 && ret != -EACCES) {
+			mutex_unlock(&dpmaif_ctrl->tx_pm_lock);
 			return ret;
+		}
 
 		t7xx_pci_disable_sleep(dpmaif_ctrl->t7xx_dev);
 		t7xx_do_tx_hw_push(dpmaif_ctrl);
 		t7xx_pci_enable_sleep(dpmaif_ctrl->t7xx_dev);
 		pm_runtime_put_autosuspend(dpmaif_ctrl->dev);
+		mutex_unlock(&dpmaif_ctrl->tx_pm_lock);
 	}
 
 	return 0;
@@ -475,6 +484,7 @@ static int t7xx_dpmaif_tx_hw_push_thread(void *arg)
 
 int t7xx_dpmaif_tx_thread_init(struct dpmaif_ctrl *dpmaif_ctrl)
 {
+	mutex_init(&dpmaif_ctrl->tx_pm_lock);
 	init_waitqueue_head(&dpmaif_ctrl->tx_wq);
 	dpmaif_ctrl->tx_thread = kthread_run(t7xx_dpmaif_tx_hw_push_thread,
 					     dpmaif_ctrl, "dpmaif_tx_hw_push");
-- 
2.43.0