[PATCH 11/11] wifi: mt76: mt7925: fix ROC deadlocks and race conditions

Zac posted 11 patches 2 weeks, 6 days ago
[PATCH 11/11] wifi: mt76: mt7925: fix ROC deadlocks and race conditions
Posted by Zac 2 weeks, 6 days ago
From: Zac Bowling <zac@zacbowling.com>

Fix multiple interrelated issues in the remain-on-channel (ROC) handling
that cause deadlocks, race conditions, and resource leaks.

Problems fixed:

1. Deadlock in sta removal ROC abort path:
   When a station is removed while a ROC operation is in progress, the
   driver would call mt7925_roc_abort_sync() which waits for ROC completion.
   However, the ROC work itself needs to acquire mt792x_mutex which is
   already held during station removal, causing a deadlock.

   Fix: Use async ROC abort (mt76_connac_mcu_abort_roc) when called from
   paths that already hold the mutex, and add MT76_STATE_ROC_ABORT flag
   to coordinate between the abort and the ROC timer.

2. ROC timer race during suspend:
   The ROC timer could fire after the device started suspending but before
   the ROC was properly aborted, causing undefined behavior.

   Fix: Delete ROC timer synchronously before suspend and check device
   state before processing ROC timeout.

3. ROC rate limiting for MLO auth failures:
   Rapid ROC requests during MLO authentication can overwhelm the firmware,
   causing authentication timeouts. The MT7925 firmware has limited ROC
   handling capacity.

   Fix: Add rate limiting infrastructure with configurable minimum interval
   between ROC requests. Track last ROC completion time and defer new
   requests if they arrive too quickly.

4. WCID leak in ROC cleanup:
   When ROC operations are aborted, the associated WCID resources were
   not being properly released, causing resource exhaustion over time.

   Fix: Ensure WCID cleanup happens in all ROC termination paths.

5. Async ROC abort race condition:
   The async ROC abort could race with normal ROC completion, causing
   double-free or use-after-free of ROC resources.

   Fix: Use MT76_STATE_ROC_ABORT flag and proper synchronization to
   prevent races between async abort and normal completion paths.

These fixes work together to provide robust ROC handling that doesn't
deadlock, properly releases resources, and handles edge cases during
suspend and MLO operations.

Fixes: c948b5da6bbe ("wifi: mt76: mt7925: add Mediatek Wi-Fi7 driver for mt7925 device")
Signed-off-by: Zac Bowling <zac@zacbowling.com>
---
 drivers/net/wireless/mediatek/mt76/mt76.h     |   1 +
 .../net/wireless/mediatek/mt76/mt7925/main.c  | 175 ++++++++++++++++--
 drivers/net/wireless/mediatek/mt76/mt792x.h   |   7 +
 3 files changed, 170 insertions(+), 13 deletions(-)

diff --git a/drivers/net/wireless/mediatek/mt76/mt76.h b/drivers/net/wireless/mediatek/mt76/mt76.h
index d05e83ea1cac..91f9dd95c89e 100644
--- a/drivers/net/wireless/mediatek/mt76/mt76.h
+++ b/drivers/net/wireless/mediatek/mt76/mt76.h
@@ -511,6 +511,7 @@ enum {
 	MT76_STATE_POWER_OFF,
 	MT76_STATE_SUSPEND,
 	MT76_STATE_ROC,
+	MT76_STATE_ROC_ABORT,
 	MT76_STATE_PM,
 	MT76_STATE_WED_RESET,
 };
diff --git a/drivers/net/wireless/mediatek/mt76/mt7925/main.c b/drivers/net/wireless/mediatek/mt76/mt7925/main.c
index cc7ef2c17032..2404f7812897 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7925/main.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7925/main.c
@@ -453,6 +453,24 @@ static void mt7925_roc_iter(void *priv, u8 *mac,
 	mt7925_mcu_abort_roc(phy, &mvif->bss_conf, phy->roc_token_id);
 }
 
+/* Async ROC abort - safe to call while holding mutex.
+ * Sets abort flag and lets roc_work handle cleanup without blocking.
+ * This prevents deadlock when called from sta_remove path which holds mutex.
+ */
+static void mt7925_roc_abort_async(struct mt792x_dev *dev)
+{
+	struct mt792x_phy *phy = &dev->phy;
+
+	/* Set abort flag - roc_work checks this before acquiring mutex */
+	set_bit(MT76_STATE_ROC_ABORT, &phy->mt76->state);
+
+	/* Stop timer and schedule work to handle cleanup.
+	 * Must schedule work since timer may not have fired yet.
+	 */
+	timer_delete(&phy->roc_timer);
+	ieee80211_queue_work(phy->mt76->hw, &phy->roc_work);
+}
+
 void mt7925_roc_abort_sync(struct mt792x_dev *dev)
 {
 	struct mt792x_phy *phy = &dev->phy;
@@ -473,6 +491,17 @@ void mt7925_roc_work(struct work_struct *work)
 	phy = (struct mt792x_phy *)container_of(work, struct mt792x_phy,
 						roc_work);
 
+	/* Check abort flag BEFORE acquiring mutex to prevent deadlock.
+	 * If abort is requested while we're in the sta_remove path (which
+	 * holds the mutex), we must not try to acquire it or we'll deadlock.
+	 * Clear the flags and only notify mac80211 if ROC was actually active.
+	 */
+	if (test_and_clear_bit(MT76_STATE_ROC_ABORT, &phy->mt76->state)) {
+		if (test_and_clear_bit(MT76_STATE_ROC, &phy->mt76->state))
+			ieee80211_remain_on_channel_expired(phy->mt76->hw);
+		return;
+	}
+
 	if (!test_and_clear_bit(MT76_STATE_ROC, &phy->mt76->state))
 		return;
 
@@ -500,14 +529,93 @@ static int mt7925_abort_roc(struct mt792x_phy *phy,
 	return err;
 }
 
+/* ROC rate limiting constants - exponential backoff to prevent MCU overload
+ * when upper layers trigger rapid reconnection cycles (e.g., MLO auth failures).
+ * Max backoff ~1.6s, resets after 10s of no timeouts.
+ */
+#define MT7925_ROC_BACKOFF_BASE_MS	100
+#define MT7925_ROC_BACKOFF_MAX_MS	1600
+#define MT7925_ROC_TIMEOUT_RESET_MS	10000
+#define MT7925_ROC_TIMEOUT_WARN_THRESH	5
+
+/* Check if ROC should be throttled due to recent timeouts.
+ * Returns delay in jiffies if throttling, 0 if OK to proceed.
+ */
+static unsigned long mt7925_roc_throttle_check(struct mt792x_phy *phy)
+{
+	unsigned long now = jiffies;
+
+	/* Reset timeout counter if it's been a while since last timeout */
+	if (phy->roc_timeout_count &&
+	    time_after(now, phy->roc_last_timeout +
+		       msecs_to_jiffies(MT7925_ROC_TIMEOUT_RESET_MS))) {
+		phy->roc_timeout_count = 0;
+		phy->roc_backoff_until = 0;
+	}
+
+	/* Check if we're still in backoff period */
+	if (phy->roc_backoff_until && time_before(now, phy->roc_backoff_until))
+		return phy->roc_backoff_until - now;
+
+	return 0;
+}
+
+/* Record ROC timeout and calculate backoff period */
+static void mt7925_roc_record_timeout(struct mt792x_phy *phy)
+{
+	unsigned int backoff_ms;
+
+	phy->roc_last_timeout = jiffies;
+	phy->roc_timeout_count++;
+
+	/* Exponential backoff: 100ms, 200ms, 400ms, 800ms, 1600ms (capped) */
+	backoff_ms = MT7925_ROC_BACKOFF_BASE_MS <<
+		     min_t(u8, phy->roc_timeout_count - 1, 4);
+	if (backoff_ms > MT7925_ROC_BACKOFF_MAX_MS)
+		backoff_ms = MT7925_ROC_BACKOFF_MAX_MS;
+
+	phy->roc_backoff_until = jiffies + msecs_to_jiffies(backoff_ms);
+
+	/* Warn if we're seeing repeated timeouts - likely upper layer issue */
+	if (phy->roc_timeout_count == MT7925_ROC_TIMEOUT_WARN_THRESH)
+		dev_warn(phy->dev->mt76.dev,
+			 "mt7925: %u consecutive ROC timeouts, possible mac80211/wpa_supplicant issue (MLO key race?)\n",
+			 phy->roc_timeout_count);
+}
+
+/* Clear timeout tracking on successful ROC */
+static void mt7925_roc_clear_timeout(struct mt792x_phy *phy)
+{
+	phy->roc_timeout_count = 0;
+	phy->roc_backoff_until = 0;
+}
+
 static int mt7925_set_roc(struct mt792x_phy *phy,
 			  struct mt792x_bss_conf *mconf,
 			  struct ieee80211_channel *chan,
 			  int duration,
 			  enum mt7925_roc_req type)
 {
+	unsigned long throttle;
 	int err;
 
+	/* Check rate limiting - if in backoff period, wait or return busy */
+	throttle = mt7925_roc_throttle_check(phy);
+	if (throttle) {
+		/* For short backoffs, wait; for longer ones, return busy */
+		if (throttle < msecs_to_jiffies(200)) {
+			msleep(jiffies_to_msecs(throttle));
+		} else {
+			dev_dbg(phy->dev->mt76.dev,
+				"mt7925: ROC throttled, %lu ms remaining\n",
+				jiffies_to_msecs(throttle));
+			return -EBUSY;
+		}
+	}
+
+	/* Clear stale abort flag from previous ROC */
+	clear_bit(MT76_STATE_ROC_ABORT, &phy->mt76->state);
+
 	if (test_and_set_bit(MT76_STATE_ROC, &phy->mt76->state))
 		return -EBUSY;
 
@@ -523,7 +631,11 @@ static int mt7925_set_roc(struct mt792x_phy *phy,
 	if (!wait_event_timeout(phy->roc_wait, phy->roc_grant, 4 * HZ)) {
 		mt7925_mcu_abort_roc(phy, mconf, phy->roc_token_id);
 		clear_bit(MT76_STATE_ROC, &phy->mt76->state);
+		mt7925_roc_record_timeout(phy);
 		err = -ETIMEDOUT;
+	} else {
+		/* Successful ROC - reset timeout tracking */
+		mt7925_roc_clear_timeout(phy);
 	}
 
 out:
@@ -534,8 +646,27 @@ static int mt7925_set_mlo_roc(struct mt792x_phy *phy,
 			      struct mt792x_bss_conf *mconf,
 			      u16 sel_links)
 {
+	unsigned long throttle;
 	int err;
 
+	/* Check rate limiting - MLO ROC is especially prone to rapid-fire
+	 * during reconnection cycles after MLO authentication failures.
+	 */
+	throttle = mt7925_roc_throttle_check(phy);
+	if (throttle) {
+		if (throttle < msecs_to_jiffies(200)) {
+			msleep(jiffies_to_msecs(throttle));
+		} else {
+			dev_dbg(phy->dev->mt76.dev,
+				"mt7925: MLO ROC throttled, %lu ms remaining\n",
+				jiffies_to_msecs(throttle));
+			return -EBUSY;
+		}
+	}
+
+	/* Clear stale abort flag from previous ROC */
+	clear_bit(MT76_STATE_ROC_ABORT, &phy->mt76->state);
+
 	if (WARN_ON_ONCE(test_and_set_bit(MT76_STATE_ROC, &phy->mt76->state)))
 		return -EBUSY;
 
@@ -550,7 +681,10 @@ static int mt7925_set_mlo_roc(struct mt792x_phy *phy,
 	if (!wait_event_timeout(phy->roc_wait, phy->roc_grant, 4 * HZ)) {
 		mt7925_mcu_abort_roc(phy, mconf, phy->roc_token_id);
 		clear_bit(MT76_STATE_ROC, &phy->mt76->state);
+		mt7925_roc_record_timeout(phy);
 		err = -ETIMEDOUT;
+	} else {
+		mt7925_roc_clear_timeout(phy);
 	}
 
 out:
@@ -567,6 +701,7 @@ static int mt7925_remain_on_channel(struct ieee80211_hw *hw,
 	struct mt792x_phy *phy = mt792x_hw_phy(hw);
 	int err;
 
+	cancel_work_sync(&phy->roc_work);
 	mt792x_mutex_acquire(phy->dev);
 	err = mt7925_set_roc(phy, &mvif->bss_conf,
 			     chan, duration, MT7925_ROC_REQ_ROC);
@@ -874,14 +1009,14 @@ static int mt7925_mac_link_sta_add(struct mt76_dev *mdev,
 	if (!mlink)
 		return -EINVAL;
 
-	idx = mt76_wcid_alloc(dev->mt76.wcid_mask, MT792x_WTBL_STA - 1);
-	if (idx < 0)
-		return -ENOSPC;
-
 	mconf = mt792x_vif_to_link(mvif, link_id);
 	if (!mconf)
 		return -EINVAL;
 
+	idx = mt76_wcid_alloc(dev->mt76.wcid_mask, MT792x_WTBL_STA - 1);
+	if (idx < 0)
+		return -ENOSPC;
+
 	mt76_wcid_init(&mlink->wcid, 0);
 	mlink->wcid.sta = 1;
 	mlink->wcid.idx = idx;
@@ -901,14 +1036,16 @@ static int mt7925_mac_link_sta_add(struct mt76_dev *mdev,
 
 	ret = mt76_connac_pm_wake(&dev->mphy, &dev->pm);
 	if (ret)
-		return ret;
+		goto err_wcid;
 
 	mt7925_mac_wtbl_update(dev, idx,
 			       MT_WTBL_UPDATE_ADM_COUNT_CLEAR);
 
 	link_conf = mt792x_vif_to_bss_conf(vif, link_id);
-	if (!link_conf)
-		return -EINVAL;
+	if (!link_conf) {
+		ret = -EINVAL;
+		goto err_wcid;
+	}
 
 	/* should update bss info before STA add */
 	if (vif->type == NL80211_IFTYPE_STATION && !link_sta->sta->tdls) {
@@ -920,7 +1057,7 @@ static int mt7925_mac_link_sta_add(struct mt76_dev *mdev,
 			ret = mt7925_mcu_add_bss_info(&dev->phy, mconf->mt76.ctx,
 						      link_conf, link_sta, false);
 		if (ret)
-			return ret;
+			goto err_wcid;
 	}
 
 	if (ieee80211_vif_is_mld(vif) &&
@@ -928,28 +1065,34 @@ static int mt7925_mac_link_sta_add(struct mt76_dev *mdev,
 		ret = mt7925_mcu_sta_update(dev, link_sta, vif, true,
 					    MT76_STA_INFO_STATE_NONE);
 		if (ret)
-			return ret;
+			goto err_wcid;
 	} else if (ieee80211_vif_is_mld(vif) &&
 		   link_sta != mlink->pri_link) {
 		ret = mt7925_mcu_sta_update(dev, mlink->pri_link, vif,
 					    true, MT76_STA_INFO_STATE_ASSOC);
 		if (ret)
-			return ret;
+			goto err_wcid;
 
 		ret = mt7925_mcu_sta_update(dev, link_sta, vif, true,
 					    MT76_STA_INFO_STATE_ASSOC);
 		if (ret)
-			return ret;
+			goto err_wcid;
 	} else {
 		ret = mt7925_mcu_sta_update(dev, link_sta, vif, true,
 					    MT76_STA_INFO_STATE_NONE);
 		if (ret)
-			return ret;
+			goto err_wcid;
 	}
 
 	mt76_connac_power_save_sched(&dev->mphy, &dev->pm);
 
 	return 0;
+
+err_wcid:
+	rcu_assign_pointer(dev->mt76.wcid[idx], NULL);
+	mt76_wcid_mask_clear(dev->mt76.wcid_mask, idx);
+	mt76_connac_power_save_sched(&dev->mphy, &dev->pm);
+	return ret;
 }
 
 static int
@@ -1135,7 +1278,8 @@ static void mt7925_mac_link_sta_remove(struct mt76_dev *mdev,
 	if (!mlink)
 		return;
 
-	mt7925_roc_abort_sync(dev);
+	/* Async abort - caller already holds mutex */
+	mt7925_roc_abort_async(dev);
 
 	mt76_connac_free_pending_tx_skbs(&dev->pm, &mlink->wcid);
 	mt76_connac_pm_wake(&dev->mphy, &dev->pm);
@@ -1530,6 +1674,8 @@ static int mt7925_suspend(struct ieee80211_hw *hw,
 	cancel_delayed_work_sync(&dev->pm.ps_work);
 	mt76_connac_free_pending_tx_skbs(&dev->pm, NULL);
 
+	/* Cancel ROC before quiescing starts */
+	mt7925_roc_abort_sync(dev);
 	mt792x_mutex_acquire(dev);
 
 	clear_bit(MT76_STATE_RUNNING, &phy->mt76->state);
@@ -1876,6 +2022,8 @@ static void mt7925_mgd_prepare_tx(struct ieee80211_hw *hw,
 	u16 duration = info->duration ? info->duration :
 		       jiffies_to_msecs(HZ);
 
+	cancel_work_sync(&mvif->phy->roc_work);
+
 	mt792x_mutex_acquire(dev);
 	mt7925_set_roc(mvif->phy, &mvif->bss_conf,
 		       mvif->bss_conf.mt76.ctx->def.chan, duration,
@@ -2033,6 +2181,7 @@ mt7925_change_vif_links(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 	if (old_links == new_links)
 		return 0;
 
+	cancel_work_sync(&phy->roc_work);
 	mt792x_mutex_acquire(dev);
 
 	for_each_set_bit(link_id, &rem, IEEE80211_MLD_MAX_NUM_LINKS) {
diff --git a/drivers/net/wireless/mediatek/mt76/mt792x.h b/drivers/net/wireless/mediatek/mt76/mt792x.h
index 8388638ed550..d9c1ea709390 100644
--- a/drivers/net/wireless/mediatek/mt76/mt792x.h
+++ b/drivers/net/wireless/mediatek/mt76/mt792x.h
@@ -186,6 +186,13 @@ struct mt792x_phy {
 	wait_queue_head_t roc_wait;
 	u8 roc_token_id;
 	bool roc_grant;
+
+	/* ROC rate limiting to prevent MCU overload during rapid reconnection
+	 * cycles (e.g., MLO authentication failures causing repeated ROC).
+	 */
+	u8 roc_timeout_count;		/* consecutive ROC timeouts */
+	unsigned long roc_last_timeout;	/* jiffies of last timeout */
+	unsigned long roc_backoff_until;/* don't issue ROC until this time */
 };
 
 struct mt792x_irq_map {
-- 
2.52.0
Re: [PATCH 11/11] wifi: mt76: mt7925: fix ROC deadlocks and race conditions
Posted by kernel test robot 2 weeks, 5 days ago
Hi Zac,

kernel test robot noticed the following build warnings:

[auto build test WARNING on wireless-next/main]
[also build test WARNING on wireless/main linus/master v6.19-rc6 next-20260119]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Zac/wifi-mt76-fix-list-corruption-in-mt76_wcid_cleanup/20260120-143842
base:   https://git.kernel.org/pub/scm/linux/kernel/git/wireless/wireless-next.git main
patch link:    https://lore.kernel.org/r/20260120062854.126501-12-zac%40zacbowling.com
patch subject: [PATCH 11/11] wifi: mt76: mt7925: fix ROC deadlocks and race conditions
config: i386-randconfig-015-20260120 (https://download.01.org/0day-ci/archive/20260120/202601202144.ee4DM9Pz-lkp@intel.com/config)
compiler: clang version 20.1.8 (https://github.com/llvm/llvm-project 87f0227cb60147a26a1eeb4fb06e3b505e9c7261)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260120/202601202144.ee4DM9Pz-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202601202144.ee4DM9Pz-lkp@intel.com/

All warnings (new ones prefixed by >>):

>> drivers/net/wireless/mediatek/mt76/mt7925/main.c:611:5: warning: format specifies type 'unsigned long' but the argument has type 'unsigned int' [-Wformat]
     610 |                                 "mt7925: ROC throttled, %lu ms remaining\n",
         |                                                         ~~~
         |                                                         %u
     611 |                                 jiffies_to_msecs(throttle));
         |                                 ^~~~~~~~~~~~~~~~~~~~~~~~~~
   include/linux/dev_printk.h:165:39: note: expanded from macro 'dev_dbg'
     165 |         dynamic_dev_dbg(dev, dev_fmt(fmt), ##__VA_ARGS__)
         |                                      ~~~     ^~~~~~~~~~~
   include/linux/dynamic_debug.h:285:19: note: expanded from macro 'dynamic_dev_dbg'
     285 |                            dev, fmt, ##__VA_ARGS__)
         |                                 ~~~    ^~~~~~~~~~~
   include/linux/dynamic_debug.h:261:59: note: expanded from macro '_dynamic_func_call'
     261 |         _dynamic_func_call_cls(_DPRINTK_CLASS_DFLT, fmt, func, ##__VA_ARGS__)
         |                                                                  ^~~~~~~~~~~
   include/linux/dynamic_debug.h:259:65: note: expanded from macro '_dynamic_func_call_cls'
     259 |         __dynamic_func_call_cls(__UNIQUE_ID(ddebug), cls, fmt, func, ##__VA_ARGS__)
         |                                                                        ^~~~~~~~~~~
   include/linux/dynamic_debug.h:231:15: note: expanded from macro '__dynamic_func_call_cls'
     231 |                 func(&id, ##__VA_ARGS__);                       \
         |                             ^~~~~~~~~~~
   drivers/net/wireless/mediatek/mt76/mt7925/main.c:662:5: warning: format specifies type 'unsigned long' but the argument has type 'unsigned int' [-Wformat]
     661 |                                 "mt7925: MLO ROC throttled, %lu ms remaining\n",
         |                                                             ~~~
         |                                                             %u
     662 |                                 jiffies_to_msecs(throttle));
         |                                 ^~~~~~~~~~~~~~~~~~~~~~~~~~
   include/linux/dev_printk.h:165:39: note: expanded from macro 'dev_dbg'
     165 |         dynamic_dev_dbg(dev, dev_fmt(fmt), ##__VA_ARGS__)
         |                                      ~~~     ^~~~~~~~~~~
   include/linux/dynamic_debug.h:285:19: note: expanded from macro 'dynamic_dev_dbg'
     285 |                            dev, fmt, ##__VA_ARGS__)
         |                                 ~~~    ^~~~~~~~~~~
   include/linux/dynamic_debug.h:261:59: note: expanded from macro '_dynamic_func_call'
     261 |         _dynamic_func_call_cls(_DPRINTK_CLASS_DFLT, fmt, func, ##__VA_ARGS__)
         |                                                                  ^~~~~~~~~~~
   include/linux/dynamic_debug.h:259:65: note: expanded from macro '_dynamic_func_call_cls'
     259 |         __dynamic_func_call_cls(__UNIQUE_ID(ddebug), cls, fmt, func, ##__VA_ARGS__)
         |                                                                        ^~~~~~~~~~~
   include/linux/dynamic_debug.h:231:15: note: expanded from macro '__dynamic_func_call_cls'
     231 |                 func(&id, ##__VA_ARGS__);                       \
         |                             ^~~~~~~~~~~
   2 warnings generated.


vim +611 drivers/net/wireless/mediatek/mt76/mt7925/main.c

   592	
   593	static int mt7925_set_roc(struct mt792x_phy *phy,
   594				  struct mt792x_bss_conf *mconf,
   595				  struct ieee80211_channel *chan,
   596				  int duration,
   597				  enum mt7925_roc_req type)
   598	{
   599		unsigned long throttle;
   600		int err;
   601	
   602		/* Check rate limiting - if in backoff period, wait or return busy */
   603		throttle = mt7925_roc_throttle_check(phy);
   604		if (throttle) {
   605			/* For short backoffs, wait; for longer ones, return busy */
   606			if (throttle < msecs_to_jiffies(200)) {
   607				msleep(jiffies_to_msecs(throttle));
   608			} else {
   609				dev_dbg(phy->dev->mt76.dev,
   610					"mt7925: ROC throttled, %lu ms remaining\n",
 > 611					jiffies_to_msecs(throttle));
   612				return -EBUSY;
   613			}
   614		}
   615	
   616		/* Clear stale abort flag from previous ROC */
   617		clear_bit(MT76_STATE_ROC_ABORT, &phy->mt76->state);
   618	
   619		if (test_and_set_bit(MT76_STATE_ROC, &phy->mt76->state))
   620			return -EBUSY;
   621	
   622		phy->roc_grant = false;
   623	
   624		err = mt7925_mcu_set_roc(phy, mconf, chan, duration, type,
   625					 ++phy->roc_token_id);
   626		if (err < 0) {
   627			clear_bit(MT76_STATE_ROC, &phy->mt76->state);
   628			goto out;
   629		}
   630	
   631		if (!wait_event_timeout(phy->roc_wait, phy->roc_grant, 4 * HZ)) {
   632			mt7925_mcu_abort_roc(phy, mconf, phy->roc_token_id);
   633			clear_bit(MT76_STATE_ROC, &phy->mt76->state);
   634			mt7925_roc_record_timeout(phy);
   635			err = -ETIMEDOUT;
   636		} else {
   637			/* Successful ROC - reset timeout tracking */
   638			mt7925_roc_clear_timeout(phy);
   639		}
   640	
   641	out:
   642		return err;
   643	}
   644	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
Re: [PATCH 11/11] wifi: mt76: mt7925: fix ROC deadlocks and race conditions
Posted by kernel test robot 2 weeks, 5 days ago
Hi Zac,

kernel test robot noticed the following build warnings:

[auto build test WARNING on wireless-next/main]
[also build test WARNING on wireless/main linus/master v6.19-rc6 next-20260119]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Zac/wifi-mt76-fix-list-corruption-in-mt76_wcid_cleanup/20260120-143842
base:   https://git.kernel.org/pub/scm/linux/kernel/git/wireless/wireless-next.git main
patch link:    https://lore.kernel.org/r/20260120062854.126501-12-zac%40zacbowling.com
patch subject: [PATCH 11/11] wifi: mt76: mt7925: fix ROC deadlocks and race conditions
config: m68k-allyesconfig (https://download.01.org/0day-ci/archive/20260120/202601201954.zxO1N1DS-lkp@intel.com/config)
compiler: m68k-linux-gcc (GCC) 15.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260120/202601201954.zxO1N1DS-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202601201954.zxO1N1DS-lkp@intel.com/

All warnings (new ones prefixed by >>):

   In file included from include/linux/printk.h:621,
                    from include/linux/kernel.h:31,
                    from include/linux/skbuff.h:13,
                    from include/linux/if_ether.h:19,
                    from include/linux/etherdevice.h:20,
                    from drivers/net/wireless/mediatek/mt76/mt7925/main.c:4:
   drivers/net/wireless/mediatek/mt76/mt7925/main.c: In function 'mt7925_set_roc':
>> drivers/net/wireless/mediatek/mt76/mt7925/main.c:610:33: warning: format '%lu' expects argument of type 'long unsigned int', but argument 4 has type 'unsigned int' [-Wformat=]
     610 |                                 "mt7925: ROC throttled, %lu ms remaining\n",
         |                                 ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   include/linux/dynamic_debug.h:231:29: note: in definition of macro '__dynamic_func_call_cls'
     231 |                 func(&id, ##__VA_ARGS__);                       \
         |                             ^~~~~~~~~~~
   include/linux/dynamic_debug.h:261:9: note: in expansion of macro '_dynamic_func_call_cls'
     261 |         _dynamic_func_call_cls(_DPRINTK_CLASS_DFLT, fmt, func, ##__VA_ARGS__)
         |         ^~~~~~~~~~~~~~~~~~~~~~
   include/linux/dynamic_debug.h:284:9: note: in expansion of macro '_dynamic_func_call'
     284 |         _dynamic_func_call(fmt, __dynamic_dev_dbg,              \
         |         ^~~~~~~~~~~~~~~~~~
   include/linux/dev_printk.h:165:9: note: in expansion of macro 'dynamic_dev_dbg'
     165 |         dynamic_dev_dbg(dev, dev_fmt(fmt), ##__VA_ARGS__)
         |         ^~~~~~~~~~~~~~~
   include/linux/dev_printk.h:165:30: note: in expansion of macro 'dev_fmt'
     165 |         dynamic_dev_dbg(dev, dev_fmt(fmt), ##__VA_ARGS__)
         |                              ^~~~~~~
   drivers/net/wireless/mediatek/mt76/mt7925/main.c:609:25: note: in expansion of macro 'dev_dbg'
     609 |                         dev_dbg(phy->dev->mt76.dev,
         |                         ^~~~~~~
   drivers/net/wireless/mediatek/mt76/mt7925/main.c:610:59: note: format string is defined here
     610 |                                 "mt7925: ROC throttled, %lu ms remaining\n",
         |                                                         ~~^
         |                                                           |
         |                                                           long unsigned int
         |                                                         %u
   drivers/net/wireless/mediatek/mt76/mt7925/main.c: In function 'mt7925_set_mlo_roc':
   drivers/net/wireless/mediatek/mt76/mt7925/main.c:661:33: warning: format '%lu' expects argument of type 'long unsigned int', but argument 4 has type 'unsigned int' [-Wformat=]
     661 |                                 "mt7925: MLO ROC throttled, %lu ms remaining\n",
         |                                 ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   include/linux/dynamic_debug.h:231:29: note: in definition of macro '__dynamic_func_call_cls'
     231 |                 func(&id, ##__VA_ARGS__);                       \
         |                             ^~~~~~~~~~~
   include/linux/dynamic_debug.h:261:9: note: in expansion of macro '_dynamic_func_call_cls'
     261 |         _dynamic_func_call_cls(_DPRINTK_CLASS_DFLT, fmt, func, ##__VA_ARGS__)
         |         ^~~~~~~~~~~~~~~~~~~~~~
   include/linux/dynamic_debug.h:284:9: note: in expansion of macro '_dynamic_func_call'
     284 |         _dynamic_func_call(fmt, __dynamic_dev_dbg,              \
         |         ^~~~~~~~~~~~~~~~~~
   include/linux/dev_printk.h:165:9: note: in expansion of macro 'dynamic_dev_dbg'
     165 |         dynamic_dev_dbg(dev, dev_fmt(fmt), ##__VA_ARGS__)
         |         ^~~~~~~~~~~~~~~
   include/linux/dev_printk.h:165:30: note: in expansion of macro 'dev_fmt'
     165 |         dynamic_dev_dbg(dev, dev_fmt(fmt), ##__VA_ARGS__)
         |                              ^~~~~~~
   drivers/net/wireless/mediatek/mt76/mt7925/main.c:660:25: note: in expansion of macro 'dev_dbg'
     660 |                         dev_dbg(phy->dev->mt76.dev,
         |                         ^~~~~~~
   drivers/net/wireless/mediatek/mt76/mt7925/main.c:661:63: note: format string is defined here
     661 |                                 "mt7925: MLO ROC throttled, %lu ms remaining\n",
         |                                                             ~~^
         |                                                               |
         |                                                               long unsigned int
         |                                                             %u


vim +610 drivers/net/wireless/mediatek/mt76/mt7925/main.c

   592	
   593	static int mt7925_set_roc(struct mt792x_phy *phy,
   594				  struct mt792x_bss_conf *mconf,
   595				  struct ieee80211_channel *chan,
   596				  int duration,
   597				  enum mt7925_roc_req type)
   598	{
   599		unsigned long throttle;
   600		int err;
   601	
   602		/* Check rate limiting - if in backoff period, wait or return busy */
   603		throttle = mt7925_roc_throttle_check(phy);
   604		if (throttle) {
   605			/* For short backoffs, wait; for longer ones, return busy */
   606			if (throttle < msecs_to_jiffies(200)) {
   607				msleep(jiffies_to_msecs(throttle));
   608			} else {
   609				dev_dbg(phy->dev->mt76.dev,
 > 610					"mt7925: ROC throttled, %lu ms remaining\n",
   611					jiffies_to_msecs(throttle));
   612				return -EBUSY;
   613			}
   614		}
   615	
   616		/* Clear stale abort flag from previous ROC */
   617		clear_bit(MT76_STATE_ROC_ABORT, &phy->mt76->state);
   618	
   619		if (test_and_set_bit(MT76_STATE_ROC, &phy->mt76->state))
   620			return -EBUSY;
   621	
   622		phy->roc_grant = false;
   623	
   624		err = mt7925_mcu_set_roc(phy, mconf, chan, duration, type,
   625					 ++phy->roc_token_id);
   626		if (err < 0) {
   627			clear_bit(MT76_STATE_ROC, &phy->mt76->state);
   628			goto out;
   629		}
   630	
   631		if (!wait_event_timeout(phy->roc_wait, phy->roc_grant, 4 * HZ)) {
   632			mt7925_mcu_abort_roc(phy, mconf, phy->roc_token_id);
   633			clear_bit(MT76_STATE_ROC, &phy->mt76->state);
   634			mt7925_roc_record_timeout(phy);
   635			err = -ETIMEDOUT;
   636		} else {
   637			/* Successful ROC - reset timeout tracking */
   638			mt7925_roc_clear_timeout(phy);
   639		}
   640	
   641	out:
   642		return err;
   643	}
   644	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
Re: [PATCH 11/11] wifi: mt76: mt7925: fix ROC deadlocks and race conditions
Posted by Sean Wang 2 weeks, 6 days ago
On Tue, Jan 20, 2026 at 12:29 AM Zac <zac@zacbowling.com> wrote:
>
> From: Zac Bowling <zac@zacbowling.com>
>
> Fix multiple interrelated issues in the remain-on-channel (ROC) handling
> that cause deadlocks, race conditions, and resource leaks.
>
> Problems fixed:
>
> 1. Deadlock in sta removal ROC abort path:
>    When a station is removed while a ROC operation is in progress, the
>    driver would call mt7925_roc_abort_sync() which waits for ROC completion.
>    However, the ROC work itself needs to acquire mt792x_mutex which is
>    already held during station removal, causing a deadlock.
>
>    Fix: Use async ROC abort (mt76_connac_mcu_abort_roc) when called from
>    paths that already hold the mutex, and add MT76_STATE_ROC_ABORT flag
>    to coordinate between the abort and the ROC timer.
>

Hi Zac,

Thanks for your continued efforts on the driver.
We’ve sent a patch to address the mt7925 deadlock at the link below:
https://lists.infradead.org/pipermail/linux-mediatek/2025-December/102164.html
We plan to send the same fix to mt7921 as well.

I had a couple of questions and suggestions:
1. Would it be possible to rebase your patchset on top of this fix
(and any other pending patches that are not yet merged)? We noticed
some conflicts when applying the series, and rebasing it this way
would make it easier for nbd to integrate the full patchset.
2. Could you please elaborate on the test scenarios that would trigger
ROC rate limiting for MLO authentication failures? If I recall
correctly, ROC operations are typically handled sequentially unless
multiple interfaces are created on the same physical device. In that
case, how many virtual interfaces and which operating modes (GC/STA or
multiple STAs) are required to reproduce the issue?

I will try to prepare an out-of-tree branch with the current pending
patches to help your patchset integrate more smoothly. Thanks for
collecting community issues and fixes and incorporating them into the
driver.

             Sean

> 2. ROC timer race during suspend:
>    The ROC timer could fire after the device started suspending but before
>    the ROC was properly aborted, causing undefined behavior.
>
>    Fix: Delete ROC timer synchronously before suspend and check device
>    state before processing ROC timeout.
>
> 3. ROC rate limiting for MLO auth failures:
>    Rapid ROC requests during MLO authentication can overwhelm the firmware,
>    causing authentication timeouts. The MT7925 firmware has limited ROC
>    handling capacity.
>
>    Fix: Add rate limiting infrastructure with configurable minimum interval
>    between ROC requests. Track last ROC completion time and defer new
>    requests if they arrive too quickly.
>
> 4. WCID leak in ROC cleanup:
>    When ROC operations are aborted, the associated WCID resources were
>    not being properly released, causing resource exhaustion over time.
>
>    Fix: Ensure WCID cleanup happens in all ROC termination paths.
>
> 5. Async ROC abort race condition:
>    The async ROC abort could race with normal ROC completion, causing
>    double-free or use-after-free of ROC resources.
>
>    Fix: Use MT76_STATE_ROC_ABORT flag and proper synchronization to
>    prevent races between async abort and normal completion paths.
>
> These fixes work together to provide robust ROC handling that doesn't
> deadlock, properly releases resources, and handles edge cases during
> suspend and MLO operations.
>
> Fixes: c948b5da6bbe ("wifi: mt76: mt7925: add Mediatek Wi-Fi7 driver for mt7925 device")
> Signed-off-by: Zac Bowling <zac@zacbowling.com>
> ---
>  drivers/net/wireless/mediatek/mt76/mt76.h     |   1 +
>  .../net/wireless/mediatek/mt76/mt7925/main.c  | 175 ++++++++++++++++--
>  drivers/net/wireless/mediatek/mt76/mt792x.h   |   7 +
>  3 files changed, 170 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/net/wireless/mediatek/mt76/mt76.h b/drivers/net/wireless/mediatek/mt76/mt76.h
> index d05e83ea1cac..91f9dd95c89e 100644
> --- a/drivers/net/wireless/mediatek/mt76/mt76.h
> +++ b/drivers/net/wireless/mediatek/mt76/mt76.h
> @@ -511,6 +511,7 @@ enum {
>         MT76_STATE_POWER_OFF,
>         MT76_STATE_SUSPEND,
>         MT76_STATE_ROC,
> +       MT76_STATE_ROC_ABORT,
>         MT76_STATE_PM,
>         MT76_STATE_WED_RESET,
>  };
> diff --git a/drivers/net/wireless/mediatek/mt76/mt7925/main.c b/drivers/net/wireless/mediatek/mt76/mt7925/main.c
> index cc7ef2c17032..2404f7812897 100644
> --- a/drivers/net/wireless/mediatek/mt76/mt7925/main.c
> +++ b/drivers/net/wireless/mediatek/mt76/mt7925/main.c
> @@ -453,6 +453,24 @@ static void mt7925_roc_iter(void *priv, u8 *mac,
>         mt7925_mcu_abort_roc(phy, &mvif->bss_conf, phy->roc_token_id);
>  }
>
> +/* Async ROC abort - safe to call while holding mutex.
> + * Sets abort flag and lets roc_work handle cleanup without blocking.
> + * This prevents deadlock when called from sta_remove path which holds mutex.
> + */
> +static void mt7925_roc_abort_async(struct mt792x_dev *dev)
> +{
> +       struct mt792x_phy *phy = &dev->phy;
> +
> +       /* Set abort flag - roc_work checks this before acquiring mutex */
> +       set_bit(MT76_STATE_ROC_ABORT, &phy->mt76->state);
> +
> +       /* Stop timer and schedule work to handle cleanup.
> +        * Must schedule work since timer may not have fired yet.
> +        */
> +       timer_delete(&phy->roc_timer);
> +       ieee80211_queue_work(phy->mt76->hw, &phy->roc_work);
> +}
> +
>  void mt7925_roc_abort_sync(struct mt792x_dev *dev)
>  {
>         struct mt792x_phy *phy = &dev->phy;
> @@ -473,6 +491,17 @@ void mt7925_roc_work(struct work_struct *work)
>         phy = (struct mt792x_phy *)container_of(work, struct mt792x_phy,
>                                                 roc_work);
>
> +       /* Check abort flag BEFORE acquiring mutex to prevent deadlock.
> +        * If abort is requested while we're in the sta_remove path (which
> +        * holds the mutex), we must not try to acquire it or we'll deadlock.
> +        * Clear the flags and only notify mac80211 if ROC was actually active.
> +        */
> +       if (test_and_clear_bit(MT76_STATE_ROC_ABORT, &phy->mt76->state)) {
> +               if (test_and_clear_bit(MT76_STATE_ROC, &phy->mt76->state))
> +                       ieee80211_remain_on_channel_expired(phy->mt76->hw);
> +               return;
> +       }
> +
>         if (!test_and_clear_bit(MT76_STATE_ROC, &phy->mt76->state))
>                 return;
>
> @@ -500,14 +529,93 @@ static int mt7925_abort_roc(struct mt792x_phy *phy,
>         return err;
>  }
>
> +/* ROC rate limiting constants - exponential backoff to prevent MCU overload
> + * when upper layers trigger rapid reconnection cycles (e.g., MLO auth failures).
> + * Max backoff ~1.6s, resets after 10s of no timeouts.
> + */
> +#define MT7925_ROC_BACKOFF_BASE_MS     100
> +#define MT7925_ROC_BACKOFF_MAX_MS      1600
> +#define MT7925_ROC_TIMEOUT_RESET_MS    10000
> +#define MT7925_ROC_TIMEOUT_WARN_THRESH 5
> +
> +/* Check if ROC should be throttled due to recent timeouts.
> + * Returns delay in jiffies if throttling, 0 if OK to proceed.
> + */
> +static unsigned long mt7925_roc_throttle_check(struct mt792x_phy *phy)
> +{
> +       unsigned long now = jiffies;
> +
> +       /* Reset timeout counter if it's been a while since last timeout */
> +       if (phy->roc_timeout_count &&
> +           time_after(now, phy->roc_last_timeout +
> +                      msecs_to_jiffies(MT7925_ROC_TIMEOUT_RESET_MS))) {
> +               phy->roc_timeout_count = 0;
> +               phy->roc_backoff_until = 0;
> +       }
> +
> +       /* Check if we're still in backoff period */
> +       if (phy->roc_backoff_until && time_before(now, phy->roc_backoff_until))
> +               return phy->roc_backoff_until - now;
> +
> +       return 0;
> +}
> +
> +/* Record ROC timeout and calculate backoff period */
> +static void mt7925_roc_record_timeout(struct mt792x_phy *phy)
> +{
> +       unsigned int backoff_ms;
> +
> +       phy->roc_last_timeout = jiffies;
> +       phy->roc_timeout_count++;
> +
> +       /* Exponential backoff: 100ms, 200ms, 400ms, 800ms, 1600ms (capped) */
> +       backoff_ms = MT7925_ROC_BACKOFF_BASE_MS <<
> +                    min_t(u8, phy->roc_timeout_count - 1, 4);
> +       if (backoff_ms > MT7925_ROC_BACKOFF_MAX_MS)
> +               backoff_ms = MT7925_ROC_BACKOFF_MAX_MS;
> +
> +       phy->roc_backoff_until = jiffies + msecs_to_jiffies(backoff_ms);
> +
> +       /* Warn if we're seeing repeated timeouts - likely upper layer issue */
> +       if (phy->roc_timeout_count == MT7925_ROC_TIMEOUT_WARN_THRESH)
> +               dev_warn(phy->dev->mt76.dev,
> +                        "mt7925: %u consecutive ROC timeouts, possible mac80211/wpa_supplicant issue (MLO key race?)\n",
> +                        phy->roc_timeout_count);
> +}
> +
> +/* Clear timeout tracking on successful ROC */
> +static void mt7925_roc_clear_timeout(struct mt792x_phy *phy)
> +{
> +       phy->roc_timeout_count = 0;
> +       phy->roc_backoff_until = 0;
> +}
> +
>  static int mt7925_set_roc(struct mt792x_phy *phy,
>                           struct mt792x_bss_conf *mconf,
>                           struct ieee80211_channel *chan,
>                           int duration,
>                           enum mt7925_roc_req type)
>  {
> +       unsigned long throttle;
>         int err;
>
> +       /* Check rate limiting - if in backoff period, wait or return busy */
> +       throttle = mt7925_roc_throttle_check(phy);
> +       if (throttle) {
> +               /* For short backoffs, wait; for longer ones, return busy */
> +               if (throttle < msecs_to_jiffies(200)) {
> +                       msleep(jiffies_to_msecs(throttle));
> +               } else {
> +                       dev_dbg(phy->dev->mt76.dev,
> +                               "mt7925: ROC throttled, %lu ms remaining\n",
> +                               jiffies_to_msecs(throttle));
> +                       return -EBUSY;
> +               }
> +       }
> +
> +       /* Clear stale abort flag from previous ROC */
> +       clear_bit(MT76_STATE_ROC_ABORT, &phy->mt76->state);
> +
>         if (test_and_set_bit(MT76_STATE_ROC, &phy->mt76->state))
>                 return -EBUSY;
>
> @@ -523,7 +631,11 @@ static int mt7925_set_roc(struct mt792x_phy *phy,
>         if (!wait_event_timeout(phy->roc_wait, phy->roc_grant, 4 * HZ)) {
>                 mt7925_mcu_abort_roc(phy, mconf, phy->roc_token_id);
>                 clear_bit(MT76_STATE_ROC, &phy->mt76->state);
> +               mt7925_roc_record_timeout(phy);
>                 err = -ETIMEDOUT;
> +       } else {
> +               /* Successful ROC - reset timeout tracking */
> +               mt7925_roc_clear_timeout(phy);
>         }
>
>  out:
> @@ -534,8 +646,27 @@ static int mt7925_set_mlo_roc(struct mt792x_phy *phy,
>                               struct mt792x_bss_conf *mconf,
>                               u16 sel_links)
>  {
> +       unsigned long throttle;
>         int err;
>
> +       /* Check rate limiting - MLO ROC is especially prone to rapid-fire
> +        * during reconnection cycles after MLO authentication failures.
> +        */
> +       throttle = mt7925_roc_throttle_check(phy);
> +       if (throttle) {
> +               if (throttle < msecs_to_jiffies(200)) {
> +                       msleep(jiffies_to_msecs(throttle));
> +               } else {
> +                       dev_dbg(phy->dev->mt76.dev,
> +                               "mt7925: MLO ROC throttled, %lu ms remaining\n",
> +                               jiffies_to_msecs(throttle));
> +                       return -EBUSY;
> +               }
> +       }
> +
> +       /* Clear stale abort flag from previous ROC */
> +       clear_bit(MT76_STATE_ROC_ABORT, &phy->mt76->state);
> +
>         if (WARN_ON_ONCE(test_and_set_bit(MT76_STATE_ROC, &phy->mt76->state)))
>                 return -EBUSY;
>
> @@ -550,7 +681,10 @@ static int mt7925_set_mlo_roc(struct mt792x_phy *phy,
>         if (!wait_event_timeout(phy->roc_wait, phy->roc_grant, 4 * HZ)) {
>                 mt7925_mcu_abort_roc(phy, mconf, phy->roc_token_id);
>                 clear_bit(MT76_STATE_ROC, &phy->mt76->state);
> +               mt7925_roc_record_timeout(phy);
>                 err = -ETIMEDOUT;
> +       } else {
> +               mt7925_roc_clear_timeout(phy);
>         }
>
>  out:
> @@ -567,6 +701,7 @@ static int mt7925_remain_on_channel(struct ieee80211_hw *hw,
>         struct mt792x_phy *phy = mt792x_hw_phy(hw);
>         int err;
>
> +       cancel_work_sync(&phy->roc_work);
>         mt792x_mutex_acquire(phy->dev);
>         err = mt7925_set_roc(phy, &mvif->bss_conf,
>                              chan, duration, MT7925_ROC_REQ_ROC);
> @@ -874,14 +1009,14 @@ static int mt7925_mac_link_sta_add(struct mt76_dev *mdev,
>         if (!mlink)
>                 return -EINVAL;
>
> -       idx = mt76_wcid_alloc(dev->mt76.wcid_mask, MT792x_WTBL_STA - 1);
> -       if (idx < 0)
> -               return -ENOSPC;
> -
>         mconf = mt792x_vif_to_link(mvif, link_id);
>         if (!mconf)
>                 return -EINVAL;
>
> +       idx = mt76_wcid_alloc(dev->mt76.wcid_mask, MT792x_WTBL_STA - 1);
> +       if (idx < 0)
> +               return -ENOSPC;
> +
>         mt76_wcid_init(&mlink->wcid, 0);
>         mlink->wcid.sta = 1;
>         mlink->wcid.idx = idx;
> @@ -901,14 +1036,16 @@ static int mt7925_mac_link_sta_add(struct mt76_dev *mdev,
>
>         ret = mt76_connac_pm_wake(&dev->mphy, &dev->pm);
>         if (ret)
> -               return ret;
> +               goto err_wcid;
>
>         mt7925_mac_wtbl_update(dev, idx,
>                                MT_WTBL_UPDATE_ADM_COUNT_CLEAR);
>
>         link_conf = mt792x_vif_to_bss_conf(vif, link_id);
> -       if (!link_conf)
> -               return -EINVAL;
> +       if (!link_conf) {
> +               ret = -EINVAL;
> +               goto err_wcid;
> +       }
>
>         /* should update bss info before STA add */
>         if (vif->type == NL80211_IFTYPE_STATION && !link_sta->sta->tdls) {
> @@ -920,7 +1057,7 @@ static int mt7925_mac_link_sta_add(struct mt76_dev *mdev,
>                         ret = mt7925_mcu_add_bss_info(&dev->phy, mconf->mt76.ctx,
>                                                       link_conf, link_sta, false);
>                 if (ret)
> -                       return ret;
> +                       goto err_wcid;
>         }
>
>         if (ieee80211_vif_is_mld(vif) &&
> @@ -928,28 +1065,34 @@ static int mt7925_mac_link_sta_add(struct mt76_dev *mdev,
>                 ret = mt7925_mcu_sta_update(dev, link_sta, vif, true,
>                                             MT76_STA_INFO_STATE_NONE);
>                 if (ret)
> -                       return ret;
> +                       goto err_wcid;
>         } else if (ieee80211_vif_is_mld(vif) &&
>                    link_sta != mlink->pri_link) {
>                 ret = mt7925_mcu_sta_update(dev, mlink->pri_link, vif,
>                                             true, MT76_STA_INFO_STATE_ASSOC);
>                 if (ret)
> -                       return ret;
> +                       goto err_wcid;
>
>                 ret = mt7925_mcu_sta_update(dev, link_sta, vif, true,
>                                             MT76_STA_INFO_STATE_ASSOC);
>                 if (ret)
> -                       return ret;
> +                       goto err_wcid;
>         } else {
>                 ret = mt7925_mcu_sta_update(dev, link_sta, vif, true,
>                                             MT76_STA_INFO_STATE_NONE);
>                 if (ret)
> -                       return ret;
> +                       goto err_wcid;
>         }
>
>         mt76_connac_power_save_sched(&dev->mphy, &dev->pm);
>
>         return 0;
> +
> +err_wcid:
> +       rcu_assign_pointer(dev->mt76.wcid[idx], NULL);
> +       mt76_wcid_mask_clear(dev->mt76.wcid_mask, idx);
> +       mt76_connac_power_save_sched(&dev->mphy, &dev->pm);
> +       return ret;
>  }
>
>  static int
> @@ -1135,7 +1278,8 @@ static void mt7925_mac_link_sta_remove(struct mt76_dev *mdev,
>         if (!mlink)
>                 return;
>
> -       mt7925_roc_abort_sync(dev);
> +       /* Async abort - caller already holds mutex */
> +       mt7925_roc_abort_async(dev);
>
>         mt76_connac_free_pending_tx_skbs(&dev->pm, &mlink->wcid);
>         mt76_connac_pm_wake(&dev->mphy, &dev->pm);
> @@ -1530,6 +1674,8 @@ static int mt7925_suspend(struct ieee80211_hw *hw,
>         cancel_delayed_work_sync(&dev->pm.ps_work);
>         mt76_connac_free_pending_tx_skbs(&dev->pm, NULL);
>
> +       /* Cancel ROC before quiescing starts */
> +       mt7925_roc_abort_sync(dev);
>         mt792x_mutex_acquire(dev);
>
>         clear_bit(MT76_STATE_RUNNING, &phy->mt76->state);
> @@ -1876,6 +2022,8 @@ static void mt7925_mgd_prepare_tx(struct ieee80211_hw *hw,
>         u16 duration = info->duration ? info->duration :
>                        jiffies_to_msecs(HZ);
>
> +       cancel_work_sync(&mvif->phy->roc_work);
> +
>         mt792x_mutex_acquire(dev);
>         mt7925_set_roc(mvif->phy, &mvif->bss_conf,
>                        mvif->bss_conf.mt76.ctx->def.chan, duration,
> @@ -2033,6 +2181,7 @@ mt7925_change_vif_links(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
>         if (old_links == new_links)
>                 return 0;
>
> +       cancel_work_sync(&phy->roc_work);
>         mt792x_mutex_acquire(dev);
>
>         for_each_set_bit(link_id, &rem, IEEE80211_MLD_MAX_NUM_LINKS) {
> diff --git a/drivers/net/wireless/mediatek/mt76/mt792x.h b/drivers/net/wireless/mediatek/mt76/mt792x.h
> index 8388638ed550..d9c1ea709390 100644
> --- a/drivers/net/wireless/mediatek/mt76/mt792x.h
> +++ b/drivers/net/wireless/mediatek/mt76/mt792x.h
> @@ -186,6 +186,13 @@ struct mt792x_phy {
>         wait_queue_head_t roc_wait;
>         u8 roc_token_id;
>         bool roc_grant;
> +
> +       /* ROC rate limiting to prevent MCU overload during rapid reconnection
> +        * cycles (e.g., MLO authentication failures causing repeated ROC).
> +        */
> +       u8 roc_timeout_count;           /* consecutive ROC timeouts */
> +       unsigned long roc_last_timeout; /* jiffies of last timeout */
> +       unsigned long roc_backoff_until;/* don't issue ROC until this time */
>  };
>
>  struct mt792x_irq_map {
> --
> 2.52.0
>
[PATCH v6 00/13] wifi: mt76: stability fixes for deadlocks, NULL derefs, and race conditions
Posted by Zac 2 weeks, 5 days ago
From: Zac Bowling <zac@zacbowling.com>

TLDR: This series addresses stability issues in both the MT7921 and MT7925 
WiFi drivers that cause kernel panics, deadlocks, and system hangs 
on various systems using these drivers.

This v6 series is rebased on Sean Wang's upstream deadlock fix already sent
which is now included as patch 01/13. The remaining 12 patches are my stability
fixes.

Changes since v5:
- Rebased on Sean Wang's fix for mt7925_roc_abort_sync deadlock (now patch 1)
  and removed my work around for the same issue as Sean's fix is better.
- Fixed format string warning in patch 12: %lu -> %u for jiffies_to_msecs()
  return type (caught by kernel test robot)
- Added patch 13: fix double wcid initialization race condition - removes
  duplicate mt76_wcid_init() call that occurred after rcu_assign_pointer(),
  which could cause list corruption, memory leaks, and race conditions
  (this is a pre-existing bug in upstream, not introduced by this series)

Zac Bowling (12):
  wifi: mt76: fix list corruption in mt76_wcid_cleanup
  wifi: mt76: mt792x: fix NULL pointer and firmware reload issues
  wifi: mt76: mt7921: add mutex protection in critical paths
  wifi: mt76: mt7921: fix deadlock in sta removal and suspend ROC abort
  wifi: mt76: mt7925: add comprehensive NULL pointer protection for MLO
  wifi: mt76: mt7925: add mutex protection in critical paths
  wifi: mt76: mt7925: add MCU command error handling
  wifi: mt76: mt7925: add lockdep assertions for mutex verification
  wifi: mt76: mt7925: fix MLO roaming and ROC setup issues
  wifi: mt76: mt7925: fix BA session teardown during beacon loss
  wifi: mt76: mt7925: fix ROC deadlocks and race conditions
  wifi: mt76: mt7925: fix double wcid initialization race condition

Sean Wang (1):
  wifi: mt76: mt7925: fix potential deadlock in mt7925_roc_abort_sync

 drivers/net/wireless/mediatek/mt76/mac80211.c    |  10 +
 drivers/net/wireless/mediatek/mt76/mt76.h        |   1 +
 drivers/net/wireless/mediatek/mt76/mt7921/mac.c  |   2 +
 drivers/net/wireless/mediatek/mt76/mt7921/main.c |  28 ++-
 drivers/net/wireless/mediatek/mt76/mt7925/mac.c  |   8 +
 drivers/net/wireless/mediatek/mt76/mt7925/main.c | 303 ++++++++++++++++++++---
 drivers/net/wireless/mediatek/mt76/mt7925/mcu.c  |  48 +++-
 drivers/net/wireless/mediatek/mt76/mt7925/pci.c  |   2 +
 drivers/net/wireless/mediatek/mt76/mt792x.h      |   7 +
 drivers/net/wireless/mediatek/mt76/mt792x_core.c |  27 +-
 10 files changed, 390 insertions(+), 46 deletions(-)

--
2.52.0
Re: [PATCH v6 00/13] wifi: mt76: stability fixes for deadlocks, NULL derefs, and race conditions
Posted by Felix Fietkau 1 week, 6 days ago
On 20.01.26 21:10, Zac wrote:
> From: Zac Bowling <zac@zacbowling.com>
> 
> TLDR: This series addresses stability issues in both the MT7921 and MT7925
> WiFi drivers that cause kernel panics, deadlocks, and system hangs
> on various systems using these drivers.
> 
> This v6 series is rebased on Sean Wang's upstream deadlock fix already sent
> which is now included as patch 01/13. The remaining 12 patches are my stability
> fixes.

When you send v7, please include the "v7" in the subject for all 
patches, instead of just the cover letter. Working through your patches 
in patchwork is getting quite confusing...

- Felix
[PATCH v7 0/6] wifi: mt76: mt7925: MLO stability fixes
Posted by Zac 1 week, 4 days ago
From: Zac Bowling <zac@zacbowling.com>

This patch series addresses several stability issues in the mt7925 driver,
particularly around Multi-Link Operation (MLO) scenarios. These fixes address
kernel panics, deadlocks, and race conditions reported by users on systems
like Framework laptops with MT7925 WiFi adapters.

Changes since v6:
- Consolidated from 12 patches to 6 focused patches
- Removed patches that have been merged or superseded upstream
- Improved error handling in AMPDU actions
- Added lockdep assertions for better debugging

The series addresses:
1. Double wcid initialization race condition during station add
2. NULL pointer dereferences during MLO state transitions
3. Missing mutex protection in critical paths
4. MCU command error handling in AMPDU BA session management
5. Lockdep assertions for mutex verification
6. MLO ROC setup error handling

Tested on:
- Framework Laptop 16 with MT7925 (AMD variant)
- Kernel 6.18.x and nbd168/wireless mt76 branch
- Various MLO and non-MLO AP configurations

Zac Bowling (6):
  wifi: mt76: mt7925: fix double wcid initialization race condition
  wifi: mt76: mt7925: add NULL pointer protection for MLO operations
  wifi: mt76: mt7925: add mutex protection in critical paths
  wifi: mt76: mt7925: add MCU command error handling in ampdu_action
  wifi: mt76: mt7925: add lockdep assertions for mutex verification
  wifi: mt76: mt7925: fix MLO ROC setup error handling

 drivers/net/wireless/mediatek/mt76/mt7925/mac.c  |  3 ++
 drivers/net/wireless/mediatek/mt76/mt7925/main.c | 65 +++++++++++++++++++-----
 drivers/net/wireless/mediatek/mt76/mt7925/mcu.c  | 24 +++++++--
 3 files changed, 75 insertions(+), 17 deletions(-)

--
2.52.0
[PATCH v7 1/6] wifi: mt76: mt7925: fix double wcid initialization race condition
Posted by Zac 1 week, 4 days ago
Remove duplicate mt76_wcid_init() call in mt7925_mac_link_sta_add that
occurs after the wcid is already published via rcu_assign_pointer().

The wcid is correctly initialized at line 873 after allocation.
However, a second mt76_wcid_init() call at line 885 reinitializes
the wcid after it has been published to RCU readers, which can cause:

 - List head corruption (tx_list, poll_list) if concurrent code is
   already using the wcid
 - Memory leaks from reinitializing the pktid IDR
 - Race conditions where readers see partially initialized state

Fixes: c948b5da6bbe ("wifi: mt76: mt7925: add Mediatek Wi-Fi7 driver for mt7925 device")
Signed-off-by: Zac Bowling <zac@zacbowling.com>
---
 drivers/net/wireless/mediatek/mt76/mt7925/main.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/net/wireless/mediatek/mt76/mt7925/main.c b/drivers/net/wireless/mediatek/mt76/mt7925/main.c
index afcc0fa4aa35..fad3b1505f67 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7925/main.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7925/main.c
@@ -882,7 +882,6 @@ static int mt7925_mac_link_sta_add(struct mt76_dev *mdev,
 	wcid = &mlink->wcid;
 	ewma_signal_init(&wcid->rssi);
 	rcu_assign_pointer(dev->mt76.wcid[wcid->idx], wcid);
-	mt76_wcid_init(wcid, 0);
 	ewma_avg_signal_init(&mlink->avg_ack_signal);
 	memset(mlink->airtime_ac, 0,
 	       sizeof(msta->deflink.airtime_ac));
-- 
2.52.0
[PATCH v7 2/6] wifi: mt76: mt7925: add NULL pointer protection for MLO state transitions
Posted by Zac 1 week, 4 days ago
Add NULL pointer checks for functions that return pointers to link-related
structures throughout the mt7925 driver. During MLO state transitions,
these functions can return NULL when link configuration is not synchronized.

Functions protected:
- mt792x_vif_to_bss_conf(): Returns link BSS configuration
- mt792x_vif_to_link(): Returns driver link state
- mt792x_sta_to_link(): Returns station link state

Key changes:

1. mt7925_set_link_key():
   - Check link_conf, mconf, mlink before use
   - During MLO roaming, allow key removal to succeed if link is already gone

2. mt7925_mac_link_sta_add():
   - Check mlink and mconf before WCID allocation
   - Check link_conf before BSS info update
   - Add proper WCID cleanup on error paths (err_wcid label)
   - Check MCU return values and propagate errors

3. mt7925_mac_link_sta_assoc():
   - Check mlink before use
   - Check link_conf and mconf before BSS info update

4. mt7925_mac_link_sta_remove():
   - Check mlink before use
   - Check link_conf and mconf before cleanup operations

Prevents crashes during:
- BSSID roaming transitions
- MLO setup and teardown
- Hardware reset operations

Fixes: c948b5da6bbe ("wifi: mt76: mt7925: add Mediatek Wi-Fi7 driver for mt7925 device")
Signed-off-by: Zac Bowling <zac@zacbowling.com>
---
 .../net/wireless/mediatek/mt76/mt7925/main.c  | 66 ++++++++++++++-----
 1 file changed, 51 insertions(+), 15 deletions(-)

diff --git a/drivers/net/wireless/mediatek/mt76/mt7925/main.c b/drivers/net/wireless/mediatek/mt76/mt7925/main.c
index fad3b1505f67..88ee90709b75 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7925/main.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7925/main.c
@@ -612,6 +612,17 @@ static int mt7925_set_link_key(struct ieee80211_hw *hw, enum set_key_cmd cmd,
 	link_sta = sta ? mt792x_sta_to_link_sta(vif, sta, link_id) : NULL;
 	mconf = mt792x_vif_to_link(mvif, link_id);
 	mlink = mt792x_sta_to_link(msta, link_id);
+
+	if (!link_conf || !mconf || !mlink) {
+		/* During MLO roaming, link state may be torn down before
+		 * mac80211 requests key removal. If removing a key and
+		 * the link is already gone, consider it successfully removed.
+		 */
+		if (cmd != SET_KEY)
+			return 0;
+		return -EINVAL;
+	}
+
 	wcid = &mlink->wcid;
 	wcid_keyidx = &wcid->hw_key_idx;
 
@@ -864,12 +875,17 @@ static int mt7925_mac_link_sta_add(struct mt76_dev *mdev,
 
 	msta = (struct mt792x_sta *)link_sta->sta->drv_priv;
 	mlink = mt792x_sta_to_link(msta, link_id);
+	if (!mlink)
+		return -EINVAL;
+
+	mconf = mt792x_vif_to_link(mvif, link_id);
+	if (!mconf)
+		return -EINVAL;
 
 	idx = mt76_wcid_alloc(dev->mt76.wcid_mask, MT792x_WTBL_STA - 1);
 	if (idx < 0)
 		return -ENOSPC;
 
-	mconf = mt792x_vif_to_link(mvif, link_id);
 	mt76_wcid_init(&mlink->wcid, 0);
 	mlink->wcid.sta = 1;
 	mlink->wcid.idx = idx;
@@ -888,21 +904,28 @@ static int mt7925_mac_link_sta_add(struct mt76_dev *mdev,
 
 	ret = mt76_connac_pm_wake(&dev->mphy, &dev->pm);
 	if (ret)
-		return ret;
+		goto err_wcid;
 
 	mt7925_mac_wtbl_update(dev, idx,
 			       MT_WTBL_UPDATE_ADM_COUNT_CLEAR);
 
 	link_conf = mt792x_vif_to_bss_conf(vif, link_id);
+	if (!link_conf) {
+		ret = -EINVAL;
+		goto err_wcid;
+	}
 
 	/* should update bss info before STA add */
 	if (vif->type == NL80211_IFTYPE_STATION && !link_sta->sta->tdls) {
 		if (ieee80211_vif_is_mld(vif))
-			mt7925_mcu_add_bss_info(&dev->phy, mconf->mt76.ctx,
-						link_conf, link_sta, link_sta != mlink->pri_link);
+			ret = mt7925_mcu_add_bss_info(&dev->phy, mconf->mt76.ctx,
+						      link_conf, link_sta,
+						      link_sta != mlink->pri_link);
 		else
-			mt7925_mcu_add_bss_info(&dev->phy, mconf->mt76.ctx,
-						link_conf, link_sta, false);
+			ret = mt7925_mcu_add_bss_info(&dev->phy, mconf->mt76.ctx,
+						      link_conf, link_sta, false);
+		if (ret)
+			goto err_wcid;
 	}
 
 	if (ieee80211_vif_is_mld(vif) &&
@@ -910,28 +933,34 @@ static int mt7925_mac_link_sta_add(struct mt76_dev *mdev,
 		ret = mt7925_mcu_sta_update(dev, link_sta, vif, true,
 					    MT76_STA_INFO_STATE_NONE);
 		if (ret)
-			return ret;
+			goto err_wcid;
 	} else if (ieee80211_vif_is_mld(vif) &&
 		   link_sta != mlink->pri_link) {
 		ret = mt7925_mcu_sta_update(dev, mlink->pri_link, vif,
 					    true, MT76_STA_INFO_STATE_ASSOC);
 		if (ret)
-			return ret;
+			goto err_wcid;
 
 		ret = mt7925_mcu_sta_update(dev, link_sta, vif, true,
 					    MT76_STA_INFO_STATE_ASSOC);
 		if (ret)
-			return ret;
+			goto err_wcid;
 	} else {
 		ret = mt7925_mcu_sta_update(dev, link_sta, vif, true,
 					    MT76_STA_INFO_STATE_NONE);
 		if (ret)
-			return ret;
+			goto err_wcid;
 	}
 
 	mt76_connac_power_save_sched(&dev->mphy, &dev->pm);
 
 	return 0;
+
+err_wcid:
+	rcu_assign_pointer(dev->mt76.wcid[idx], NULL);
+	mt76_wcid_mask_clear(dev->mt76.wcid_mask, idx);
+	mt76_connac_power_save_sched(&dev->mphy, &dev->pm);
+	return ret;
 }
 
 static int
@@ -1039,6 +1068,8 @@ static void mt7925_mac_link_sta_assoc(struct mt76_dev *mdev,
 
 	msta = (struct mt792x_sta *)link_sta->sta->drv_priv;
 	mlink = mt792x_sta_to_link(msta, link_sta->link_id);
+	if (!mlink)
+		return;
 
 	mt792x_mutex_acquire(dev);
 
@@ -1048,12 +1079,13 @@ static void mt7925_mac_link_sta_assoc(struct mt76_dev *mdev,
 		link_conf = mt792x_vif_to_bss_conf(vif, vif->bss_conf.link_id);
 	}
 
-	if (vif->type == NL80211_IFTYPE_STATION && !link_sta->sta->tdls) {
+	if (link_conf && vif->type == NL80211_IFTYPE_STATION && !link_sta->sta->tdls) {
 		struct mt792x_bss_conf *mconf;
 
 		mconf = mt792x_link_conf_to_mconf(link_conf);
-		mt7925_mcu_add_bss_info(&dev->phy, mconf->mt76.ctx,
-					link_conf, link_sta, true);
+		if (mconf)
+			mt7925_mcu_add_bss_info(&dev->phy, mconf->mt76.ctx,
+						link_conf, link_sta, true);
 	}
 
 	ewma_avg_signal_init(&mlink->avg_ack_signal);
@@ -1100,6 +1132,8 @@ static void mt7925_mac_link_sta_remove(struct mt76_dev *mdev,
 
 	msta = (struct mt792x_sta *)link_sta->sta->drv_priv;
 	mlink = mt792x_sta_to_link(msta, link_id);
+	if (!mlink)
+		return;
 
 	mt7925_roc_abort_sync(dev);
 
@@ -1113,10 +1147,12 @@ static void mt7925_mac_link_sta_remove(struct mt76_dev *mdev,
 
 	link_conf = mt792x_vif_to_bss_conf(vif, link_id);
 
-	if (vif->type == NL80211_IFTYPE_STATION && !link_sta->sta->tdls) {
+	if (link_conf && vif->type == NL80211_IFTYPE_STATION && !link_sta->sta->tdls) {
 		struct mt792x_bss_conf *mconf;
 
 		mconf = mt792x_link_conf_to_mconf(link_conf);
+		if (!mconf)
+			goto out;
 
 		if (ieee80211_vif_is_mld(vif))
 			mt792x_mac_link_bss_remove(dev, mconf, mlink);
@@ -1124,7 +1160,7 @@ static void mt7925_mac_link_sta_remove(struct mt76_dev *mdev,
 			mt7925_mcu_add_bss_info(&dev->phy, mconf->mt76.ctx, link_conf,
 						link_sta, false);
 	}
-
+out:
 	spin_lock_bh(&mdev->sta_poll_lock);
 	if (!list_empty(&mlink->wcid.poll_list))
 		list_del_init(&mlink->wcid.poll_list);
-- 
2.52.0
[PATCH 2/6] wifi: mt76: mt7925: add NULL pointer protection for MLO state transitions
Posted by Zac 1 week, 4 days ago
Add NULL pointer checks for functions that return pointers to link-related
structures throughout the mt7925 driver. During MLO state transitions,
these functions can return NULL when link configuration is not synchronized.

Functions protected:
- mt792x_vif_to_bss_conf(): Returns link BSS configuration
- mt792x_vif_to_link(): Returns driver link state
- mt792x_sta_to_link(): Returns station link state

Key changes:

1. mt7925_set_link_key():
   - Check link_conf, mconf, mlink before use
   - During MLO roaming, allow key removal to succeed if link is already gone

2. mt7925_mac_link_sta_add():
   - Check mlink and mconf before WCID allocation
   - Check link_conf before BSS info update
   - Add proper WCID cleanup on error paths (err_wcid label)
   - Check MCU return values and propagate errors

3. mt7925_mac_link_sta_assoc():
   - Check mlink before use
   - Check link_conf and mconf before BSS info update

4. mt7925_mac_link_sta_remove():
   - Check mlink before use
   - Check link_conf and mconf before cleanup operations

Prevents crashes during:
- BSSID roaming transitions
- MLO setup and teardown
- Hardware reset operations

Fixes: c948b5da6bbe ("wifi: mt76: mt7925: add Mediatek Wi-Fi7 driver for mt7925 device")
Signed-off-by: Zac Bowling <zac@zacbowling.com>
---
 .../net/wireless/mediatek/mt76/mt7925/main.c  | 67 ++++++++++++++-----
 1 file changed, 52 insertions(+), 15 deletions(-)

diff --git a/drivers/net/wireless/mediatek/mt76/mt7925/main.c b/drivers/net/wireless/mediatek/mt76/mt7925/main.c
index fad3b1505f67..1400633712b7 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7925/main.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7925/main.c
@@ -612,6 +612,17 @@ static int mt7925_set_link_key(struct ieee80211_hw *hw, enum set_key_cmd cmd,
 	link_sta = sta ? mt792x_sta_to_link_sta(vif, sta, link_id) : NULL;
 	mconf = mt792x_vif_to_link(mvif, link_id);
 	mlink = mt792x_sta_to_link(msta, link_id);
+
+	if (!link_conf || !mconf || !mlink) {
+		/* During MLO roaming, link state may be torn down before
+		 * mac80211 requests key removal. If removing a key and
+		 * the link is already gone, consider it successfully removed.
+		 */
+		if (cmd != SET_KEY)
+			return 0;
+		return -EINVAL;
+	}
+
 	wcid = &mlink->wcid;
 	wcid_keyidx = &wcid->hw_key_idx;
 
@@ -864,12 +875,17 @@ static int mt7925_mac_link_sta_add(struct mt76_dev *mdev,
 
 	msta = (struct mt792x_sta *)link_sta->sta->drv_priv;
 	mlink = mt792x_sta_to_link(msta, link_id);
+	if (!mlink)
+		return -EINVAL;
+
+	mconf = mt792x_vif_to_link(mvif, link_id);
+	if (!mconf)
+		return -EINVAL;
 
 	idx = mt76_wcid_alloc(dev->mt76.wcid_mask, MT792x_WTBL_STA - 1);
 	if (idx < 0)
 		return -ENOSPC;
 
-	mconf = mt792x_vif_to_link(mvif, link_id);
 	mt76_wcid_init(&mlink->wcid, 0);
 	mlink->wcid.sta = 1;
 	mlink->wcid.idx = idx;
@@ -888,21 +904,28 @@ static int mt7925_mac_link_sta_add(struct mt76_dev *mdev,
 
 	ret = mt76_connac_pm_wake(&dev->mphy, &dev->pm);
 	if (ret)
-		return ret;
+		goto err_wcid;
 
 	mt7925_mac_wtbl_update(dev, idx,
 			       MT_WTBL_UPDATE_ADM_COUNT_CLEAR);
 
 	link_conf = mt792x_vif_to_bss_conf(vif, link_id);
+	if (!link_conf) {
+		ret = -EINVAL;
+		goto err_wcid;
+	}
 
 	/* should update bss info before STA add */
 	if (vif->type == NL80211_IFTYPE_STATION && !link_sta->sta->tdls) {
 		if (ieee80211_vif_is_mld(vif))
-			mt7925_mcu_add_bss_info(&dev->phy, mconf->mt76.ctx,
-						link_conf, link_sta, link_sta != mlink->pri_link);
+			ret = mt7925_mcu_add_bss_info(&dev->phy, mconf->mt76.ctx,
+						      link_conf, link_sta,
+						      link_sta != mlink->pri_link);
 		else
-			mt7925_mcu_add_bss_info(&dev->phy, mconf->mt76.ctx,
-						link_conf, link_sta, false);
+			ret = mt7925_mcu_add_bss_info(&dev->phy, mconf->mt76.ctx,
+						      link_conf, link_sta, false);
+		if (ret)
+			goto err_wcid;
 	}
 
 	if (ieee80211_vif_is_mld(vif) &&
@@ -910,28 +933,35 @@ static int mt7925_mac_link_sta_add(struct mt76_dev *mdev,
 		ret = mt7925_mcu_sta_update(dev, link_sta, vif, true,
 					    MT76_STA_INFO_STATE_NONE);
 		if (ret)
-			return ret;
+			goto err_wcid;
 	} else if (ieee80211_vif_is_mld(vif) &&
 		   link_sta != mlink->pri_link) {
 		ret = mt7925_mcu_sta_update(dev, mlink->pri_link, vif,
 					    true, MT76_STA_INFO_STATE_ASSOC);
 		if (ret)
-			return ret;
+			goto err_wcid;
 
 		ret = mt7925_mcu_sta_update(dev, link_sta, vif, true,
 					    MT76_STA_INFO_STATE_ASSOC);
 		if (ret)
-			return ret;
+			goto err_wcid;
 	} else {
 		ret = mt7925_mcu_sta_update(dev, link_sta, vif, true,
 					    MT76_STA_INFO_STATE_NONE);
 		if (ret)
-			return ret;
+			goto err_wcid;
 	}
 
 	mt76_connac_power_save_sched(&dev->mphy, &dev->pm);
 
 	return 0;
+
+err_wcid:
+	rcu_assign_pointer(dev->mt76.wcid[idx], NULL);
+	mt76_wcid_cleanup(&dev->mt76, wcid);
+	mt76_wcid_mask_clear(dev->mt76.wcid_mask, idx);
+	mt76_connac_power_save_sched(&dev->mphy, &dev->pm);
+	return ret;
 }
 
 static int
@@ -1039,6 +1069,8 @@ static void mt7925_mac_link_sta_assoc(struct mt76_dev *mdev,
 
 	msta = (struct mt792x_sta *)link_sta->sta->drv_priv;
 	mlink = mt792x_sta_to_link(msta, link_sta->link_id);
+	if (!mlink)
+		return;
 
 	mt792x_mutex_acquire(dev);
 
@@ -1048,12 +1080,13 @@ static void mt7925_mac_link_sta_assoc(struct mt76_dev *mdev,
 		link_conf = mt792x_vif_to_bss_conf(vif, vif->bss_conf.link_id);
 	}
 
-	if (vif->type == NL80211_IFTYPE_STATION && !link_sta->sta->tdls) {
+	if (link_conf && vif->type == NL80211_IFTYPE_STATION && !link_sta->sta->tdls) {
 		struct mt792x_bss_conf *mconf;
 
 		mconf = mt792x_link_conf_to_mconf(link_conf);
-		mt7925_mcu_add_bss_info(&dev->phy, mconf->mt76.ctx,
-					link_conf, link_sta, true);
+		if (mconf)
+			mt7925_mcu_add_bss_info(&dev->phy, mconf->mt76.ctx,
+						link_conf, link_sta, true);
 	}
 
 	ewma_avg_signal_init(&mlink->avg_ack_signal);
@@ -1100,6 +1133,8 @@ static void mt7925_mac_link_sta_remove(struct mt76_dev *mdev,
 
 	msta = (struct mt792x_sta *)link_sta->sta->drv_priv;
 	mlink = mt792x_sta_to_link(msta, link_id);
+	if (!mlink)
+		return;
 
 	mt7925_roc_abort_sync(dev);
 
@@ -1113,10 +1148,12 @@ static void mt7925_mac_link_sta_remove(struct mt76_dev *mdev,
 
 	link_conf = mt792x_vif_to_bss_conf(vif, link_id);
 
-	if (vif->type == NL80211_IFTYPE_STATION && !link_sta->sta->tdls) {
+	if (link_conf && vif->type == NL80211_IFTYPE_STATION && !link_sta->sta->tdls) {
 		struct mt792x_bss_conf *mconf;
 
 		mconf = mt792x_link_conf_to_mconf(link_conf);
+		if (!mconf)
+			goto out;
 
 		if (ieee80211_vif_is_mld(vif))
 			mt792x_mac_link_bss_remove(dev, mconf, mlink);
@@ -1124,7 +1161,7 @@ static void mt7925_mac_link_sta_remove(struct mt76_dev *mdev,
 			mt7925_mcu_add_bss_info(&dev->phy, mconf->mt76.ctx, link_conf,
 						link_sta, false);
 	}
-
+out:
 	spin_lock_bh(&mdev->sta_poll_lock);
 	if (!list_empty(&mlink->wcid.poll_list))
 		list_del_init(&mlink->wcid.poll_list);
-- 
2.52.0
[v7 PATCH 7/7] wifi: mt76: mt7925: add error logging for MLO ROC setup in set_links
Posted by Zac 1 week, 4 days ago
Add error logging in mt7925_mac_set_links() when mt7925_set_mlo_roc()
fails. Previously the error return was silently ignored since the
callback function is void.

The function now logs non-ENOLINK errors as warnings. ENOLINK errors
are expected during link transitions when the link configuration is
not yet ready, and mac80211 will retry the operation later.

This complements the error handling changes in mt7925_mcu_set_mlo_roc()
where WARN_ON_ONCE was replaced with proper -ENOLINK returns.

Fixes: c948b5da6bbe ("wifi: mt76: mt7925: add Mediatek Wi-Fi7 driver for mt7925 device")
Signed-off-by: Zac Bowling <zac@zacbowling.com>
---
 drivers/net/wireless/mediatek/mt76/mt7925/main.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/net/wireless/mediatek/mt76/mt7925/main.c b/drivers/net/wireless/mediatek/mt76/mt7925/main.c
index 0b088c448151..769c09e99d48 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7925/main.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7925/main.c
@@ -1048,11 +1048,16 @@ mt7925_mac_set_links(struct mt76_dev *mdev, struct ieee80211_vif *vif)
 
 	if (band == NL80211_BAND_2GHZ ||
 	    (band == NL80211_BAND_5GHZ && secondary_band == NL80211_BAND_6GHZ)) {
+		int ret;
+
 		mt7925_abort_roc(mvif->phy, &mvif->bss_conf);
 
 		mt792x_mutex_acquire(dev);
 
-		mt7925_set_mlo_roc(mvif->phy, &mvif->bss_conf, sel_links);
+		ret = mt7925_set_mlo_roc(mvif->phy, &mvif->bss_conf, sel_links);
+		if (ret && ret != -ENOLINK)
+			dev_warn(dev->mt76.dev,
+				 "MLO ROC setup failed in set_links: %d\n", ret);
 
 		mt792x_mutex_release(dev);
 	}
-- 
2.52.0
[PATCH v7 3/6] wifi: mt76: mt7925: add mutex protection in critical paths
Posted by Zac 1 week, 4 days ago
Add proper mutex protection for mt7925 driver operations that access
hardware state without proper synchronization. This fixes race conditions
that can cause system instability during power management and recovery.

Fixes:

1. mac.c: mt7925_mac_reset_work()
   - Wrap ieee80211_iterate_active_interfaces() with mt792x_mutex
   - The vif_connect_iter callback accesses hardware state

2. main.c: mt7925_set_runtime_pm()
   - Add mutex protection around ieee80211_iterate_active_interfaces()
   - Runtime PM can race with other operations

These protections ensure consistent hardware state access during power
management transitions and recovery operations.

Fixes: c948b5da6bbe ("wifi: mt76: mt7925: add Mediatek Wi-Fi7 driver for mt7925 device")
Signed-off-by: Zac Bowling <zac@zacbowling.com>
---
 drivers/net/wireless/mediatek/mt76/mt7925/mac.c  | 2 ++
 drivers/net/wireless/mediatek/mt76/mt7925/main.c | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/drivers/net/wireless/mediatek/mt76/mt7925/mac.c b/drivers/net/wireless/mediatek/mt76/mt7925/mac.c
index f1f0bc9eab04..88cf214ab452 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7925/mac.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7925/mac.c
@@ -1330,9 +1330,11 @@ void mt7925_mac_reset_work(struct work_struct *work)
 	dev->hw_full_reset = false;
 	pm->suspended = false;
 	ieee80211_wake_queues(hw);
+	mt792x_mutex_acquire(dev);
 	ieee80211_iterate_active_interfaces(hw,
 					    IEEE80211_IFACE_ITER_RESUME_ALL,
 					    mt7925_vif_connect_iter, NULL);
+	mt792x_mutex_release(dev);
 	mt76_connac_power_save_sched(&dev->mt76.phy, pm);
 
 	mt7925_regd_change(&dev->phy, "00");
diff --git a/drivers/net/wireless/mediatek/mt76/mt7925/main.c b/drivers/net/wireless/mediatek/mt76/mt7925/main.c
index 88ee90709b75..82de6f30ec27 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7925/main.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7925/main.c
@@ -770,9 +770,11 @@ void mt7925_set_runtime_pm(struct mt792x_dev *dev)
 	bool monitor = !!(hw->conf.flags & IEEE80211_CONF_MONITOR);
 
 	pm->enable = pm->enable_user && !monitor;
+	mt792x_mutex_acquire(dev);
 	ieee80211_iterate_active_interfaces(hw,
 					    IEEE80211_IFACE_ITER_RESUME_ALL,
 					    mt7925_pm_interface_iter, dev);
+	mt792x_mutex_release(dev);
 	pm->ds_enable = pm->ds_enable_user && !monitor;
 	mt7925_mcu_set_deep_sleep(dev, pm->ds_enable);
 }
-- 
2.52.0
[PATCH v7 4/6] wifi: mt76: mt7925: add MCU command error handling in ampdu_action
Posted by Zac 1 week, 4 days ago
Add proper error handling for MCU command return values that were
previously being ignored. Without proper error handling, failures in
MCU communication can leave the driver in an inconsistent state.

Changes:
- Check mt7925_mcu_uni_tx_ba() return value
- Check mt7925_mcu_uni_rx_ba() return value
- Return error to mac80211 on failure

Special case for IEEE80211_AMPDU_TX_STOP_CONT:
The ieee80211_stop_tx_ba_cb_irqsafe() callback is kept unconditional
because during beacon loss, the MCU command may fail but mac80211
MUST be notified to complete the BA session teardown. Otherwise the
state machine gets stuck and triggers WARN in
__ieee80211_stop_tx_ba_session(). This matches the behavior of mt7921
and mt7996 drivers.

Fixes: c948b5da6bbe ("wifi: mt76: mt7925: add Mediatek Wi-Fi7 driver for mt7925 device")
Signed-off-by: Zac Bowling <zac@zacbowling.com>
---
 drivers/net/wireless/mediatek/mt76/mt7925/main.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/drivers/net/wireless/mediatek/mt76/mt7925/main.c b/drivers/net/wireless/mediatek/mt76/mt7925/main.c
index 82de6f30ec27..8236edb1fb48 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7925/main.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7925/main.c
@@ -1300,22 +1300,22 @@ mt7925_ampdu_action(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 	case IEEE80211_AMPDU_RX_START:
 		mt76_rx_aggr_start(&dev->mt76, &msta->deflink.wcid, tid, ssn,
 				   params->buf_size);
-		mt7925_mcu_uni_rx_ba(dev, params, true);
+		ret = mt7925_mcu_uni_rx_ba(dev, params, true);
 		break;
 	case IEEE80211_AMPDU_RX_STOP:
 		mt76_rx_aggr_stop(&dev->mt76, &msta->deflink.wcid, tid);
-		mt7925_mcu_uni_rx_ba(dev, params, false);
+		ret = mt7925_mcu_uni_rx_ba(dev, params, false);
 		break;
 	case IEEE80211_AMPDU_TX_OPERATIONAL:
 		mtxq->aggr = true;
 		mtxq->send_bar = false;
-		mt7925_mcu_uni_tx_ba(dev, params, true);
+		ret = mt7925_mcu_uni_tx_ba(dev, params, true);
 		break;
 	case IEEE80211_AMPDU_TX_STOP_FLUSH:
 	case IEEE80211_AMPDU_TX_STOP_FLUSH_CONT:
 		mtxq->aggr = false;
 		clear_bit(tid, &msta->deflink.wcid.ampdu_state);
-		mt7925_mcu_uni_tx_ba(dev, params, false);
+		ret = mt7925_mcu_uni_tx_ba(dev, params, false);
 		break;
 	case IEEE80211_AMPDU_TX_START:
 		set_bit(tid, &msta->deflink.wcid.ampdu_state);
@@ -1324,6 +1324,11 @@ mt7925_ampdu_action(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 	case IEEE80211_AMPDU_TX_STOP_CONT:
 		mtxq->aggr = false;
 		clear_bit(tid, &msta->deflink.wcid.ampdu_state);
+		/* MCU command may fail during beacon loss, but callback must
+		 * always be called to complete the BA session teardown in
+		 * mac80211. Otherwise the state machine gets stuck and triggers
+		 * WARN in __ieee80211_stop_tx_ba_session().
+		 */
 		mt7925_mcu_uni_tx_ba(dev, params, false);
 		ieee80211_stop_tx_ba_cb_irqsafe(vif, sta->addr, tid);
 		break;
-- 
2.52.0
[PATCH v7 5/6] wifi: mt76: mt7925: add lockdep assertions for mutex verification
Posted by Zac 1 week, 4 days ago
Add lockdep_assert_held() calls to critical MCU functions to help catch
mutex violations during development and debugging. This follows the
pattern used in other mt76 drivers (mt7996, mt7915, mt7615).

Functions with new assertions:
- mt7925_mcu_add_bss_info(): Core BSS configuration MCU command
- mt7925_mcu_sta_update(): Station record update MCU command
- mt7925_mcu_uni_bss_ps(): Power save state MCU command

These functions modify firmware state and must be called with the
device mutex held to prevent race conditions. The lockdep assertions
will trigger warnings at runtime if code paths exist that call these
functions without proper mutex protection.

Also fixes a potential NULL pointer issue in mt7925_mcu_sta_update()
by initializing mlink to NULL and checking it before use.

Signed-off-by: Zac Bowling <zac@zacbowling.com>
---
 drivers/net/wireless/mediatek/mt76/mt7925/mcu.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/drivers/net/wireless/mediatek/mt76/mt7925/mcu.c b/drivers/net/wireless/mediatek/mt76/mt7925/mcu.c
index 1379bf6a26b5..2ed4af282120 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7925/mcu.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7925/mcu.c
@@ -1532,6 +1532,8 @@ int mt7925_mcu_uni_bss_ps(struct mt792x_dev *dev,
 		},
 	};
 
+	lockdep_assert_held(&dev->mt76.mutex);
+
 	if (link_conf->vif->type != NL80211_IFTYPE_STATION)
 		return -EOPNOTSUPP;
 
@@ -2032,13 +2034,15 @@ int mt7925_mcu_sta_update(struct mt792x_dev *dev,
 		.rcpi = to_rcpi(rssi),
 	};
 	struct mt792x_sta *msta;
-	struct mt792x_link_sta *mlink;
+	struct mt792x_link_sta *mlink = NULL;
+
+	lockdep_assert_held(&dev->mt76.mutex);
 
 	if (link_sta) {
 		msta = (struct mt792x_sta *)link_sta->sta->drv_priv;
 		mlink = mt792x_sta_to_link(msta, link_sta->link_id);
 	}
-	info.wcid = link_sta ? &mlink->wcid : &mvif->sta.deflink.wcid;
+	info.wcid = (link_sta && mlink) ? &mlink->wcid : &mvif->sta.deflink.wcid;
 	info.newly = state != MT76_STA_INFO_STATE_ASSOC;
 
 	return mt7925_mcu_sta_cmd(&dev->mphy, &info);
@@ -2840,6 +2844,8 @@ int mt7925_mcu_add_bss_info(struct mt792x_phy *phy,
 	struct mt792x_link_sta *mlink_bc;
 	struct sk_buff *skb;
 
+	lockdep_assert_held(&dev->mt76.mutex);
+
 	skb = __mt7925_mcu_alloc_bss_req(&dev->mt76, &mconf->mt76,
 					 MT7925_BSS_UPDATE_MAX_SIZE);
 	if (IS_ERR(skb))
-- 
2.52.0
[PATCH v7 6/6] wifi: mt76: mt7925: fix MLO ROC setup error handling
Posted by Zac 1 week, 4 days ago
Replace noisy WARN_ON_ONCE checks with silent returns in
mt7925_mcu_set_mlo_roc(). During MLO setup, links may not be fully
configured when ROC is requested. The WARN_ON_ONCE statements were
triggering unnecessary kernel warnings during normal operation.

Changes:
- Replace WARN_ON_ONCE(!link_conf) with silent if (!link_conf)
- Replace WARN_ON_ONCE(!links[i].chan) with silent check
- Add explicit mconf NULL check before use
- Use -ENOLINK error code to indicate link not ready
- Replace continue with return to fail fast on invalid links

The -ENOLINK error code properly indicates that the link is not yet
ready for ROC, allowing upper layers to retry later without generating
spurious kernel warnings.

Fixes: c948b5da6bbe ("wifi: mt76: mt7925: add Mediatek Wi-Fi7 driver for mt7925 device")
Signed-off-by: Zac Bowling <zac@zacbowling.com>
---
 .../net/wireless/mediatek/mt76/mt7925/mcu.c   | 20 +++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/drivers/net/wireless/mediatek/mt76/mt7925/mcu.c b/drivers/net/wireless/mediatek/mt76/mt7925/mcu.c
index 2ed4af282120..5ca2106b1ce0 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7925/mcu.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7925/mcu.c
@@ -1341,15 +1341,23 @@ int mt7925_mcu_set_mlo_roc(struct mt792x_phy *phy, struct mt792x_bss_conf *mconf
 	for (i = 0; i < ARRAY_SIZE(links); i++) {
 		links[i].id = i ? __ffs(~BIT(mconf->link_id) & sel_links) :
 				 mconf->link_id;
+
 		link_conf = mt792x_vif_to_bss_conf(vif, links[i].id);
-		if (WARN_ON_ONCE(!link_conf))
-			return -EPERM;
+		if (!link_conf)
+			return -ENOLINK;
 
 		links[i].chan = link_conf->chanreq.oper.chan;
-		if (WARN_ON_ONCE(!links[i].chan))
-			return -EPERM;
+		if (!links[i].chan)
+			/* Channel not configured yet - this can happen during
+			 * MLO AP setup when links are being added sequentially.
+			 * Return -ENOLINK to indicate link not ready.
+			 */
+			return -ENOLINK;
 
 		links[i].mconf = mt792x_vif_to_link(mvif, links[i].id);
+		if (!links[i].mconf)
+			return -ENOLINK;
+
 		links[i].tag = links[i].id == mconf->link_id ?
 			       UNI_ROC_ACQUIRE : UNI_ROC_SUB_LINK;
 
@@ -1364,8 +1372,8 @@ int mt7925_mcu_set_mlo_roc(struct mt792x_phy *phy, struct mt792x_bss_conf *mconf
 		type = MT7925_ROC_REQ_JOIN;
 
 	for (i = 0; i < ARRAY_SIZE(links) && i < hweight16(vif->active_links); i++) {
-		if (WARN_ON_ONCE(!links[i].mconf || !links[i].chan))
-			continue;
+		if (!links[i].mconf || !links[i].chan)
+			return -ENOLINK;
 
 		chan = links[i].chan;
 		center_ch = ieee80211_frequency_to_channel(chan->center_freq);
-- 
2.52.0
[PATCH 01/13] wifi: mt76: mt7925: fix potential deadlock in mt7925_roc_abort_sync
Posted by Zac 2 weeks, 5 days ago
From: Sean Wang <sean.wang@mediatek.com>

roc_abort_sync() can deadlock with roc_work(). roc_work() holds
dev->mt76.mutex, while cancel_work_sync() waits for roc_work()
to finish. If the caller already owns the same mutex, both
sides block and no progress is possible.

This deadlock can occur during station removal when
mt76_sta_state() -> mt76_sta_remove() ->
mt7925_mac_sta_remove_link() -> mt7925_mac_link_sta_remove() ->
mt7925_roc_abort_sync() invokes cancel_work_sync() while
roc_work() is still running and holding dev->mt76.mutex.

This avoids the mutex deadlock and preserves exactly-once
work ownership.

Fixes: 45064d19fd3a ("wifi: mt76: mt7925: fix a potential association failure upon resuming")
Co-developed-by: Quan Zhou <quan.zhou@mediatek.com>
Signed-off-by: Quan Zhou <quan.zhou@mediatek.com>
Signed-off-by: Sean Wang <sean.wang@mediatek.com>
---
 drivers/net/wireless/mediatek/mt76/mt7925/main.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/drivers/net/wireless/mediatek/mt76/mt7925/main.c b/drivers/net/wireless/mediatek/mt76/mt7925/main.c
index 2d358a96640c..05990455ee7d 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7925/main.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7925/main.c
@@ -457,12 +457,16 @@ void mt7925_roc_abort_sync(struct mt792x_dev *dev)
 {
 	struct mt792x_phy *phy = &dev->phy;
 
+	if (!test_and_clear_bit(MT76_STATE_ROC, &phy->mt76->state))
+		return;
+
 	timer_delete_sync(&phy->roc_timer);
-	cancel_work_sync(&phy->roc_work);
-	if (test_and_clear_bit(MT76_STATE_ROC, &phy->mt76->state))
-		ieee80211_iterate_interfaces(mt76_hw(dev),
-					     IEEE80211_IFACE_ITER_RESUME_ALL,
-					     mt7925_roc_iter, (void *)phy);
+
+	cancel_work(&phy->roc_work);
+
+	ieee80211_iterate_interfaces(mt76_hw(dev),
+				     IEEE80211_IFACE_ITER_RESUME_ALL,
+				     mt7925_roc_iter, (void *)phy);
 }
 EXPORT_SYMBOL_GPL(mt7925_roc_abort_sync);
 
-- 
2.52.0
[PATCH 02/13] wifi: mt76: fix list corruption in mt76_wcid_cleanup
Posted by Zac 2 weeks, 5 days ago
From: Zac Bowling <zac@zacbowling.com>

mt76_wcid_cleanup() was not removing wcid entries from sta_poll_list
before mt76_reset_device() reinitializes the master list. This leaves
stale pointers in wcid->poll_list, causing list corruption when
mt76_wcid_add_poll() later checks list_empty() and tries to add the
entry back.

The fix adds proper cleanup of poll_list in mt76_wcid_cleanup(),
matching how tx_list is already handled. This is similar to what
mt7996_mac_sta_deinit_link() already does correctly.

Fixes list corruption warnings like:
  list_add corruption. prev->next should be next (ffffffff...)

Signed-off-by: Zac Bowling <zac@zacbowling.com>
---
 drivers/net/wireless/mediatek/mt76/mac80211.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/drivers/net/wireless/mediatek/mt76/mac80211.c b/drivers/net/wireless/mediatek/mt76/mac80211.c
index 75772979f438..d0c522909e98 100644
--- a/drivers/net/wireless/mediatek/mt76/mac80211.c
+++ b/drivers/net/wireless/mediatek/mt76/mac80211.c
@@ -1716,6 +1716,16 @@ void mt76_wcid_cleanup(struct mt76_dev *dev, struct mt76_wcid *wcid)
 
 	idr_destroy(&wcid->pktid);
 
+	/* Remove from sta_poll_list to prevent list corruption after reset.
+	 * Without this, mt76_reset_device() reinitializes sta_poll_list but
+	 * leaves wcid->poll_list with stale pointers, causing list corruption
+	 * when mt76_wcid_add_poll() checks list_empty().
+	 */
+	spin_lock_bh(&dev->sta_poll_lock);
+	if (!list_empty(&wcid->poll_list))
+		list_del_init(&wcid->poll_list);
+	spin_unlock_bh(&dev->sta_poll_lock);
+
 	spin_lock_bh(&phy->tx_lock);
 
 	if (!list_empty(&wcid->tx_list))
-- 
2.52.0
[PATCH 03/13] wifi: mt76: mt792x: fix NULL pointer and firmware reload issues
Posted by Zac 2 weeks, 5 days ago
From: Zac Bowling <zac@zacbowling.com>

This patch combines two fixes for the shared mt792x code used by both
MT7921 and MT7925 drivers:

1. Fix NULL pointer dereference in TX path:

Add NULL pointer checks in mt792x_tx() to prevent kernel crashes when
transmitting packets during MLO link removal.

The function calls mt792x_sta_to_link() which can return NULL if the
link is being removed, but the return value was dereferenced without
checking. Similarly, the RCU-protected link_conf and link_sta pointers
were used without NULL validation.

This race can occur when:
- A packet is queued for transmission
- Concurrently, the link is being removed (mt7925_mac_link_sta_remove)
- mt792x_sta_to_link() returns NULL for the removed link
- Kernel crashes on wcid = &mlink->wcid dereference

Fix by checking mlink, conf, and link_sta before use, freeing the SKB
and returning early if any pointer is NULL.

2. Fix firmware reload failure after previous load crash:

If the firmware loading process crashes or is interrupted after
acquiring the patch semaphore but before releasing it, subsequent
firmware load attempts will fail with 'Failed to get patch semaphore'.

Apply the same fix from MT7915 (commit 79dd14f): release the patch
semaphore before starting firmware load and restart MCU firmware to
ensure clean state.

Fixes: c74df1c067f2 ("wifi: mt76: mt792x: introduce mt792x-lib module")
Fixes: 583204ae70f9 ("wifi: mt76: mt792x: move mt7921_load_firmware in mt792x-lib module")
Link: https://github.com/openwrt/mt76/commit/79dd14f2e8161b656341b6653261779199aedbe4
Signed-off-by: Zac Bowling <zac@zacbowling.com>
---
 .../net/wireless/mediatek/mt76/mt792x_core.c  | 27 +++++++++++++++++--
 1 file changed, 25 insertions(+), 2 deletions(-)

diff --git a/drivers/net/wireless/mediatek/mt76/mt792x_core.c b/drivers/net/wireless/mediatek/mt76/mt792x_core.c
index f2ed16feb6c1..05598202b488 100644
--- a/drivers/net/wireless/mediatek/mt76/mt792x_core.c
+++ b/drivers/net/wireless/mediatek/mt76/mt792x_core.c
@@ -95,6 +95,8 @@ void mt792x_tx(struct ieee80211_hw *hw, struct ieee80211_tx_control *control,
 				       IEEE80211_TX_CTRL_MLO_LINK);
 		sta = (struct mt792x_sta *)control->sta->drv_priv;
 		mlink = mt792x_sta_to_link(sta, link_id);
+		if (!mlink)
+			goto free_skb;
 		wcid = &mlink->wcid;
 	}
 
@@ -113,9 +115,12 @@ void mt792x_tx(struct ieee80211_hw *hw, struct ieee80211_tx_control *control,
 		link_id = wcid->link_id;
 		rcu_read_lock();
 		conf = rcu_dereference(vif->link_conf[link_id]);
-		memcpy(hdr->addr2, conf->addr, ETH_ALEN);
-
 		link_sta = rcu_dereference(control->sta->link[link_id]);
+		if (!conf || !link_sta) {
+			rcu_read_unlock();
+			goto free_skb;
+		}
+		memcpy(hdr->addr2, conf->addr, ETH_ALEN);
 		memcpy(hdr->addr1, link_sta->addr, ETH_ALEN);
 
 		if (vif->type == NL80211_IFTYPE_STATION)
@@ -136,6 +141,10 @@ void mt792x_tx(struct ieee80211_hw *hw, struct ieee80211_tx_control *control,
 	}
 
 	mt76_connac_pm_queue_skb(hw, &dev->pm, wcid, skb);
+	return;
+
+free_skb:
+	ieee80211_free_txskb(hw, skb);
 }
 EXPORT_SYMBOL_GPL(mt792x_tx);
 
@@ -927,6 +936,20 @@ int mt792x_load_firmware(struct mt792x_dev *dev)
 {
 	int ret;
 
+	/* Release semaphore if taken by previous failed load attempt.
+	 * This prevents "Failed to get patch semaphore" errors when
+	 * recovering from firmware crashes or suspend/resume failures.
+	 */
+	ret = mt76_connac_mcu_patch_sem_ctrl(&dev->mt76, false);
+	if (ret < 0)
+		dev_dbg(dev->mt76.dev, "Semaphore release returned %d (may be expected)\n", ret);
+
+	/* Always restart MCU to ensure clean state before loading firmware */
+	mt76_connac_mcu_restart(&dev->mt76);
+
+	/* Wait for MCU to be ready after restart */
+	msleep(100);
+
 	ret = mt76_connac2_load_patch(&dev->mt76, mt792x_patch_name(dev));
 	if (ret)
 		return ret;
-- 
2.52.0
[PATCH 04/13] wifi: mt76: mt7921: add mutex protection in critical paths
Posted by Zac 2 weeks, 5 days ago
From: Zac Bowling <zac@zacbowling.com>

Add proper mutex protection for mt7921 driver operations that access
hardware state without proper synchronization. This fixes multiple race
conditions that can cause system instability.

Fixes added:

1. mac.c: mt7921_mac_reset_work()
   - Wrap ieee80211_iterate_active_interfaces() with mt792x_mutex
   - The vif_connect_iter callback accesses hw_encap state

2. main.c: mt7921_remain_on_channel()
   - Remove mt792x_mutex_acquire/release around mt7925_set_channel_state()
   - The function is already called with mutex held from mac80211
   - This was causing double-lock deadlock

3. main.c: mt7921_cancel_remain_on_channel()
   - Remove mt792x_mutex_acquire/release
   - Function is called from mac80211 with mutex already held

4. pci.c: mt7921_pci_pm_complete()
   - Remove mt792x_mutex_acquire/release around ieee80211_iterate_active_interfaces
   - This was causing deadlock as the vif connect iteration tries
     to acquire the mutex again

5. usb.c: mt7921_usb_pm_complete()
   - Same fix as pci.c for USB driver path

These changes prevent both missing mutex protection and mutex deadlocks
in the mt7921 driver.

Fixes: 5c14a5f944b9 ("wifi: mt76: mt7921: introduce remain_on_channel support")
Signed-off-by: Zac Bowling <zac@zacbowling.com>
---
 drivers/net/wireless/mediatek/mt76/mt7921/mac.c  | 2 ++
 drivers/net/wireless/mediatek/mt76/mt7921/main.c | 9 +++++++++
 drivers/net/wireless/mediatek/mt76/mt7921/pci.c  | 2 ++
 drivers/net/wireless/mediatek/mt76/mt7921/sdio.c | 2 ++
 4 files changed, 15 insertions(+)

diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/mac.c b/drivers/net/wireless/mediatek/mt76/mt7921/mac.c
index 03b4960db73f..f5c882e45bbe 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7921/mac.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7921/mac.c
@@ -693,9 +693,11 @@ void mt7921_mac_reset_work(struct work_struct *work)
 	clear_bit(MT76_RESET, &dev->mphy.state);
 	pm->suspended = false;
 	ieee80211_wake_queues(hw);
+	mt792x_mutex_acquire(dev);
 	ieee80211_iterate_active_interfaces(hw,
 					    IEEE80211_IFACE_ITER_RESUME_ALL,
 					    mt7921_vif_connect_iter, NULL);
+	mt792x_mutex_release(dev);
 	mt76_connac_power_save_sched(&dev->mt76.phy, pm);
 }
 
diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/main.c b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
index 5fae9a6e273c..196fcb1e2e94 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7921/main.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
@@ -373,6 +373,11 @@ void mt7921_roc_abort_sync(struct mt792x_dev *dev)
 
 	timer_delete_sync(&phy->roc_timer);
 	cancel_work_sync(&phy->roc_work);
+	/* Note: caller must hold mutex if ieee80211_iterate_interfaces is
+	 * needed for ROC cleanup. Some call sites (like mt7921_mac_sta_remove)
+	 * already hold the mutex via mt76_sta_remove(). For suspend paths,
+	 * the mutex should be acquired before calling this function.
+	 */
 	if (test_and_clear_bit(MT76_STATE_ROC, &phy->mt76->state))
 		ieee80211_iterate_interfaces(mt76_hw(dev),
 					     IEEE80211_IFACE_ITER_RESUME_ALL,
@@ -619,6 +624,7 @@ void mt7921_set_runtime_pm(struct mt792x_dev *dev)
 	bool monitor = !!(hw->conf.flags & IEEE80211_CONF_MONITOR);
 
 	pm->enable = pm->enable_user && !monitor;
+	/* Note: caller (debugfs) must hold mutex before calling this function */
 	ieee80211_iterate_active_interfaces(hw,
 					    IEEE80211_IFACE_ITER_RESUME_ALL,
 					    mt7921_pm_interface_iter, dev);
@@ -765,6 +771,9 @@ mt7921_regd_set_6ghz_power_type(struct ieee80211_vif *vif, bool is_add)
 	struct mt792x_dev *dev = phy->dev;
 	u32 valid_vif_num = 0;
 
+	/* Note: caller (mt7921_mac_sta_add/remove via mt76_sta_add/remove)
+	 * already holds dev->mt76.mutex, so we must not acquire it here.
+	 */
 	ieee80211_iterate_active_interfaces(mt76_hw(dev),
 					    IEEE80211_IFACE_ITER_RESUME_ALL,
 					    mt7921_calc_vif_num, &valid_vif_num);
diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/pci.c b/drivers/net/wireless/mediatek/mt76/mt7921/pci.c
index ec9686183251..9f76b334b93d 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7921/pci.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7921/pci.c
@@ -426,7 +426,9 @@ static int mt7921_pci_suspend(struct device *device)
 	cancel_delayed_work_sync(&pm->ps_work);
 	cancel_work_sync(&pm->wake_work);
 
+	mt792x_mutex_acquire(dev);
 	mt7921_roc_abort_sync(dev);
+	mt792x_mutex_release(dev);
 
 	err = mt792x_mcu_drv_pmctrl(dev);
 	if (err < 0)
diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/sdio.c b/drivers/net/wireless/mediatek/mt76/mt7921/sdio.c
index 3421e53dc948..92ea2811816f 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7921/sdio.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7921/sdio.c
@@ -219,7 +219,9 @@ static int mt7921s_suspend(struct device *__dev)
 	cancel_delayed_work_sync(&pm->ps_work);
 	cancel_work_sync(&pm->wake_work);
 
+	mt792x_mutex_acquire(dev);
 	mt7921_roc_abort_sync(dev);
+	mt792x_mutex_release(dev);
 
 	err = mt792x_mcu_drv_pmctrl(dev);
 	if (err < 0)
-- 
2.52.0
Re: [PATCH 04/13] wifi: mt76: mt7921: add mutex protection in critical paths
Posted by Felix Fietkau 1 week, 6 days ago
On 20.01.26 21:10, Zac wrote:
> From: Zac Bowling <zac@zacbowling.com>
> 
> Add proper mutex protection for mt7921 driver operations that access
> hardware state without proper synchronization. This fixes multiple race
> conditions that can cause system instability.
> 
> Fixes added:
> 
> 1. mac.c: mt7921_mac_reset_work()
>     - Wrap ieee80211_iterate_active_interfaces() with mt792x_mutex
>     - The vif_connect_iter callback accesses hw_encap state
> 
> 2. main.c: mt7921_remain_on_channel()
>     - Remove mt792x_mutex_acquire/release around mt7925_set_channel_state()
>     - The function is already called with mutex held from mac80211
>     - This was causing double-lock deadlock
> 
> 3. main.c: mt7921_cancel_remain_on_channel()
>     - Remove mt792x_mutex_acquire/release
>     - Function is called from mac80211 with mutex already held
> 
> 4. pci.c: mt7921_pci_pm_complete()
>     - Remove mt792x_mutex_acquire/release around ieee80211_iterate_active_interfaces
>     - This was causing deadlock as the vif connect iteration tries
>       to acquire the mutex again
> 
> 5. usb.c: mt7921_usb_pm_complete()
>     - Same fix as pci.c for USB driver path
Changelog should be below "---" after the commit description, so it 
doesn't get picked up.

> These changes prevent both missing mutex protection and mutex deadlocks
> in the mt7921 driver.
> 
> Fixes: 5c14a5f944b9 ("wifi: mt76: mt7921: introduce remain_on_channel support")
> Signed-off-by: Zac Bowling <zac@zacbowling.com>

> diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/main.c b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
> index 5fae9a6e273c..196fcb1e2e94 100644
> --- a/drivers/net/wireless/mediatek/mt76/mt7921/main.c
> +++ b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
> @@ -373,6 +373,11 @@ void mt7921_roc_abort_sync(struct mt792x_dev *dev)
>   
>   	timer_delete_sync(&phy->roc_timer);
>   	cancel_work_sync(&phy->roc_work);
> +	/* Note: caller must hold mutex if ieee80211_iterate_interfaces is
> +	 * needed for ROC cleanup. Some call sites (like mt7921_mac_sta_remove)
> +	 * already hold the mutex via mt76_sta_remove(). For suspend paths,
> +	 * the mutex should be acquired before calling this function.
> +	 */
>   	if (test_and_clear_bit(MT76_STATE_ROC, &phy->mt76->state))
>   		ieee80211_iterate_interfaces(mt76_hw(dev),
>   					     IEEE80211_IFACE_ITER_RESUME_ALL,
> @@ -619,6 +624,7 @@ void mt7921_set_runtime_pm(struct mt792x_dev *dev)
>   	bool monitor = !!(hw->conf.flags & IEEE80211_CONF_MONITOR);
>   
>   	pm->enable = pm->enable_user && !monitor;
> +	/* Note: caller (debugfs) must hold mutex before calling this function */
>   	ieee80211_iterate_active_interfaces(hw,
>   					    IEEE80211_IFACE_ITER_RESUME_ALL,
>   					    mt7921_pm_interface_iter, dev);
> @@ -765,6 +771,9 @@ mt7921_regd_set_6ghz_power_type(struct ieee80211_vif *vif, bool is_add)
>   	struct mt792x_dev *dev = phy->dev;
>   	u32 valid_vif_num = 0;
>   
> +	/* Note: caller (mt7921_mac_sta_add/remove via mt76_sta_add/remove)
> +	 * already holds dev->mt76.mutex, so we must not acquire it here.
> +	 */
>   	ieee80211_iterate_active_interfaces(mt76_hw(dev),
>   					    IEEE80211_IFACE_ITER_RESUME_ALL,
>   					    mt7921_calc_vif_num, &valid_vif_num);

It looks like these comments should be replaced with 
lockdep_assert_held, so that these assumptions can be verified 
automatically instead of doing so by hand.


> diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/pci.c b/drivers/net/wireless/mediatek/mt76/mt7921/pci.c
> index ec9686183251..9f76b334b93d 100644
> --- a/drivers/net/wireless/mediatek/mt76/mt7921/pci.c
> +++ b/drivers/net/wireless/mediatek/mt76/mt7921/pci.c
> @@ -426,7 +426,9 @@ static int mt7921_pci_suspend(struct device *device)
>   	cancel_delayed_work_sync(&pm->ps_work);
>   	cancel_work_sync(&pm->wake_work);
>   
> +	mt792x_mutex_acquire(dev);
>   	mt7921_roc_abort_sync(dev);
> +	mt792x_mutex_release(dev);
The next patch is removing those...

- Felix
Re: [PATCH 04/13] wifi: mt76: mt7921: add mutex protection in critical paths
Posted by Zac Bowling 1 week, 4 days ago
You are right. I caught that too. After reordering, when I went from
the 24-something patch version of this series earlier this month to
this smaller 11-patch series, to make it easier to follow again, that
happened. It's already gone in my new v7 series. We lock somewhere
else up in the stack now.

I'm cleaning up this whole stack again, dropping the ROC_ABORT back
off hack, because I think actually the solution isn't at this layer at
all, but possibly in the mac80211 layer.

Zac Bowling

On Tue, Jan 27, 2026 at 2:59 AM Felix Fietkau <nbd@nbd.name> wrote:
>
> On 20.01.26 21:10, Zac wrote:
> > From: Zac Bowling <zac@zacbowling.com>
> >
> > Add proper mutex protection for mt7921 driver operations that access
> > hardware state without proper synchronization. This fixes multiple race
> > conditions that can cause system instability.
> >
> > Fixes added:
> >
> > 1. mac.c: mt7921_mac_reset_work()
> >     - Wrap ieee80211_iterate_active_interfaces() with mt792x_mutex
> >     - The vif_connect_iter callback accesses hw_encap state
> >
> > 2. main.c: mt7921_remain_on_channel()
> >     - Remove mt792x_mutex_acquire/release around mt7925_set_channel_state()
> >     - The function is already called with mutex held from mac80211
> >     - This was causing double-lock deadlock
> >
> > 3. main.c: mt7921_cancel_remain_on_channel()
> >     - Remove mt792x_mutex_acquire/release
> >     - Function is called from mac80211 with mutex already held
> >
> > 4. pci.c: mt7921_pci_pm_complete()
> >     - Remove mt792x_mutex_acquire/release around ieee80211_iterate_active_interfaces
> >     - This was causing deadlock as the vif connect iteration tries
> >       to acquire the mutex again
> >
> > 5. usb.c: mt7921_usb_pm_complete()
> >     - Same fix as pci.c for USB driver path
> Changelog should be below "---" after the commit description, so it
> doesn't get picked up.
>
> > These changes prevent both missing mutex protection and mutex deadlocks
> > in the mt7921 driver.
> >
> > Fixes: 5c14a5f944b9 ("wifi: mt76: mt7921: introduce remain_on_channel support")
> > Signed-off-by: Zac Bowling <zac@zacbowling.com>
>
> > diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/main.c b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
> > index 5fae9a6e273c..196fcb1e2e94 100644
> > --- a/drivers/net/wireless/mediatek/mt76/mt7921/main.c
> > +++ b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
> > @@ -373,6 +373,11 @@ void mt7921_roc_abort_sync(struct mt792x_dev *dev)
> >
> >       timer_delete_sync(&phy->roc_timer);
> >       cancel_work_sync(&phy->roc_work);
> > +     /* Note: caller must hold mutex if ieee80211_iterate_interfaces is
> > +      * needed for ROC cleanup. Some call sites (like mt7921_mac_sta_remove)
> > +      * already hold the mutex via mt76_sta_remove(). For suspend paths,
> > +      * the mutex should be acquired before calling this function.
> > +      */
> >       if (test_and_clear_bit(MT76_STATE_ROC, &phy->mt76->state))
> >               ieee80211_iterate_interfaces(mt76_hw(dev),
> >                                            IEEE80211_IFACE_ITER_RESUME_ALL,
> > @@ -619,6 +624,7 @@ void mt7921_set_runtime_pm(struct mt792x_dev *dev)
> >       bool monitor = !!(hw->conf.flags & IEEE80211_CONF_MONITOR);
> >
> >       pm->enable = pm->enable_user && !monitor;
> > +     /* Note: caller (debugfs) must hold mutex before calling this function */
> >       ieee80211_iterate_active_interfaces(hw,
> >                                           IEEE80211_IFACE_ITER_RESUME_ALL,
> >                                           mt7921_pm_interface_iter, dev);
> > @@ -765,6 +771,9 @@ mt7921_regd_set_6ghz_power_type(struct ieee80211_vif *vif, bool is_add)
> >       struct mt792x_dev *dev = phy->dev;
> >       u32 valid_vif_num = 0;
> >
> > +     /* Note: caller (mt7921_mac_sta_add/remove via mt76_sta_add/remove)
> > +      * already holds dev->mt76.mutex, so we must not acquire it here.
> > +      */
> >       ieee80211_iterate_active_interfaces(mt76_hw(dev),
> >                                           IEEE80211_IFACE_ITER_RESUME_ALL,
> >                                           mt7921_calc_vif_num, &valid_vif_num);
>
> It looks like these comments should be replaced with
> lockdep_assert_held, so that these assumptions can be verified
> automatically instead of doing so by hand.
>
>
> > diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/pci.c b/drivers/net/wireless/mediatek/mt76/mt7921/pci.c
> > index ec9686183251..9f76b334b93d 100644
> > --- a/drivers/net/wireless/mediatek/mt76/mt7921/pci.c
> > +++ b/drivers/net/wireless/mediatek/mt76/mt7921/pci.c
> > @@ -426,7 +426,9 @@ static int mt7921_pci_suspend(struct device *device)
> >       cancel_delayed_work_sync(&pm->ps_work);
> >       cancel_work_sync(&pm->wake_work);
> >
> > +     mt792x_mutex_acquire(dev);
> >       mt7921_roc_abort_sync(dev);
> > +     mt792x_mutex_release(dev);
> The next patch is removing those...
>
> - Felix
[PATCH 05/13] wifi: mt76: mt7921: fix deadlock in sta removal and suspend ROC abort
Posted by Zac 2 weeks, 5 days ago
From: Zac Bowling <zac@zacbowling.com>

Fix deadlock scenarios in mt7921 ROC (Remain On Channel) abort paths:

1. Suspend path deadlock (pci.c, sdio.c):
   - Previous fix (b74d48c46f) added mutex around mt7921_roc_abort_sync
   - But roc_work acquires mutex, so cancel_work_sync can deadlock
   - Fix: Remove mutex wrappers since mt7921_roc_abort_sync doesn't
     actually need them (it only calls timer_delete_sync, cancel_work_sync,
     and ieee80211_iterate_interfaces which handles its own locking)

2. sta_remove path deadlock:
   - mt7921_mac_sta_remove is called from mt76_sta_remove which holds mutex
   - Calling mt7921_roc_abort_sync → cancel_work_sync can deadlock if
     roc_work is waiting for the mutex
   - Fix: Add mt7921_roc_abort_async (matching mt7925 pattern) that sets
     abort flag and schedules work instead of blocking
   - Add abort flag checking in mt7921_roc_work to handle async abort

The fix mirrors the mt7925 implementation which already handles these
scenarios correctly.

Fixes: b74d48c46f ("wifi: mt76: mt7921: fix mutex handling in multiple paths")
Signed-off-by: Zac Bowling <zac@zacbowling.com>
---
 .../net/wireless/mediatek/mt76/mt7921/main.c  | 29 +++++++++++++++----
 .../net/wireless/mediatek/mt76/mt7921/pci.c   |  2 --
 .../net/wireless/mediatek/mt76/mt7921/sdio.c  |  2 --
 3 files changed, 23 insertions(+), 10 deletions(-)

diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/main.c b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
index 196fcb1e2e94..f3941a25fd6f 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7921/main.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
@@ -367,17 +367,24 @@ static void mt7921_roc_iter(void *priv, u8 *mac,
 	mt7921_mcu_abort_roc(phy, mvif, phy->roc_token_id);
 }
 
+/* Async ROC abort - safe to call while holding mutex.
+ * Sets abort flag and schedules roc_work for cleanup.
+ */
+static void mt7921_roc_abort_async(struct mt792x_dev *dev)
+{
+	struct mt792x_phy *phy = &dev->phy;
+
+	set_bit(MT76_STATE_ROC_ABORT, &phy->mt76->state);
+	timer_delete(&phy->roc_timer);
+	ieee80211_queue_work(phy->mt76->hw, &phy->roc_work);
+}
+
 void mt7921_roc_abort_sync(struct mt792x_dev *dev)
 {
 	struct mt792x_phy *phy = &dev->phy;
 
 	timer_delete_sync(&phy->roc_timer);
 	cancel_work_sync(&phy->roc_work);
-	/* Note: caller must hold mutex if ieee80211_iterate_interfaces is
-	 * needed for ROC cleanup. Some call sites (like mt7921_mac_sta_remove)
-	 * already hold the mutex via mt76_sta_remove(). For suspend paths,
-	 * the mutex should be acquired before calling this function.
-	 */
 	if (test_and_clear_bit(MT76_STATE_ROC, &phy->mt76->state))
 		ieee80211_iterate_interfaces(mt76_hw(dev),
 					     IEEE80211_IFACE_ITER_RESUME_ALL,
@@ -392,6 +399,15 @@ void mt7921_roc_work(struct work_struct *work)
 	phy = (struct mt792x_phy *)container_of(work, struct mt792x_phy,
 						roc_work);
 
+	/* Check abort flag before acquiring mutex to prevent deadlock.
+	 * Only send expired callback if ROC was actually active.
+	 */
+	if (test_and_clear_bit(MT76_STATE_ROC_ABORT, &phy->mt76->state)) {
+		if (test_and_clear_bit(MT76_STATE_ROC, &phy->mt76->state))
+			ieee80211_remain_on_channel_expired(phy->mt76->hw);
+		return;
+	}
+
 	if (!test_and_clear_bit(MT76_STATE_ROC, &phy->mt76->state))
 		return;
 
@@ -888,7 +904,8 @@ void mt7921_mac_sta_remove(struct mt76_dev *mdev, struct ieee80211_vif *vif,
 	struct mt792x_dev *dev = container_of(mdev, struct mt792x_dev, mt76);
 	struct mt792x_sta *msta = (struct mt792x_sta *)sta->drv_priv;
 
-	mt7921_roc_abort_sync(dev);
+	/* Async abort - caller already holds mutex */
+	mt7921_roc_abort_async(dev);
 	mt76_connac_free_pending_tx_skbs(&dev->pm, &msta->deflink.wcid);
 	mt76_connac_pm_wake(&dev->mphy, &dev->pm);
 
diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/pci.c b/drivers/net/wireless/mediatek/mt76/mt7921/pci.c
index 9f76b334b93d..ec9686183251 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7921/pci.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7921/pci.c
@@ -426,9 +426,7 @@ static int mt7921_pci_suspend(struct device *device)
 	cancel_delayed_work_sync(&pm->ps_work);
 	cancel_work_sync(&pm->wake_work);
 
-	mt792x_mutex_acquire(dev);
 	mt7921_roc_abort_sync(dev);
-	mt792x_mutex_release(dev);
 
 	err = mt792x_mcu_drv_pmctrl(dev);
 	if (err < 0)
diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/sdio.c b/drivers/net/wireless/mediatek/mt76/mt7921/sdio.c
index 92ea2811816f..3421e53dc948 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7921/sdio.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7921/sdio.c
@@ -219,9 +219,7 @@ static int mt7921s_suspend(struct device *__dev)
 	cancel_delayed_work_sync(&pm->ps_work);
 	cancel_work_sync(&pm->wake_work);
 
-	mt792x_mutex_acquire(dev);
 	mt7921_roc_abort_sync(dev);
-	mt792x_mutex_release(dev);
 
 	err = mt792x_mcu_drv_pmctrl(dev);
 	if (err < 0)
-- 
2.52.0

[PATCH 06/13] wifi: mt76: mt7925: add comprehensive NULL pointer protection for MLO
Posted by Zac 2 weeks, 5 days ago
From: Zac Bowling <zac@zacbowling.com>

Add NULL pointer checks for functions that return pointers to link-related
structures throughout the mt7925 driver. During MLO state transitions,
these functions can return NULL when link configuration is not synchronized.

Functions protected:
- mt792x_vif_to_bss_conf(): Returns link BSS configuration
- mt792x_vif_to_link(): Returns driver link state
- mt792x_sta_to_link(): Returns station link state

Files updated:

1. mac.c:
   - mt7925_vif_connect_iter(): Check bss_conf before use
   - mt7925_mac_sta_assoc(): Check bss_conf before use

2. main.c:
   - mt7925_set_key(): Check link_conf and mlink
   - mt7925_mac_link_sta_add(): Check link_conf and mlink
   - mt7925_mac_link_sta_assoc(): Check bss_conf and mlink
   - mt7925_mac_link_sta_remove(): Check bss_conf and mlink
   - mt7925_change_vif_links(): Check conf before use
   - mt7925_assign_vif_chanctx(): Check mconf and mlink
   - mt7925_unassign_vif_chanctx(): Check mconf and mlink
   - mt7925_mgd_prepare_tx(): Check link_conf

3. mcu.c:
   - mt7925_mcu_sta_phy_tlv(): Check link_sta
   - mt7925_mcu_sta_amsdu_tlv(): Check link_sta
   - mt7925_mcu_sta_mld_tlv(): Check link_sta
   - mt7925_mcu_sta_cmd(): Check mlink
   - mt7925_mcu_add_bss_info(): Check link_conf
   - mt7925_mcu_set_chctx(): Check link_conf and mlink

Prevents crashes during:
- BSSID roaming transitions
- MLO setup and teardown
- Hardware reset operations
- Runtime power management

Fixes: c948b5da6bbe ("wifi: mt76: mt7925: add Mediatek Wi-Fi7 driver for mt7925 device")
Signed-off-by: Zac Bowling <zac@zacbowling.com>
---
 .../net/wireless/mediatek/mt76/mt7925/mac.c   |  6 ++
 .../net/wireless/mediatek/mt76/mt7925/main.c  | 82 ++++++++++++++++---
 .../net/wireless/mediatek/mt76/mt7925/mcu.c   | 22 ++++-
 3 files changed, 97 insertions(+), 13 deletions(-)

diff --git a/drivers/net/wireless/mediatek/mt76/mt7925/mac.c b/drivers/net/wireless/mediatek/mt76/mt7925/mac.c
index 871b67101976..184efe8afa10 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7925/mac.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7925/mac.c
@@ -1271,6 +1271,12 @@ mt7925_vif_connect_iter(void *priv, u8 *mac,
 		bss_conf = mt792x_vif_to_bss_conf(vif, i);
 		mconf = mt792x_vif_to_link(mvif, i);
 
+		/* Skip links that don't have bss_conf set up yet in mac80211.
+		 * This can happen during HW reset when link state is inconsistent.
+		 */
+		if (!bss_conf)
+			continue;
+
 		mt76_connac_mcu_uni_add_dev(&dev->mphy, bss_conf, &mconf->mt76,
 					    &mvif->sta.deflink.wcid, true);
 		mt7925_mcu_set_tx(dev, bss_conf);
diff --git a/drivers/net/wireless/mediatek/mt76/mt7925/main.c b/drivers/net/wireless/mediatek/mt76/mt7925/main.c
index 05990455ee7d..74a48742e234 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7925/main.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7925/main.c
@@ -608,6 +608,10 @@ static int mt7925_set_link_key(struct ieee80211_hw *hw, enum set_key_cmd cmd,
 	link_sta = sta ? mt792x_sta_to_link_sta(vif, sta, link_id) : NULL;
 	mconf = mt792x_vif_to_link(mvif, link_id);
 	mlink = mt792x_sta_to_link(msta, link_id);
+
+	if (!link_conf || !mconf || !mlink)
+		return -EINVAL;
+
 	wcid = &mlink->wcid;
 	wcid_keyidx = &wcid->hw_key_idx;
 
@@ -860,12 +864,17 @@ static int mt7925_mac_link_sta_add(struct mt76_dev *mdev,
 
 	msta = (struct mt792x_sta *)link_sta->sta->drv_priv;
 	mlink = mt792x_sta_to_link(msta, link_id);
+	if (!mlink)
+		return -EINVAL;
 
 	idx = mt76_wcid_alloc(dev->mt76.wcid_mask, MT792x_WTBL_STA - 1);
 	if (idx < 0)
 		return -ENOSPC;
 
 	mconf = mt792x_vif_to_link(mvif, link_id);
+	if (!mconf)
+		return -EINVAL;
+
 	mt76_wcid_init(&mlink->wcid, 0);
 	mlink->wcid.sta = 1;
 	mlink->wcid.idx = idx;
@@ -891,6 +900,8 @@ static int mt7925_mac_link_sta_add(struct mt76_dev *mdev,
 			       MT_WTBL_UPDATE_ADM_COUNT_CLEAR);
 
 	link_conf = mt792x_vif_to_bss_conf(vif, link_id);
+	if (!link_conf)
+		return -EINVAL;
 
 	/* should update bss info before STA add */
 	if (vif->type == NL80211_IFTYPE_STATION && !link_sta->sta->tdls) {
@@ -997,18 +1008,29 @@ mt7925_mac_set_links(struct mt76_dev *mdev, struct ieee80211_vif *vif)
 {
 	struct mt792x_dev *dev = container_of(mdev, struct mt792x_dev, mt76);
 	struct mt792x_vif *mvif = (struct mt792x_vif *)vif->drv_priv;
-	struct ieee80211_bss_conf *link_conf =
-		mt792x_vif_to_bss_conf(vif, mvif->deflink_id);
-	struct cfg80211_chan_def *chandef = &link_conf->chanreq.oper;
-	enum nl80211_band band = chandef->chan->band, secondary_band;
+	struct ieee80211_bss_conf *link_conf;
+	struct cfg80211_chan_def *chandef;
+	enum nl80211_band band, secondary_band;
+	u16 sel_links;
+	u8 secondary_link_id;
 
-	u16 sel_links = mt76_select_links(vif, 2);
-	u8 secondary_link_id = __ffs(~BIT(mvif->deflink_id) & sel_links);
+	link_conf = mt792x_vif_to_bss_conf(vif, mvif->deflink_id);
+	if (!link_conf)
+		return;
+
+	chandef = &link_conf->chanreq.oper;
+	band = chandef->chan->band;
+
+	sel_links = mt76_select_links(vif, 2);
+	secondary_link_id = __ffs(~BIT(mvif->deflink_id) & sel_links);
 
 	if (!ieee80211_vif_is_mld(vif) || hweight16(sel_links) < 2)
 		return;
 
 	link_conf = mt792x_vif_to_bss_conf(vif, secondary_link_id);
+	if (!link_conf)
+		return;
+
 	secondary_band = link_conf->chanreq.oper.chan->band;
 
 	if (band == NL80211_BAND_2GHZ ||
@@ -1036,6 +1058,8 @@ static void mt7925_mac_link_sta_assoc(struct mt76_dev *mdev,
 
 	msta = (struct mt792x_sta *)link_sta->sta->drv_priv;
 	mlink = mt792x_sta_to_link(msta, link_sta->link_id);
+	if (!mlink)
+		return;
 
 	mt792x_mutex_acquire(dev);
 
@@ -1045,12 +1069,13 @@ static void mt7925_mac_link_sta_assoc(struct mt76_dev *mdev,
 		link_conf = mt792x_vif_to_bss_conf(vif, vif->bss_conf.link_id);
 	}
 
-	if (vif->type == NL80211_IFTYPE_STATION && !link_sta->sta->tdls) {
+	if (link_conf && vif->type == NL80211_IFTYPE_STATION && !link_sta->sta->tdls) {
 		struct mt792x_bss_conf *mconf;
 
 		mconf = mt792x_link_conf_to_mconf(link_conf);
-		mt7925_mcu_add_bss_info(&dev->phy, mconf->mt76.ctx,
-					link_conf, link_sta, true);
+		if (mconf)
+			mt7925_mcu_add_bss_info(&dev->phy, mconf->mt76.ctx,
+						link_conf, link_sta, true);
 	}
 
 	ewma_avg_signal_init(&mlink->avg_ack_signal);
@@ -1097,6 +1122,8 @@ static void mt7925_mac_link_sta_remove(struct mt76_dev *mdev,
 
 	msta = (struct mt792x_sta *)link_sta->sta->drv_priv;
 	mlink = mt792x_sta_to_link(msta, link_id);
+	if (!mlink)
+		return;
 
 	mt7925_roc_abort_sync(dev);
 
@@ -1110,10 +1137,12 @@ static void mt7925_mac_link_sta_remove(struct mt76_dev *mdev,
 
 	link_conf = mt792x_vif_to_bss_conf(vif, link_id);
 
-	if (vif->type == NL80211_IFTYPE_STATION && !link_sta->sta->tdls) {
+	if (link_conf && vif->type == NL80211_IFTYPE_STATION && !link_sta->sta->tdls) {
 		struct mt792x_bss_conf *mconf;
 
 		mconf = mt792x_link_conf_to_mconf(link_conf);
+		if (!mconf)
+			goto out;
 
 		if (ieee80211_vif_is_mld(vif))
 			mt792x_mac_link_bss_remove(dev, mconf, mlink);
@@ -1121,6 +1150,7 @@ static void mt7925_mac_link_sta_remove(struct mt76_dev *mdev,
 			mt7925_mcu_add_bss_info(&dev->phy, mconf->mt76.ctx, link_conf,
 						link_sta, false);
 	}
+out:
 
 	spin_lock_bh(&mdev->sta_poll_lock);
 	if (!list_empty(&mlink->wcid.poll_list))
@@ -1308,6 +1338,8 @@ mt7925_mlo_pm_iter(void *priv, u8 *mac, struct ieee80211_vif *vif)
 	mt792x_mutex_acquire(dev);
 	for_each_set_bit(i, &valid, IEEE80211_MLD_MAX_NUM_LINKS) {
 		bss_conf = mt792x_vif_to_bss_conf(vif, i);
+		if (!bss_conf)
+			continue;
 		mt7925_mcu_uni_bss_ps(dev, bss_conf);
 	}
 	mt792x_mutex_release(dev);
@@ -1634,6 +1666,8 @@ static void mt7925_ipv6_addr_change(struct ieee80211_hw *hw,
 
 	for_each_set_bit(i, &valid, IEEE80211_MLD_MAX_NUM_LINKS) {
 		bss_conf = mt792x_vif_to_bss_conf(vif, i);
+		if (!bss_conf)
+			continue;
 		__mt7925_ipv6_addr_change(hw, bss_conf, idev);
 	}
 }
@@ -1695,6 +1729,9 @@ mt7925_conf_tx(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 		    [IEEE80211_AC_BK] = 1,
 	};
 
+	if (!mconf)
+		return -EINVAL;
+
 	/* firmware uses access class index */
 	mconf->queue_params[mq_to_aci[queue]] = *params;
 
@@ -1865,6 +1902,8 @@ static void mt7925_vif_cfg_changed(struct ieee80211_hw *hw,
 	if (changed & BSS_CHANGED_ARP_FILTER) {
 		for_each_set_bit(i, &valid, IEEE80211_MLD_MAX_NUM_LINKS) {
 			bss_conf = mt792x_vif_to_bss_conf(vif, i);
+			if (!bss_conf)
+				continue;
 			mt7925_mcu_update_arp_filter(&dev->mt76, bss_conf);
 		}
 	}
@@ -1880,6 +1919,8 @@ static void mt7925_vif_cfg_changed(struct ieee80211_hw *hw,
 			} else if (mvif->mlo_pm_state == MT792x_MLO_CHANGED_PS) {
 				for_each_set_bit(i, &valid, IEEE80211_MLD_MAX_NUM_LINKS) {
 					bss_conf = mt792x_vif_to_bss_conf(vif, i);
+					if (!bss_conf)
+						continue;
 					mt7925_mcu_uni_bss_ps(dev, bss_conf);
 				}
 			}
@@ -1901,7 +1942,12 @@ static void mt7925_link_info_changed(struct ieee80211_hw *hw,
 	struct ieee80211_bss_conf *link_conf;
 
 	mconf = mt792x_vif_to_link(mvif, info->link_id);
+	if (!mconf)
+		return;
+
 	link_conf = mt792x_vif_to_bss_conf(vif, mconf->link_id);
+	if (!link_conf)
+		return;
 
 	mt792x_mutex_acquire(dev);
 
@@ -2025,6 +2071,11 @@ mt7925_change_vif_links(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 		mlink = mlinks[link_id];
 		link_conf = mt792x_vif_to_bss_conf(vif, link_id);
 
+		if (!link_conf) {
+			err = -EINVAL;
+			goto free;
+		}
+
 		rcu_assign_pointer(mvif->link_conf[link_id], mconf);
 		rcu_assign_pointer(mvif->sta.link[link_id], mlink);
 
@@ -2105,9 +2156,14 @@ static int mt7925_assign_vif_chanctx(struct ieee80211_hw *hw,
 
 	if (ieee80211_vif_is_mld(vif)) {
 		mconf = mt792x_vif_to_link(mvif, link_conf->link_id);
+		if (!mconf) {
+			mutex_unlock(&dev->mt76.mutex);
+			return -EINVAL;
+		}
+
 		pri_link_conf = mt792x_vif_to_bss_conf(vif, mvif->deflink_id);
 
-		if (vif->type == NL80211_IFTYPE_STATION &&
+		if (pri_link_conf && vif->type == NL80211_IFTYPE_STATION &&
 		    mconf == &mvif->bss_conf)
 			mt7925_mcu_add_bss_info(&dev->phy, NULL, pri_link_conf,
 						NULL, true);
@@ -2136,6 +2192,10 @@ static void mt7925_unassign_vif_chanctx(struct ieee80211_hw *hw,
 
 	if (ieee80211_vif_is_mld(vif)) {
 		mconf = mt792x_vif_to_link(mvif, link_conf->link_id);
+		if (!mconf) {
+			mutex_unlock(&dev->mt76.mutex);
+			return;
+		}
 
 		if (vif->type == NL80211_IFTYPE_STATION &&
 		    mconf == &mvif->bss_conf)
diff --git a/drivers/net/wireless/mediatek/mt76/mt7925/mcu.c b/drivers/net/wireless/mediatek/mt76/mt7925/mcu.c
index cf0fdea45cf7..94ec62a4538a 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7925/mcu.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7925/mcu.c
@@ -1087,6 +1087,8 @@ mt7925_mcu_sta_hdr_trans_tlv(struct sk_buff *skb,
 		struct mt792x_link_sta *mlink;
 
 		mlink = mt792x_sta_to_link(msta, link_sta->link_id);
+		if (!mlink)
+			return;
 		wcid = &mlink->wcid;
 	} else {
 		wcid = &mvif->sta.deflink.wcid;
@@ -1120,6 +1122,9 @@ int mt7925_mcu_wtbl_update_hdr_trans(struct mt792x_dev *dev,
 	link_sta = mt792x_sta_to_link_sta(vif, sta, link_id);
 	mconf = mt792x_vif_to_link(mvif, link_id);
 
+	if (!mlink || !mconf)
+		return -EINVAL;
+
 	skb = __mt76_connac_mcu_alloc_sta_req(&dev->mt76, &mconf->mt76,
 					      &mlink->wcid,
 					      MT7925_STA_UPDATE_MAX_SIZE);
@@ -1741,6 +1746,8 @@ mt7925_mcu_sta_amsdu_tlv(struct sk_buff *skb,
 	amsdu->amsdu_en = true;
 
 	mlink = mt792x_sta_to_link(msta, link_sta->link_id);
+	if (!mlink)
+		return;
 	mlink->wcid.amsdu = true;
 
 	switch (link_sta->agg.max_amsdu_len) {
@@ -1773,6 +1780,10 @@ mt7925_mcu_sta_phy_tlv(struct sk_buff *skb,
 
 	link_conf = mt792x_vif_to_bss_conf(vif, link_sta->link_id);
 	mconf = mt792x_vif_to_link(mvif, link_sta->link_id);
+
+	if (!link_conf || !mconf)
+		return;
+
 	chandef = mconf->mt76.ctx ? &mconf->mt76.ctx->def :
 				    &link_conf->chanreq.oper;
 
@@ -1851,6 +1862,10 @@ mt7925_mcu_sta_rate_ctrl_tlv(struct sk_buff *skb,
 
 	link_conf = mt792x_vif_to_bss_conf(vif, link_sta->link_id);
 	mconf = mt792x_vif_to_link(mvif, link_sta->link_id);
+
+	if (!link_conf || !mconf)
+		return;
+
 	chandef = mconf->mt76.ctx ? &mconf->mt76.ctx->def :
 				    &link_conf->chanreq.oper;
 	band = chandef->chan->band;
@@ -1935,6 +1950,9 @@ mt7925_mcu_sta_mld_tlv(struct sk_buff *skb,
 
 		mconf = mt792x_vif_to_link(mvif, i);
 		mlink = mt792x_sta_to_link(msta, i);
+		if (!mconf || !mlink)
+			continue;
+
 		mld->link[cnt].wlan_id = cpu_to_le16(mlink->wcid.idx);
 		mld->link[cnt++].bss_idx = mconf->mt76.idx;
 
@@ -2027,13 +2045,13 @@ int mt7925_mcu_sta_update(struct mt792x_dev *dev,
 		.rcpi = to_rcpi(rssi),
 	};
 	struct mt792x_sta *msta;
-	struct mt792x_link_sta *mlink;
+	struct mt792x_link_sta *mlink = NULL;
 
 	if (link_sta) {
 		msta = (struct mt792x_sta *)link_sta->sta->drv_priv;
 		mlink = mt792x_sta_to_link(msta, link_sta->link_id);
 	}
-	info.wcid = link_sta ? &mlink->wcid : &mvif->sta.deflink.wcid;
+	info.wcid = (link_sta && mlink) ? &mlink->wcid : &mvif->sta.deflink.wcid;
 	info.newly = state != MT76_STA_INFO_STATE_ASSOC;
 
 	return mt7925_mcu_sta_cmd(&dev->mphy, &info);
-- 
2.52.0
[PATCH 08/13] wifi: mt76: mt7925: add MCU command error handling
Posted by Zac 2 weeks, 5 days ago
From: Zac Bowling <zac@zacbowling.com>

Add proper error handling for MCU command return values that were
previously being ignored. Without proper error handling, failures in
MCU communication can leave the driver in an inconsistent state.

Functions updated:

1. main.c: mt7925_ampdu_action() - BA session setup
   - Check mt7925_mcu_uni_tx_ba() return value
   - Check mt7925_mcu_uni_rx_ba() return value
   - Return error to mac80211 on failure

2. main.c: mt7925_mac_link_sta_add() - Station addition
   - Check mt7925_mcu_add_bss_info() return value
   - Propagate errors during station setup

3. main.c: mt7925_set_key() - Key installation
   - Check mt7925_mcu_add_bss_info() return value when setting
     BSS info before key installation
   - Prevent key setup on communication failure

These changes ensure that MCU communication failures are properly
detected and reported to mac80211, allowing proper error recovery
instead of leaving the driver in an undefined state.

Fixes: c948b5da6bbe ("wifi: mt76: mt7925: add Mediatek Wi-Fi7 driver for mt7925 device")
Signed-off-by: Zac Bowling <zac@zacbowling.com>
---
 .../net/wireless/mediatek/mt76/mt7925/main.c  | 30 +++++++++++--------
 1 file changed, 18 insertions(+), 12 deletions(-)

diff --git a/drivers/net/wireless/mediatek/mt76/mt7925/main.c b/drivers/net/wireless/mediatek/mt76/mt7925/main.c
index f1884944f77d..59a5b22a6ed6 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7925/main.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7925/main.c
@@ -641,8 +641,10 @@ static int mt7925_set_link_key(struct ieee80211_hw *hw, enum set_key_cmd cmd,
 		struct mt792x_phy *phy = mt792x_hw_phy(hw);
 
 		mconf->mt76.cipher = mt7925_mcu_get_cipher(key->cipher);
-		mt7925_mcu_add_bss_info(phy, mconf->mt76.ctx, link_conf,
-					link_sta, true);
+		err = mt7925_mcu_add_bss_info(phy, mconf->mt76.ctx, link_conf,
+					      link_sta, true);
+		if (err)
+			goto out;
 	}
 
 	if (cmd == SET_KEY)
@@ -908,11 +910,14 @@ static int mt7925_mac_link_sta_add(struct mt76_dev *mdev,
 	/* should update bss info before STA add */
 	if (vif->type == NL80211_IFTYPE_STATION && !link_sta->sta->tdls) {
 		if (ieee80211_vif_is_mld(vif))
-			mt7925_mcu_add_bss_info(&dev->phy, mconf->mt76.ctx,
-						link_conf, link_sta, link_sta != mlink->pri_link);
+			ret = mt7925_mcu_add_bss_info(&dev->phy, mconf->mt76.ctx,
+						      link_conf, link_sta,
+						      link_sta != mlink->pri_link);
 		else
-			mt7925_mcu_add_bss_info(&dev->phy, mconf->mt76.ctx,
-						link_conf, link_sta, false);
+			ret = mt7925_mcu_add_bss_info(&dev->phy, mconf->mt76.ctx,
+						      link_conf, link_sta, false);
+		if (ret)
+			return ret;
 	}
 
 	if (ieee80211_vif_is_mld(vif) &&
@@ -1291,22 +1296,22 @@ mt7925_ampdu_action(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 	case IEEE80211_AMPDU_RX_START:
 		mt76_rx_aggr_start(&dev->mt76, &msta->deflink.wcid, tid, ssn,
 				   params->buf_size);
-		mt7925_mcu_uni_rx_ba(dev, params, true);
+		ret = mt7925_mcu_uni_rx_ba(dev, params, true);
 		break;
 	case IEEE80211_AMPDU_RX_STOP:
 		mt76_rx_aggr_stop(&dev->mt76, &msta->deflink.wcid, tid);
-		mt7925_mcu_uni_rx_ba(dev, params, false);
+		ret = mt7925_mcu_uni_rx_ba(dev, params, false);
 		break;
 	case IEEE80211_AMPDU_TX_OPERATIONAL:
 		mtxq->aggr = true;
 		mtxq->send_bar = false;
-		mt7925_mcu_uni_tx_ba(dev, params, true);
+		ret = mt7925_mcu_uni_tx_ba(dev, params, true);
 		break;
 	case IEEE80211_AMPDU_TX_STOP_FLUSH:
 	case IEEE80211_AMPDU_TX_STOP_FLUSH_CONT:
 		mtxq->aggr = false;
 		clear_bit(tid, &msta->deflink.wcid.ampdu_state);
-		mt7925_mcu_uni_tx_ba(dev, params, false);
+		ret = mt7925_mcu_uni_tx_ba(dev, params, false);
 		break;
 	case IEEE80211_AMPDU_TX_START:
 		set_bit(tid, &msta->deflink.wcid.ampdu_state);
@@ -1315,8 +1320,9 @@ mt7925_ampdu_action(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 	case IEEE80211_AMPDU_TX_STOP_CONT:
 		mtxq->aggr = false;
 		clear_bit(tid, &msta->deflink.wcid.ampdu_state);
-		mt7925_mcu_uni_tx_ba(dev, params, false);
-		ieee80211_stop_tx_ba_cb_irqsafe(vif, sta->addr, tid);
+		ret = mt7925_mcu_uni_tx_ba(dev, params, false);
+		if (!ret)
+			ieee80211_stop_tx_ba_cb_irqsafe(vif, sta->addr, tid);
 		break;
 	}
 	mt792x_mutex_release(dev);
-- 
2.52.0
[PATCH 09/13] wifi: mt76: mt7925: add lockdep assertions for mutex verification
Posted by Zac 2 weeks, 5 days ago
From: Zac Bowling <zac@zacbowling.com>

Add lockdep_assert_held() calls to critical MCU functions to help catch
mutex violations during development and debugging. This follows the
pattern used in other mt76 drivers (mt7996, mt7915, mt7615).

Functions with new assertions:
- mt7925_mcu_add_bss_info(): Core BSS configuration MCU command
- mt7925_mcu_sta_update(): Station record update MCU command
- mt7925_mcu_uni_bss_ps(): Power save state MCU command

These functions modify firmware state and must be called with the
device mutex held to prevent race conditions. The lockdep assertions
will trigger warnings at runtime if code paths exist that call these
functions without proper mutex protection.

This aids in detecting the class of bugs fixed by patches in this series.

Signed-off-by: Zac Bowling <zac@zacbowling.com>
---
 drivers/net/wireless/mediatek/mt76/mt7925/mcu.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/net/wireless/mediatek/mt76/mt7925/mcu.c b/drivers/net/wireless/mediatek/mt76/mt7925/mcu.c
index 94ec62a4538a..1c58b0be2be4 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7925/mcu.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7925/mcu.c
@@ -1532,6 +1532,8 @@ int mt7925_mcu_uni_bss_ps(struct mt792x_dev *dev,
 		},
 	};
 
+	lockdep_assert_held(&dev->mt76.mutex);
+
 	if (link_conf->vif->type != NL80211_IFTYPE_STATION)
 		return -EOPNOTSUPP;
 
@@ -2047,6 +2049,8 @@ int mt7925_mcu_sta_update(struct mt792x_dev *dev,
 	struct mt792x_sta *msta;
 	struct mt792x_link_sta *mlink = NULL;
 
+	lockdep_assert_held(&dev->mt76.mutex);
+
 	if (link_sta) {
 		msta = (struct mt792x_sta *)link_sta->sta->drv_priv;
 		mlink = mt792x_sta_to_link(msta, link_sta->link_id);
@@ -2853,6 +2857,8 @@ int mt7925_mcu_add_bss_info(struct mt792x_phy *phy,
 	struct mt792x_link_sta *mlink_bc;
 	struct sk_buff *skb;
 
+	lockdep_assert_held(&dev->mt76.mutex);
+
 	skb = __mt7925_mcu_alloc_bss_req(&dev->mt76, &mconf->mt76,
 					 MT7925_BSS_UPDATE_MAX_SIZE);
 	if (IS_ERR(skb))
-- 
2.52.0
[PATCH 10/13] wifi: mt76: mt7925: fix MLO roaming and ROC setup issues
Posted by Zac 2 weeks, 5 days ago
From: Zac Bowling <zac@zacbowling.com>

Fix two issues related to MLO roaming and remain-on-channel operations:

1. Key removal failure during MLO roaming:

During MLO roaming, key removal can fail because the WCID (wireless client
ID) is already cleaned up before the key removal operation completes.

When roaming between APs in an MLO setup:
- mac80211 triggers sta_state changes
- mt7925_mac_link_sta_remove() is called for the old link
- WCID is cleared via mt76_wcid_cleanup()
- Later, key removal MCU command uses the now-invalid WCID

Fix by checking if the WCID is still valid before sending key removal
commands to firmware. If the WCID has already been cleaned up, skip
the MCU command since the firmware has already removed the keys.

2. Kernel warning in MLO ROC setup:

When starting a remain-on-channel operation in MLO mode, the driver
passes incorrect parameters to mt7925_mcu_set_roc(), causing a kernel
warning about invalid chanctx usage.

Fix by checking for valid chanctx and link configuration before
setting up ROC, and use the correct link_id from the vif when
available.

Fixes: c948b5da6bbe ("wifi: mt76: mt7925: add Mediatek Wi-Fi7 driver for mt7925 device")
Signed-off-by: Zac Bowling <zac@zacbowling.com>
---
 .../net/wireless/mediatek/mt76/mt7925/main.c  |  9 ++++++++-
 .../net/wireless/mediatek/mt76/mt7925/mcu.c   | 20 +++++++++++++------
 2 files changed, 22 insertions(+), 7 deletions(-)

diff --git a/drivers/net/wireless/mediatek/mt76/mt7925/main.c b/drivers/net/wireless/mediatek/mt76/mt7925/main.c
index 59a5b22a6ed6..7d68b08f445a 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7925/main.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7925/main.c
@@ -609,8 +609,15 @@ static int mt7925_set_link_key(struct ieee80211_hw *hw, enum set_key_cmd cmd,
 	mconf = mt792x_vif_to_link(mvif, link_id);
 	mlink = mt792x_sta_to_link(msta, link_id);
 
-	if (!link_conf || !mconf || !mlink)
+	if (!link_conf || !mconf || !mlink) {
+		/* During MLO roaming, link state may be torn down before
+		 * mac80211 requests key removal. If removing a key and
+		 * the link is already gone, consider it successfully removed.
+		 */
+		if (cmd != SET_KEY)
+			return 0;
 		return -EINVAL;
+	}
 
 	wcid = &mlink->wcid;
 	wcid_keyidx = &wcid->hw_key_idx;
diff --git a/drivers/net/wireless/mediatek/mt76/mt7925/mcu.c b/drivers/net/wireless/mediatek/mt76/mt7925/mcu.c
index 1c58b0be2be4..6f7fc1b9a440 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7925/mcu.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7925/mcu.c
@@ -1342,15 +1342,23 @@ int mt7925_mcu_set_mlo_roc(struct mt792x_bss_conf *mconf, u16 sel_links,
 	for (i = 0; i < ARRAY_SIZE(links); i++) {
 		links[i].id = i ? __ffs(~BIT(mconf->link_id) & sel_links) :
 				 mconf->link_id;
+
 		link_conf = mt792x_vif_to_bss_conf(vif, links[i].id);
-		if (WARN_ON_ONCE(!link_conf))
-			return -EPERM;
+		if (!link_conf)
+			return -ENOLINK;
 
 		links[i].chan = link_conf->chanreq.oper.chan;
-		if (WARN_ON_ONCE(!links[i].chan))
-			return -EPERM;
+		if (!links[i].chan)
+			/* Channel not configured yet - this can happen during
+			 * MLO AP setup when links are being added sequentially.
+			 * Return -ENOLINK to indicate link not ready.
+			 */
+			return -ENOLINK;
 
 		links[i].mconf = mt792x_vif_to_link(mvif, links[i].id);
+		if (!links[i].mconf)
+			return -ENOLINK;
+
 		links[i].tag = links[i].id == mconf->link_id ?
 			       UNI_ROC_ACQUIRE : UNI_ROC_SUB_LINK;
 
@@ -1364,8 +1372,8 @@ int mt7925_mcu_set_mlo_roc(struct mt792x_bss_conf *mconf, u16 sel_links,
 		type = MT7925_ROC_REQ_JOIN;
 
 	for (i = 0; i < ARRAY_SIZE(links) && i < hweight16(vif->active_links); i++) {
-		if (WARN_ON_ONCE(!links[i].mconf || !links[i].chan))
-			continue;
+		if (!links[i].mconf || !links[i].chan)
+			return -ENOLINK;
 
 		chan = links[i].chan;
 		center_ch = ieee80211_frequency_to_channel(chan->center_freq);
-- 
2.52.0
[PATCH 11/13] wifi: mt76: mt7925: fix BA session teardown during beacon loss
Posted by Zac 2 weeks, 5 days ago
From: Zac Bowling <zac@zacbowling.com>

The ieee80211_stop_tx_ba_cb_irqsafe() callback was conditionally called
only when the MCU command succeeded. However, during beacon connection
loss, the MCU command may fail because the AP is no longer reachable.

If the callback is not called, mac80211's BA session state machine gets
stuck in an intermediate state. When mac80211 later tries to tear down
all BA sessions during disconnection, it hits a WARN in
__ieee80211_stop_tx_ba_session() due to the inconsistent state.

Fix by making the callback unconditional, matching the behavior of
mt7921 and mt7996 drivers. The MCU command failure is acceptable during
disconnection - what matters is that mac80211 is notified to complete
the session teardown.

Reported-by: Sean Wang <sean.wang@mediatek.com>
Signed-off-by: Zac Bowling <zac@zacbowling.com>
---
 drivers/net/wireless/mediatek/mt76/mt7925/main.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/drivers/net/wireless/mediatek/mt76/mt7925/main.c b/drivers/net/wireless/mediatek/mt76/mt7925/main.c
index 7d68b08f445a..82c81c22e39c 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7925/main.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7925/main.c
@@ -1327,9 +1327,13 @@ mt7925_ampdu_action(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 	case IEEE80211_AMPDU_TX_STOP_CONT:
 		mtxq->aggr = false;
 		clear_bit(tid, &msta->deflink.wcid.ampdu_state);
-		ret = mt7925_mcu_uni_tx_ba(dev, params, false);
-		if (!ret)
-			ieee80211_stop_tx_ba_cb_irqsafe(vif, sta->addr, tid);
+		/* MCU command may fail during beacon loss, but callback must
+		 * always be called to complete the BA session teardown in
+		 * mac80211. Otherwise the state machine gets stuck and triggers
+		 * WARN in __ieee80211_stop_tx_ba_session().
+		 */
+		mt7925_mcu_uni_tx_ba(dev, params, false);
+		ieee80211_stop_tx_ba_cb_irqsafe(vif, sta->addr, tid);
 		break;
 	}
 	mt792x_mutex_release(dev);
-- 
2.52.0
[PATCH 12/13] wifi: mt76: mt7925: fix ROC deadlocks and race conditions
Posted by Zac 2 weeks, 5 days ago
From: Zac Bowling <zac@zacbowling.com>

Fix multiple interrelated issues in the remain-on-channel (ROC) handling
that cause deadlocks, race conditions, and resource leaks.

Problems fixed:

1. Deadlock in sta removal ROC abort path:
   When a station is removed while a ROC operation is in progress, the
   driver would call mt7925_roc_abort_sync() which waits for ROC completion.
   However, the ROC work itself needs to acquire mt792x_mutex which is
   already held during station removal, causing a deadlock.

   Fix: Use async ROC abort (mt76_connac_mcu_abort_roc) when called from
   paths that already hold the mutex, and add MT76_STATE_ROC_ABORT flag
   to coordinate between the abort and the ROC timer.

2. ROC timer race during suspend:
   The ROC timer could fire after the device started suspending but before
   the ROC was properly aborted, causing undefined behavior.

   Fix: Delete ROC timer synchronously before suspend and check device
   state before processing ROC timeout.

3. ROC rate limiting for MLO auth failures:
   Rapid ROC requests during MLO authentication can overwhelm the firmware,
   causing authentication timeouts. The MT7925 firmware has limited ROC
   handling capacity.

   Fix: Add rate limiting infrastructure with configurable minimum interval
   between ROC requests. Track last ROC completion time and defer new
   requests if they arrive too quickly.

4. WCID leak in ROC cleanup:
   When ROC operations are aborted, the associated WCID resources were
   not being properly released, causing resource exhaustion over time.

   Fix: Ensure WCID cleanup happens in all ROC termination paths.

5. Async ROC abort race condition:
   The async ROC abort could race with normal ROC completion, causing
   double-free or use-after-free of ROC resources.

   Fix: Use MT76_STATE_ROC_ABORT flag and proper synchronization to
   prevent races between async abort and normal completion paths.

These fixes work together to provide robust ROC handling that doesn't
deadlock, properly releases resources, and handles edge cases during
suspend and MLO operations.

Fixes: c948b5da6bbe ("wifi: mt76: mt7925: add Mediatek Wi-Fi7 driver for mt7925 device")
Signed-off-by: Zac Bowling <zac@zacbowling.com>
---
 drivers/net/wireless/mediatek/mt76/mt76.h     |   1 +
 .../net/wireless/mediatek/mt76/mt7925/main.c  | 175 ++++++++++++++++--
 drivers/net/wireless/mediatek/mt76/mt792x.h   |   7 +
 3 files changed, 171 insertions(+), 12 deletions(-)

diff --git a/drivers/net/wireless/mediatek/mt76/mt76.h b/drivers/net/wireless/mediatek/mt76/mt76.h
index d05e83ea1cac..91f9dd95c89e 100644
--- a/drivers/net/wireless/mediatek/mt76/mt76.h
+++ b/drivers/net/wireless/mediatek/mt76/mt76.h
@@ -511,6 +511,7 @@ enum {
 	MT76_STATE_POWER_OFF,
 	MT76_STATE_SUSPEND,
 	MT76_STATE_ROC,
+	MT76_STATE_ROC_ABORT,
 	MT76_STATE_PM,
 	MT76_STATE_WED_RESET,
 };
diff --git a/drivers/net/wireless/mediatek/mt76/mt7925/main.c b/drivers/net/wireless/mediatek/mt76/mt7925/main.c
index 82c81c22e39c..4b7c13485497 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7925/main.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7925/main.c
@@ -453,6 +453,24 @@ static void mt7925_roc_iter(void *priv, u8 *mac,
 	mt7925_mcu_abort_roc(phy, &mvif->bss_conf, phy->roc_token_id);
 }
 
+/* Async ROC abort - safe to call while holding mutex.
+ * Sets abort flag and lets roc_work handle cleanup without blocking.
+ * This prevents deadlock when called from sta_remove path which holds mutex.
+ */
+static void mt7925_roc_abort_async(struct mt792x_dev *dev)
+{
+	struct mt792x_phy *phy = &dev->phy;
+
+	/* Set abort flag - roc_work checks this before acquiring mutex */
+	set_bit(MT76_STATE_ROC_ABORT, &phy->mt76->state);
+
+	/* Stop timer and schedule work to handle cleanup.
+	 * Must schedule work since timer may not have fired yet.
+	 */
+	timer_delete(&phy->roc_timer);
+	ieee80211_queue_work(phy->mt76->hw, &phy->roc_work);
+}
+
 void mt7925_roc_abort_sync(struct mt792x_dev *dev)
 {
 	struct mt792x_phy *phy = &dev->phy;
@@ -477,6 +495,17 @@ void mt7925_roc_work(struct work_struct *work)
 	phy = (struct mt792x_phy *)container_of(work, struct mt792x_phy,
 						roc_work);
 
+	/* Check abort flag BEFORE acquiring mutex to prevent deadlock.
+	 * If abort is requested while we're in the sta_remove path (which
+	 * holds the mutex), we must not try to acquire it or we'll deadlock.
+	 * Clear the flags and only notify mac80211 if ROC was actually active.
+	 */
+	if (test_and_clear_bit(MT76_STATE_ROC_ABORT, &phy->mt76->state)) {
+		if (test_and_clear_bit(MT76_STATE_ROC, &phy->mt76->state))
+			ieee80211_remain_on_channel_expired(phy->mt76->hw);
+		return;
+	}
+
 	if (!test_and_clear_bit(MT76_STATE_ROC, &phy->mt76->state))
 		return;
 
@@ -504,14 +533,93 @@ static int mt7925_abort_roc(struct mt792x_phy *phy,
 	return err;
 }
 
+/* ROC rate limiting constants - exponential backoff to prevent MCU overload
+ * when upper layers trigger rapid reconnection cycles (e.g., MLO auth failures).
+ * Max backoff ~1.6s, resets after 10s of no timeouts.
+ */
+#define MT7925_ROC_BACKOFF_BASE_MS	100
+#define MT7925_ROC_BACKOFF_MAX_MS	1600
+#define MT7925_ROC_TIMEOUT_RESET_MS	10000
+#define MT7925_ROC_TIMEOUT_WARN_THRESH	5
+
+/* Check if ROC should be throttled due to recent timeouts.
+ * Returns delay in jiffies if throttling, 0 if OK to proceed.
+ */
+static unsigned long mt7925_roc_throttle_check(struct mt792x_phy *phy)
+{
+	unsigned long now = jiffies;
+
+	/* Reset timeout counter if it's been a while since last timeout */
+	if (phy->roc_timeout_count &&
+	    time_after(now, phy->roc_last_timeout +
+		       msecs_to_jiffies(MT7925_ROC_TIMEOUT_RESET_MS))) {
+		phy->roc_timeout_count = 0;
+		phy->roc_backoff_until = 0;
+	}
+
+	/* Check if we're still in backoff period */
+	if (phy->roc_backoff_until && time_before(now, phy->roc_backoff_until))
+		return phy->roc_backoff_until - now;
+
+	return 0;
+}
+
+/* Record ROC timeout and calculate backoff period */
+static void mt7925_roc_record_timeout(struct mt792x_phy *phy)
+{
+	unsigned int backoff_ms;
+
+	phy->roc_last_timeout = jiffies;
+	phy->roc_timeout_count++;
+
+	/* Exponential backoff: 100ms, 200ms, 400ms, 800ms, 1600ms (capped) */
+	backoff_ms = MT7925_ROC_BACKOFF_BASE_MS <<
+		     min_t(u8, phy->roc_timeout_count - 1, 4);
+	if (backoff_ms > MT7925_ROC_BACKOFF_MAX_MS)
+		backoff_ms = MT7925_ROC_BACKOFF_MAX_MS;
+
+	phy->roc_backoff_until = jiffies + msecs_to_jiffies(backoff_ms);
+
+	/* Warn if we're seeing repeated timeouts - likely upper layer issue */
+	if (phy->roc_timeout_count == MT7925_ROC_TIMEOUT_WARN_THRESH)
+		dev_warn(phy->dev->mt76.dev,
+			 "mt7925: %u consecutive ROC timeouts, possible mac80211/wpa_supplicant issue (MLO key race?)\n",
+			 phy->roc_timeout_count);
+}
+
+/* Clear timeout tracking on successful ROC */
+static void mt7925_roc_clear_timeout(struct mt792x_phy *phy)
+{
+	phy->roc_timeout_count = 0;
+	phy->roc_backoff_until = 0;
+}
+
 static int mt7925_set_roc(struct mt792x_phy *phy,
 			  struct mt792x_bss_conf *mconf,
 			  struct ieee80211_channel *chan,
 			  int duration,
 			  enum mt7925_roc_req type)
 {
+	unsigned long throttle;
 	int err;
 
+	/* Check rate limiting - if in backoff period, wait or return busy */
+	throttle = mt7925_roc_throttle_check(phy);
+	if (throttle) {
+		/* For short backoffs, wait; for longer ones, return busy */
+		if (throttle < msecs_to_jiffies(200)) {
+			msleep(jiffies_to_msecs(throttle));
+		} else {
+			dev_dbg(phy->dev->mt76.dev,
+				"mt7925: ROC throttled, %u ms remaining\n",
+				jiffies_to_msecs(throttle));
+			return -EBUSY;
+		}
+	}
+
+	/* Clear stale abort flag from previous ROC */
+	clear_bit(MT76_STATE_ROC_ABORT, &phy->mt76->state);
+
 	if (test_and_set_bit(MT76_STATE_ROC, &phy->mt76->state))
 		return -EBUSY;
 
@@ -527,7 +635,11 @@ static int mt7925_set_roc(struct mt792x_phy *phy,
 	if (!wait_event_timeout(phy->roc_wait, phy->roc_grant, 4 * HZ)) {
 		mt7925_mcu_abort_roc(phy, mconf, phy->roc_token_id);
 		clear_bit(MT76_STATE_ROC, &phy->mt76->state);
+		mt7925_roc_record_timeout(phy);
 		err = -ETIMEDOUT;
+	} else {
+		/* Successful ROC - reset timeout tracking */
+		mt7925_roc_clear_timeout(phy);
 	}
 
 out:
@@ -538,8 +650,27 @@ static int mt7925_set_mlo_roc(struct mt792x_phy *phy,
 			      struct mt792x_bss_conf *mconf,
 			      u16 sel_links)
 {
+	unsigned long throttle;
 	int err;
 
+	/* Check rate limiting - MLO ROC is especially prone to rapid-fire
+	 * during reconnection cycles after MLO authentication failures.
+	 */
+	throttle = mt7925_roc_throttle_check(phy);
+	if (throttle) {
+		if (throttle < msecs_to_jiffies(200)) {
+			msleep(jiffies_to_msecs(throttle));
+		} else {
+			dev_dbg(phy->dev->mt76.dev,
+				"mt7925: MLO ROC throttled, %u ms remaining\n",
+				jiffies_to_msecs(throttle));
+			return -EBUSY;
+		}
+	}
+
+	/* Clear stale abort flag from previous ROC */
+	clear_bit(MT76_STATE_ROC_ABORT, &phy->mt76->state);
+
 	if (WARN_ON_ONCE(test_and_set_bit(MT76_STATE_ROC, &phy->mt76->state)))
 		return -EBUSY;
 
@@ -554,7 +685,10 @@ static int mt7925_set_mlo_roc(struct mt792x_phy *phy,
 	if (!wait_event_timeout(phy->roc_wait, phy->roc_grant, 4 * HZ)) {
 		mt7925_mcu_abort_roc(phy, mconf, phy->roc_token_id);
 		clear_bit(MT76_STATE_ROC, &phy->mt76->state);
+		mt7925_roc_record_timeout(phy);
 		err = -ETIMEDOUT;
+	} else {
+		mt7925_roc_clear_timeout(phy);
 	}
 
 out:
@@ -571,6 +705,7 @@ static int mt7925_remain_on_channel(struct ieee80211_hw *hw,
 	struct mt792x_phy *phy = mt792x_hw_phy(hw);
 	int err;
 
+	cancel_work_sync(&phy->roc_work);
 	mt792x_mutex_acquire(phy->dev);
 	err = mt7925_set_roc(phy, &mvif->bss_conf,
 			     chan, duration, MT7925_ROC_REQ_ROC);
@@ -878,14 +1013,14 @@ static int mt7925_mac_link_sta_add(struct mt76_dev *mdev,
 	if (!mlink)
 		return -EINVAL;
 
-	idx = mt76_wcid_alloc(dev->mt76.wcid_mask, MT792x_WTBL_STA - 1);
-	if (idx < 0)
-		return -ENOSPC;
-
 	mconf = mt792x_vif_to_link(mvif, link_id);
 	if (!mconf)
 		return -EINVAL;
 
+	idx = mt76_wcid_alloc(dev->mt76.wcid_mask, MT792x_WTBL_STA - 1);
+	if (idx < 0)
+		return -ENOSPC;
+
 	mt76_wcid_init(&mlink->wcid, 0);
 	mlink->wcid.sta = 1;
 	mlink->wcid.idx = idx;
@@ -905,14 +1040,16 @@ static int mt7925_mac_link_sta_add(struct mt76_dev *mdev,
 
 	ret = mt76_connac_pm_wake(&dev->mphy, &dev->pm);
 	if (ret)
-		return ret;
+		goto err_wcid;
 
 	mt7925_mac_wtbl_update(dev, idx,
 			       MT_WTBL_UPDATE_ADM_COUNT_CLEAR);
 
 	link_conf = mt792x_vif_to_bss_conf(vif, link_id);
-	if (!link_conf)
-		return -EINVAL;
+	if (!link_conf) {
+		ret = -EINVAL;
+		goto err_wcid;
+	}
 
 	/* should update bss info before STA add */
 	if (vif->type == NL80211_IFTYPE_STATION && !link_sta->sta->tdls) {
@@ -924,7 +1061,7 @@ static int mt7925_mac_link_sta_add(struct mt76_dev *mdev,
 			ret = mt7925_mcu_add_bss_info(&dev->phy, mconf->mt76.ctx,
 						      link_conf, link_sta, false);
 		if (ret)
-			return ret;
+			goto err_wcid;
 	}
 
 	if (ieee80211_vif_is_mld(vif) &&
@@ -932,28 +1069,34 @@ static int mt7925_mac_link_sta_add(struct mt76_dev *mdev,
 		ret = mt7925_mcu_sta_update(dev, link_sta, vif, true,
 					    MT76_STA_INFO_STATE_NONE);
 		if (ret)
-			return ret;
+			goto err_wcid;
 	} else if (ieee80211_vif_is_mld(vif) &&
 		   link_sta != mlink->pri_link) {
 		ret = mt7925_mcu_sta_update(dev, mlink->pri_link, vif,
 					    true, MT76_STA_INFO_STATE_ASSOC);
 		if (ret)
-			return ret;
+			goto err_wcid;
 
 		ret = mt7925_mcu_sta_update(dev, link_sta, vif, true,
 					    MT76_STA_INFO_STATE_ASSOC);
 		if (ret)
-			return ret;
+			goto err_wcid;
 	} else {
 		ret = mt7925_mcu_sta_update(dev, link_sta, vif, true,
 					    MT76_STA_INFO_STATE_NONE);
 		if (ret)
-			return ret;
+			goto err_wcid;
 	}
 
 	mt76_connac_power_save_sched(&dev->mphy, &dev->pm);
 
 	return 0;
+
+err_wcid:
+	rcu_assign_pointer(dev->mt76.wcid[idx], NULL);
+	mt76_wcid_mask_clear(dev->mt76.wcid_mask, idx);
+	mt76_connac_power_save_sched(&dev->mphy, &dev->pm);
+	return ret;
 }
 
 static int
@@ -1139,6 +1282,9 @@ static void mt7925_mac_link_sta_remove(struct mt76_dev *mdev,
 	if (!mlink)
 		return;
 
+	/* With Sean's fix, roc_abort_sync uses cancel_work() instead of
+	 * cancel_work_sync(), so it's safe to call even with mutex held.
+	 */
 	mt7925_roc_abort_sync(dev);
 
 	mt76_connac_free_pending_tx_skbs(&dev->pm, &mlink->wcid);
@@ -1534,6 +1680,8 @@ static int mt7925_suspend(struct ieee80211_hw *hw,
 	cancel_delayed_work_sync(&dev->pm.ps_work);
 	mt76_connac_free_pending_tx_skbs(&dev->pm, NULL);
 
+	/* Cancel ROC before quiescing starts */
+	mt7925_roc_abort_sync(dev);
 	mt792x_mutex_acquire(dev);
 
 	clear_bit(MT76_STATE_RUNNING, &phy->mt76->state);
@@ -1880,6 +2028,8 @@ static void mt7925_mgd_prepare_tx(struct ieee80211_hw *hw,
 	u16 duration = info->duration ? info->duration :
 		       jiffies_to_msecs(HZ);
 
+	cancel_work_sync(&mvif->phy->roc_work);
+
 	mt792x_mutex_acquire(dev);
 	mt7925_set_roc(mvif->phy, &mvif->bss_conf,
 		       mvif->bss_conf.mt76.ctx->def.chan, duration,
@@ -2037,6 +2187,7 @@ mt7925_change_vif_links(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 	if (old_links == new_links)
 		return 0;
 
+	cancel_work_sync(&phy->roc_work);
 	mt792x_mutex_acquire(dev);
 
 	for_each_set_bit(link_id, &rem, IEEE80211_MLD_MAX_NUM_LINKS) {
diff --git a/drivers/net/wireless/mediatek/mt76/mt792x.h b/drivers/net/wireless/mediatek/mt76/mt792x.h
index 8388638ed550..d9c1ea709390 100644
--- a/drivers/net/wireless/mediatek/mt76/mt792x.h
+++ b/drivers/net/wireless/mediatek/mt76/mt792x.h
@@ -186,6 +186,13 @@ struct mt792x_phy {
 	wait_queue_head_t roc_wait;
 	u8 roc_token_id;
 	bool roc_grant;
+
+	/* ROC rate limiting to prevent MCU overload during rapid reconnection
+	 * cycles (e.g., MLO authentication failures causing repeated ROC).
+	 */
+	u8 roc_timeout_count;		/* consecutive ROC timeouts */
+	unsigned long roc_last_timeout;	/* jiffies of last timeout */
+	unsigned long roc_backoff_until;/* don't issue ROC until this time */
 };
 
 struct mt792x_irq_map {
-- 
2.52.0
Re: [PATCH 12/13] wifi: mt76: mt7925: fix ROC deadlocks and race conditions
Posted by Felix Fietkau 1 week, 6 days ago
On 20.01.26 21:10, Zac wrote:
> From: Zac Bowling <zac@zacbowling.com>
> 
> Fix multiple interrelated issues in the remain-on-channel (ROC) handling
> that cause deadlocks, race conditions, and resource leaks.
> 
> Problems fixed:
> 
> 1. Deadlock in sta removal ROC abort path:
>     When a station is removed while a ROC operation is in progress, the
>     driver would call mt7925_roc_abort_sync() which waits for ROC completion.
>     However, the ROC work itself needs to acquire mt792x_mutex which is
>     already held during station removal, causing a deadlock.
> 
>     Fix: Use async ROC abort (mt76_connac_mcu_abort_roc) when called from
>     paths that already hold the mutex, and add MT76_STATE_ROC_ABORT flag
>     to coordinate between the abort and the ROC timer.
> 
> 2. ROC timer race during suspend:
>     The ROC timer could fire after the device started suspending but before
>     the ROC was properly aborted, causing undefined behavior.
> 
>     Fix: Delete ROC timer synchronously before suspend and check device
>     state before processing ROC timeout.
> 
> 3. ROC rate limiting for MLO auth failures:
>     Rapid ROC requests during MLO authentication can overwhelm the firmware,
>     causing authentication timeouts. The MT7925 firmware has limited ROC
>     handling capacity.
> 
>     Fix: Add rate limiting infrastructure with configurable minimum interval
>     between ROC requests. Track last ROC completion time and defer new
>     requests if they arrive too quickly.
> 
> 4. WCID leak in ROC cleanup:
>     When ROC operations are aborted, the associated WCID resources were
>     not being properly released, causing resource exhaustion over time.
> 
>     Fix: Ensure WCID cleanup happens in all ROC termination paths.
> 
> 5. Async ROC abort race condition:
>     The async ROC abort could race with normal ROC completion, causing
>     double-free or use-after-free of ROC resources.
> 
>     Fix: Use MT76_STATE_ROC_ABORT flag and proper synchronization to
>     prevent races between async abort and normal completion paths.
> 
> These fixes work together to provide robust ROC handling that doesn't
> deadlock, properly releases resources, and handles edge cases during
> suspend and MLO operations.
> 
> Fixes: c948b5da6bbe ("wifi: mt76: mt7925: add Mediatek Wi-Fi7 driver for mt7925 device")
> Signed-off-by: Zac Bowling <zac@zacbowling.com>

The rate limiting code seems a bit suspicious to me.
What does "limited ROC handling capacity" mean? Outstanding ROC 
requests? Does it need time to settle after a completed ROC?
This needs to be clarified and likely replaced with a more targeted fix.

- Felix
[PATCH 13/13] wifi: mt76: mt7925: fix double wcid initialization race condition
Posted by Zac 2 weeks, 5 days ago
Remove duplicate mt76_wcid_init() call in mt7925_mac_link_sta_add that
occurs after the wcid is already published via rcu_assign_pointer().

The wcid is correctly initialized at line 1023 after allocation.
However, a second mt76_wcid_init() call at line 1036 reinitializes
the wcid after it has been published to RCU readers, which can cause:

 - List head corruption (tx_list, poll_list) if concurrent code is
   already using the wcid
 - Memory leaks from reinitializing the pktid IDR
 - Race conditions where readers see partially initialized state

This appears to be a refactoring error where the duplicate call was
left behind.

Fixes: TBD ("wifi: mt76: mt7925: add MLO support")
Signed-off-by: Zac Bowling <zac@zacbowling.com>
---
 drivers/net/wireless/mediatek/mt76/mt7925/main.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/net/wireless/mediatek/mt76/mt7925/main.c b/drivers/net/wireless/mediatek/mt76/mt7925/main.c
index 4b7c13485497..acce21ad3a29 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7925/main.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7925/main.c
@@ -1033,7 +1033,6 @@ static int mt7925_mac_link_sta_add(struct mt76_dev *mdev,
 	wcid = &mlink->wcid;
 	ewma_signal_init(&wcid->rssi);
 	rcu_assign_pointer(dev->mt76.wcid[wcid->idx], wcid);
-	mt76_wcid_init(wcid, 0);
 	ewma_avg_signal_init(&mlink->avg_ack_signal);
 	memset(mlink->airtime_ac, 0,
 	       sizeof(msta->deflink.airtime_ac));
-- 
2.52.0
Re: [PATCH 11/11] wifi: mt76: mt7925: fix ROC deadlocks and race conditions
Posted by Zac Bowling 2 weeks, 5 days ago
Hi Sean,

Thank you for the detailed feedback and for sharing your deadlock fix.

> 1. Would it be possible to rebase your patchset on top of this fix

Yes, I'll rebase on your patch. I reviewed it, and it's a cleaner solution
than what I implemented. My approach used an async abort with a state flag,
but your `cancel_work()` approach avoids the blocking entirely.

Additionally, last night, after someone ran an AI bot check on my patches,
I found two issues in my current patchset that introduce deadlocks where
your existing patch stops it from hitting.

1. In my patch #3 added I mt792x_mutex_acquire() around
   ieee80211_iterate_active_interfaces(), but this function is called from
   mt7921_mac_sta_add/remove via mt76_sta_add/remove, which already hold
   dev->mutex. I need to remove this mutex wrapper.

2. In my patch #6 I wrapped mt7925_roc_abort_sync() with a mutex in the
   suspend path, but roc_abort_sync calls cancel_work_sync() which can
   deadlock if roc_work is waiting for the mutex. Your fix addresses
   this more elegantly.

I'll prepare a v6 rebased on your patch with these fixes.

> 2. Could you please elaborate on the test scenarios that would trigger
>    ROC rate limiting for MLO authentication failures?

The rate-limiting addresses a real-world scenario we observed with MT7925
when connecting to WiFi 7 APs with MLO + Fast Transition (802.11r) enabled.

When wpa_supplicant attempts Fast Transition roaming between MLO-capable APs,
there's a race condition between disconnect and key setup. The kernel's nl80211
validation requires link_id for MLO group keys (net/wireless/nl80211.c:4828),
but during FT roaming, wdev->valid_links may still be set from the previous
connection when the new key setup begins.

This causes repeated failures:
```
wpa_supplicant: nl80211: kernel reports: link ID must for MLO group key
wpa_supplicant: FT: Failed to set PTK to the driver
```

Each failure triggers a reconnection attempt, which requires ROC commands for
scanning. When these failures happen in rapid succession (we observed 3-4
failures within seconds), the MCU seems to become overwhelmed with messages
like this:

```
Message 00020027 (MCU_UNI_CMD_ROC) timeout
Message 00020027 (MCU_UNI_CMD_ROC) timeout
Message 00020027 (MCU_UNI_CMD_ROC) timeout
```

This leads to firmware reset, which triggers more reconnection attempts,
creating a cascading failure loop.

Reproduction manifests for me at least with:
- Single MT7925 interface in STA mode
- WiFi 7 AP with MLO enabled (multi-link across 5 GHz + 6 GHz)
- 802.11r (Fast Transition) enabled
- Multiple APs with the same SSID (roaming scenario)

I haven't tested with multiple virtual interfaces, but the core issue is
the rapid ROC request rate during the reconnection loop, not the number of
interfaces.

I had someone on the Framework forum post similar dumps showing similar
behavior with their Eeros mesh setup. I'm using some Unifi U7 Pros
with MLO enabled on one of the SSIDs.

So this might not be the right place to fix this. We may need to fix at the
upper-layers. I put this here so folks could work around with my DKMS
package, but a deeper refactor up multiple layers around MLO is probably
needed to really fix this. Fixing here at least validates things are more
stable (but I can't confirm it's really fixed, I don't know what is going
on inside the firmware, and it's internal state issues we can get into).

The root cause is likely in wpa_supplicant/mac80211 (race condition in MLO
key setup timing during FT roaming). However, the rate limiting provides a
defensive measure to prevent firmware crashes. Then I can maybe investigate
the upper-layer issues. Way bigger change, though, unfortunately.

Fix is similar to how TCP implements backoff to handle network congestion -
the congestion isn't TCP's fault, but the backoff prevents cascading failures.

The detailed crash analysis and dmesg logs are in our repository:
https://github.com/zbowling/mt7925/tree/main/crashes

Specifically:
- crash-2026-01-19-mlo-authentication-failure.log (MLO key race analysis)
- crash-2026-01-12-2210-auth-loop-mcu-timeout.log (MCU timeout during auth loop)

If you believe the rate limiting is unnecessary given how ROC operations are
serialized in the firmware, I can remove it. My goal was to prevent the
firmware from entering a reset loop, but if there's a better approach or if
the underlying mac80211/wpa_supplicant issue should be fixed instead now, I'm
happy to adjust. This just seemed to reduce the issue for my MLO setup.

Thank you for offering to prepare an out-of-tree branch - that would be very
helpful for testing the integrated patchset.

Zac Bowling

On Tue, Jan 20, 2026 at 12:25 AM Sean Wang <sean.wang@kernel.org> wrote:
>
> On Tue, Jan 20, 2026 at 12:29 AM Zac <zac@zacbowling.com> wrote:
> >
> > From: Zac Bowling <zac@zacbowling.com>
> >
> > Fix multiple interrelated issues in the remain-on-channel (ROC) handling
> > that cause deadlocks, race conditions, and resource leaks.
> >
> > Problems fixed:
> >
> > 1. Deadlock in sta removal ROC abort path:
> >    When a station is removed while a ROC operation is in progress, the
> >    driver would call mt7925_roc_abort_sync() which waits for ROC completion.
> >    However, the ROC work itself needs to acquire mt792x_mutex which is
> >    already held during station removal, causing a deadlock.
> >
> >    Fix: Use async ROC abort (mt76_connac_mcu_abort_roc) when called from
> >    paths that already hold the mutex, and add MT76_STATE_ROC_ABORT flag
> >    to coordinate between the abort and the ROC timer.
> >
>
> Hi Zac,
>
> Thanks for your continued efforts on the driver.
> We’ve sent a patch to address the mt7925 deadlock at the link below:
> https://lists.infradead.org/pipermail/linux-mediatek/2025-December/102164.html
> We plan to send the same fix to mt7921 as well.
>
> I had a couple of questions and suggestions:
> 1. Would it be possible to rebase your patchset on top of this fix
> (and any other pending patches that are not yet merged)? We noticed
> some conflicts when applying the series, and rebasing it this way
> would make it easier for nbd to integrate the full patchset.
> 2. Could you please elaborate on the test scenarios that would trigger
> ROC rate limiting for MLO authentication failures? If I recall
> correctly, ROC operations are typically handled sequentially unless
> multiple interfaces are created on the same physical device. In that
> case, how many virtual interfaces and which operating modes (GC/STA or
> multiple STAs) are required to reproduce the issue?
>
> I will try to prepare an out-of-tree branch with the current pending
> patches to help your patchset integrate more smoothly. Thanks for
> collecting community issues and fixes and incorporating them into the
> driver.
>
>              Sean
>
> > 2. ROC timer race during suspend:
> >    The ROC timer could fire after the device started suspending but before
> >    the ROC was properly aborted, causing undefined behavior.
> >
> >    Fix: Delete ROC timer synchronously before suspend and check device
> >    state before processing ROC timeout.
> >
> > 3. ROC rate limiting for MLO auth failures:
> >    Rapid ROC requests during MLO authentication can overwhelm the firmware,
> >    causing authentication timeouts. The MT7925 firmware has limited ROC
> >    handling capacity.
> >
> >    Fix: Add rate limiting infrastructure with configurable minimum interval
> >    between ROC requests. Track last ROC completion time and defer new
> >    requests if they arrive too quickly.
> >
> > 4. WCID leak in ROC cleanup:
> >    When ROC operations are aborted, the associated WCID resources were
> >    not being properly released, causing resource exhaustion over time.
> >
> >    Fix: Ensure WCID cleanup happens in all ROC termination paths.
> >
> > 5. Async ROC abort race condition:
> >    The async ROC abort could race with normal ROC completion, causing
> >    double-free or use-after-free of ROC resources.
> >
> >    Fix: Use MT76_STATE_ROC_ABORT flag and proper synchronization to
> >    prevent races between async abort and normal completion paths.
> >
> > These fixes work together to provide robust ROC handling that doesn't
> > deadlock, properly releases resources, and handles edge cases during
> > suspend and MLO operations.
> >
> > Fixes: c948b5da6bbe ("wifi: mt76: mt7925: add Mediatek Wi-Fi7 driver for mt7925 device")
> > Signed-off-by: Zac Bowling <zac@zacbowling.com>
> > ---
> >  drivers/net/wireless/mediatek/mt76/mt76.h     |   1 +
> >  .../net/wireless/mediatek/mt76/mt7925/main.c  | 175 ++++++++++++++++--
> >  drivers/net/wireless/mediatek/mt76/mt792x.h   |   7 +
> >  3 files changed, 170 insertions(+), 13 deletions(-)
> >
> > diff --git a/drivers/net/wireless/mediatek/mt76/mt76.h b/drivers/net/wireless/mediatek/mt76/mt76.h
> > index d05e83ea1cac..91f9dd95c89e 100644
> > --- a/drivers/net/wireless/mediatek/mt76/mt76.h
> > +++ b/drivers/net/wireless/mediatek/mt76/mt76.h
> > @@ -511,6 +511,7 @@ enum {
> >         MT76_STATE_POWER_OFF,
> >         MT76_STATE_SUSPEND,
> >         MT76_STATE_ROC,
> > +       MT76_STATE_ROC_ABORT,
> >         MT76_STATE_PM,
> >         MT76_STATE_WED_RESET,
> >  };
> > diff --git a/drivers/net/wireless/mediatek/mt76/mt7925/main.c b/drivers/net/wireless/mediatek/mt76/mt7925/main.c
> > index cc7ef2c17032..2404f7812897 100644
> > --- a/drivers/net/wireless/mediatek/mt76/mt7925/main.c
> > +++ b/drivers/net/wireless/mediatek/mt76/mt7925/main.c
> > @@ -453,6 +453,24 @@ static void mt7925_roc_iter(void *priv, u8 *mac,
> >         mt7925_mcu_abort_roc(phy, &mvif->bss_conf, phy->roc_token_id);
> >  }
> >
> > +/* Async ROC abort - safe to call while holding mutex.
> > + * Sets abort flag and lets roc_work handle cleanup without blocking.
> > + * This prevents deadlock when called from sta_remove path which holds mutex.
> > + */
> > +static void mt7925_roc_abort_async(struct mt792x_dev *dev)
> > +{
> > +       struct mt792x_phy *phy = &dev->phy;
> > +
> > +       /* Set abort flag - roc_work checks this before acquiring mutex */
> > +       set_bit(MT76_STATE_ROC_ABORT, &phy->mt76->state);
> > +
> > +       /* Stop timer and schedule work to handle cleanup.
> > +        * Must schedule work since timer may not have fired yet.
> > +        */
> > +       timer_delete(&phy->roc_timer);
> > +       ieee80211_queue_work(phy->mt76->hw, &phy->roc_work);
> > +}
> > +
> >  void mt7925_roc_abort_sync(struct mt792x_dev *dev)
> >  {
> >         struct mt792x_phy *phy = &dev->phy;
> > @@ -473,6 +491,17 @@ void mt7925_roc_work(struct work_struct *work)
> >         phy = (struct mt792x_phy *)container_of(work, struct mt792x_phy,
> >                                                 roc_work);
> >
> > +       /* Check abort flag BEFORE acquiring mutex to prevent deadlock.
> > +        * If abort is requested while we're in the sta_remove path (which
> > +        * holds the mutex), we must not try to acquire it or we'll deadlock.
> > +        * Clear the flags and only notify mac80211 if ROC was actually active.
> > +        */
> > +       if (test_and_clear_bit(MT76_STATE_ROC_ABORT, &phy->mt76->state)) {
> > +               if (test_and_clear_bit(MT76_STATE_ROC, &phy->mt76->state))
> > +                       ieee80211_remain_on_channel_expired(phy->mt76->hw);
> > +               return;
> > +       }
> > +
> >         if (!test_and_clear_bit(MT76_STATE_ROC, &phy->mt76->state))
> >                 return;
> >
> > @@ -500,14 +529,93 @@ static int mt7925_abort_roc(struct mt792x_phy *phy,
> >         return err;
> >  }
> >
> > +/* ROC rate limiting constants - exponential backoff to prevent MCU overload
> > + * when upper layers trigger rapid reconnection cycles (e.g., MLO auth failures).
> > + * Max backoff ~1.6s, resets after 10s of no timeouts.
> > + */
> > +#define MT7925_ROC_BACKOFF_BASE_MS     100
> > +#define MT7925_ROC_BACKOFF_MAX_MS      1600
> > +#define MT7925_ROC_TIMEOUT_RESET_MS    10000
> > +#define MT7925_ROC_TIMEOUT_WARN_THRESH 5
> > +
> > +/* Check if ROC should be throttled due to recent timeouts.
> > + * Returns delay in jiffies if throttling, 0 if OK to proceed.
> > + */
> > +static unsigned long mt7925_roc_throttle_check(struct mt792x_phy *phy)
> > +{
> > +       unsigned long now = jiffies;
> > +
> > +       /* Reset timeout counter if it's been a while since last timeout */
> > +       if (phy->roc_timeout_count &&
> > +           time_after(now, phy->roc_last_timeout +
> > +                      msecs_to_jiffies(MT7925_ROC_TIMEOUT_RESET_MS))) {
> > +               phy->roc_timeout_count = 0;
> > +               phy->roc_backoff_until = 0;
> > +       }
> > +
> > +       /* Check if we're still in backoff period */
> > +       if (phy->roc_backoff_until && time_before(now, phy->roc_backoff_until))
> > +               return phy->roc_backoff_until - now;
> > +
> > +       return 0;
> > +}
> > +
> > +/* Record ROC timeout and calculate backoff period */
> > +static void mt7925_roc_record_timeout(struct mt792x_phy *phy)
> > +{
> > +       unsigned int backoff_ms;
> > +
> > +       phy->roc_last_timeout = jiffies;
> > +       phy->roc_timeout_count++;
> > +
> > +       /* Exponential backoff: 100ms, 200ms, 400ms, 800ms, 1600ms (capped) */
> > +       backoff_ms = MT7925_ROC_BACKOFF_BASE_MS <<
> > +                    min_t(u8, phy->roc_timeout_count - 1, 4);
> > +       if (backoff_ms > MT7925_ROC_BACKOFF_MAX_MS)
> > +               backoff_ms = MT7925_ROC_BACKOFF_MAX_MS;
> > +
> > +       phy->roc_backoff_until = jiffies + msecs_to_jiffies(backoff_ms);
> > +
> > +       /* Warn if we're seeing repeated timeouts - likely upper layer issue */
> > +       if (phy->roc_timeout_count == MT7925_ROC_TIMEOUT_WARN_THRESH)
> > +               dev_warn(phy->dev->mt76.dev,
> > +                        "mt7925: %u consecutive ROC timeouts, possible mac80211/wpa_supplicant issue (MLO key race?)\n",
> > +                        phy->roc_timeout_count);
> > +}
> > +
> > +/* Clear timeout tracking on successful ROC */
> > +static void mt7925_roc_clear_timeout(struct mt792x_phy *phy)
> > +{
> > +       phy->roc_timeout_count = 0;
> > +       phy->roc_backoff_until = 0;
> > +}
> > +
> >  static int mt7925_set_roc(struct mt792x_phy *phy,
> >                           struct mt792x_bss_conf *mconf,
> >                           struct ieee80211_channel *chan,
> >                           int duration,
> >                           enum mt7925_roc_req type)
> >  {
> > +       unsigned long throttle;
> >         int err;
> >
> > +       /* Check rate limiting - if in backoff period, wait or return busy */
> > +       throttle = mt7925_roc_throttle_check(phy);
> > +       if (throttle) {
> > +               /* For short backoffs, wait; for longer ones, return busy */
> > +               if (throttle < msecs_to_jiffies(200)) {
> > +                       msleep(jiffies_to_msecs(throttle));
> > +               } else {
> > +                       dev_dbg(phy->dev->mt76.dev,
> > +                               "mt7925: ROC throttled, %lu ms remaining\n",
> > +                               jiffies_to_msecs(throttle));
> > +                       return -EBUSY;
> > +               }
> > +       }
> > +
> > +       /* Clear stale abort flag from previous ROC */
> > +       clear_bit(MT76_STATE_ROC_ABORT, &phy->mt76->state);
> > +
> >         if (test_and_set_bit(MT76_STATE_ROC, &phy->mt76->state))
> >                 return -EBUSY;
> >
> > @@ -523,7 +631,11 @@ static int mt7925_set_roc(struct mt792x_phy *phy,
> >         if (!wait_event_timeout(phy->roc_wait, phy->roc_grant, 4 * HZ)) {
> >                 mt7925_mcu_abort_roc(phy, mconf, phy->roc_token_id);
> >                 clear_bit(MT76_STATE_ROC, &phy->mt76->state);
> > +               mt7925_roc_record_timeout(phy);
> >                 err = -ETIMEDOUT;
> > +       } else {
> > +               /* Successful ROC - reset timeout tracking */
> > +               mt7925_roc_clear_timeout(phy);
> >         }
> >
> >  out:
> > @@ -534,8 +646,27 @@ static int mt7925_set_mlo_roc(struct mt792x_phy *phy,
> >                               struct mt792x_bss_conf *mconf,
> >                               u16 sel_links)
> >  {
> > +       unsigned long throttle;
> >         int err;
> >
> > +       /* Check rate limiting - MLO ROC is especially prone to rapid-fire
> > +        * during reconnection cycles after MLO authentication failures.
> > +        */
> > +       throttle = mt7925_roc_throttle_check(phy);
> > +       if (throttle) {
> > +               if (throttle < msecs_to_jiffies(200)) {
> > +                       msleep(jiffies_to_msecs(throttle));
> > +               } else {
> > +                       dev_dbg(phy->dev->mt76.dev,
> > +                               "mt7925: MLO ROC throttled, %lu ms remaining\n",
> > +                               jiffies_to_msecs(throttle));
> > +                       return -EBUSY;
> > +               }
> > +       }
> > +
> > +       /* Clear stale abort flag from previous ROC */
> > +       clear_bit(MT76_STATE_ROC_ABORT, &phy->mt76->state);
> > +
> >         if (WARN_ON_ONCE(test_and_set_bit(MT76_STATE_ROC, &phy->mt76->state)))
> >                 return -EBUSY;
> >
> > @@ -550,7 +681,10 @@ static int mt7925_set_mlo_roc(struct mt792x_phy *phy,
> >         if (!wait_event_timeout(phy->roc_wait, phy->roc_grant, 4 * HZ)) {
> >                 mt7925_mcu_abort_roc(phy, mconf, phy->roc_token_id);
> >                 clear_bit(MT76_STATE_ROC, &phy->mt76->state);
> > +               mt7925_roc_record_timeout(phy);
> >                 err = -ETIMEDOUT;
> > +       } else {
> > +               mt7925_roc_clear_timeout(phy);
> >         }
> >
> >  out:
> > @@ -567,6 +701,7 @@ static int mt7925_remain_on_channel(struct ieee80211_hw *hw,
> >         struct mt792x_phy *phy = mt792x_hw_phy(hw);
> >         int err;
> >
> > +       cancel_work_sync(&phy->roc_work);
> >         mt792x_mutex_acquire(phy->dev);
> >         err = mt7925_set_roc(phy, &mvif->bss_conf,
> >                              chan, duration, MT7925_ROC_REQ_ROC);
> > @@ -874,14 +1009,14 @@ static int mt7925_mac_link_sta_add(struct mt76_dev *mdev,
> >         if (!mlink)
> >                 return -EINVAL;
> >
> > -       idx = mt76_wcid_alloc(dev->mt76.wcid_mask, MT792x_WTBL_STA - 1);
> > -       if (idx < 0)
> > -               return -ENOSPC;
> > -
> >         mconf = mt792x_vif_to_link(mvif, link_id);
> >         if (!mconf)
> >                 return -EINVAL;
> >
> > +       idx = mt76_wcid_alloc(dev->mt76.wcid_mask, MT792x_WTBL_STA - 1);
> > +       if (idx < 0)
> > +               return -ENOSPC;
> > +
> >         mt76_wcid_init(&mlink->wcid, 0);
> >         mlink->wcid.sta = 1;
> >         mlink->wcid.idx = idx;
> > @@ -901,14 +1036,16 @@ static int mt7925_mac_link_sta_add(struct mt76_dev *mdev,
> >
> >         ret = mt76_connac_pm_wake(&dev->mphy, &dev->pm);
> >         if (ret)
> > -               return ret;
> > +               goto err_wcid;
> >
> >         mt7925_mac_wtbl_update(dev, idx,
> >                                MT_WTBL_UPDATE_ADM_COUNT_CLEAR);
> >
> >         link_conf = mt792x_vif_to_bss_conf(vif, link_id);
> > -       if (!link_conf)
> > -               return -EINVAL;
> > +       if (!link_conf) {
> > +               ret = -EINVAL;
> > +               goto err_wcid;
> > +       }
> >
> >         /* should update bss info before STA add */
> >         if (vif->type == NL80211_IFTYPE_STATION && !link_sta->sta->tdls) {
> > @@ -920,7 +1057,7 @@ static int mt7925_mac_link_sta_add(struct mt76_dev *mdev,
> >                         ret = mt7925_mcu_add_bss_info(&dev->phy, mconf->mt76.ctx,
> >                                                       link_conf, link_sta, false);
> >                 if (ret)
> > -                       return ret;
> > +                       goto err_wcid;
> >         }
> >
> >         if (ieee80211_vif_is_mld(vif) &&
> > @@ -928,28 +1065,34 @@ static int mt7925_mac_link_sta_add(struct mt76_dev *mdev,
> >                 ret = mt7925_mcu_sta_update(dev, link_sta, vif, true,
> >                                             MT76_STA_INFO_STATE_NONE);
> >                 if (ret)
> > -                       return ret;
> > +                       goto err_wcid;
> >         } else if (ieee80211_vif_is_mld(vif) &&
> >                    link_sta != mlink->pri_link) {
> >                 ret = mt7925_mcu_sta_update(dev, mlink->pri_link, vif,
> >                                             true, MT76_STA_INFO_STATE_ASSOC);
> >                 if (ret)
> > -                       return ret;
> > +                       goto err_wcid;
> >
> >                 ret = mt7925_mcu_sta_update(dev, link_sta, vif, true,
> >                                             MT76_STA_INFO_STATE_ASSOC);
> >                 if (ret)
> > -                       return ret;
> > +                       goto err_wcid;
> >         } else {
> >                 ret = mt7925_mcu_sta_update(dev, link_sta, vif, true,
> >                                             MT76_STA_INFO_STATE_NONE);
> >                 if (ret)
> > -                       return ret;
> > +                       goto err_wcid;
> >         }
> >
> >         mt76_connac_power_save_sched(&dev->mphy, &dev->pm);
> >
> >         return 0;
> > +
> > +err_wcid:
> > +       rcu_assign_pointer(dev->mt76.wcid[idx], NULL);
> > +       mt76_wcid_mask_clear(dev->mt76.wcid_mask, idx);
> > +       mt76_connac_power_save_sched(&dev->mphy, &dev->pm);
> > +       return ret;
> >  }
> >
> >  static int
> > @@ -1135,7 +1278,8 @@ static void mt7925_mac_link_sta_remove(struct mt76_dev *mdev,
> >         if (!mlink)
> >                 return;
> >
> > -       mt7925_roc_abort_sync(dev);
> > +       /* Async abort - caller already holds mutex */
> > +       mt7925_roc_abort_async(dev);
> >
> >         mt76_connac_free_pending_tx_skbs(&dev->pm, &mlink->wcid);
> >         mt76_connac_pm_wake(&dev->mphy, &dev->pm);
> > @@ -1530,6 +1674,8 @@ static int mt7925_suspend(struct ieee80211_hw *hw,
> >         cancel_delayed_work_sync(&dev->pm.ps_work);
> >         mt76_connac_free_pending_tx_skbs(&dev->pm, NULL);
> >
> > +       /* Cancel ROC before quiescing starts */
> > +       mt7925_roc_abort_sync(dev);
> >         mt792x_mutex_acquire(dev);
> >
> >         clear_bit(MT76_STATE_RUNNING, &phy->mt76->state);
> > @@ -1876,6 +2022,8 @@ static void mt7925_mgd_prepare_tx(struct ieee80211_hw *hw,
> >         u16 duration = info->duration ? info->duration :
> >                        jiffies_to_msecs(HZ);
> >
> > +       cancel_work_sync(&mvif->phy->roc_work);
> > +
> >         mt792x_mutex_acquire(dev);
> >         mt7925_set_roc(mvif->phy, &mvif->bss_conf,
> >                        mvif->bss_conf.mt76.ctx->def.chan, duration,
> > @@ -2033,6 +2181,7 @@ mt7925_change_vif_links(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
> >         if (old_links == new_links)
> >                 return 0;
> >
> > +       cancel_work_sync(&phy->roc_work);
> >         mt792x_mutex_acquire(dev);
> >
> >         for_each_set_bit(link_id, &rem, IEEE80211_MLD_MAX_NUM_LINKS) {
> > diff --git a/drivers/net/wireless/mediatek/mt76/mt792x.h b/drivers/net/wireless/mediatek/mt76/mt792x.h
> > index 8388638ed550..d9c1ea709390 100644
> > --- a/drivers/net/wireless/mediatek/mt76/mt792x.h
> > +++ b/drivers/net/wireless/mediatek/mt76/mt792x.h
> > @@ -186,6 +186,13 @@ struct mt792x_phy {
> >         wait_queue_head_t roc_wait;
> >         u8 roc_token_id;
> >         bool roc_grant;
> > +
> > +       /* ROC rate limiting to prevent MCU overload during rapid reconnection
> > +        * cycles (e.g., MLO authentication failures causing repeated ROC).
> > +        */
> > +       u8 roc_timeout_count;           /* consecutive ROC timeouts */
> > +       unsigned long roc_last_timeout; /* jiffies of last timeout */
> > +       unsigned long roc_backoff_until;/* don't issue ROC until this time */
> >  };
> >
> >  struct mt792x_irq_map {
> > --
> > 2.52.0
> >