[PATCH net v2] net: tg3: guard napi_disable and pci_disable_device calls

Yury M. posted 1 patch 4 days, 5 hours ago
drivers/net/ethernet/broadcom/tg3.c | 19 +++++++++++++++++--
drivers/net/ethernet/broadcom/tg3.h |  1 +
2 files changed, 18 insertions(+), 2 deletions(-)
[PATCH net v2] net: tg3: guard napi_disable and pci_disable_device calls
Posted by Yury M. 4 days, 5 hours ago
During PCIe hot-plug events, uncorrectable errors can be reported and
AER recovery for the tg3 device is initiated by the AER kernel driver.
The tg3_io_error_detected function is the AER error recovery handler.

 From tg3_io_error_detected, we call tg3_netif_stop->tg3_napi_disable->
napi_disable and return PCI_ERS_RESULT_NEED_RESET on non-fatal error.
We expect that during AER recovery tg3_io_slot_reset and tg3_io_resume
will be called. But AER error recovery can fail. For example, when one
of PCIe devices on the same bus reports PCI_ERS_RESULT_NO_AER_DRIVER.
As a result, tg3_io_slot_reset and tg3_io_resume are not called, PCIe
device is disabled and NAPI is disabled (pci_disable_device and
napi_disable are called from tg3_io_error_detected). Then we can try to
disable PCIe link and napi_disable will be called again:

   napi_disable+0x1b/0x1b0
   tg3_napi_disable+0x89/0xa0 [tg3]
   tg3_netif_stop+0x37/0xe3 [tg3]
   tg3_stop+0x30/0x160 [tg3]
   tg3_close+0x2a/0x60 [tg3]
   __dev_close_many+0xad/0x130
   dev_close_many+0xb2/0x190
   unregister_netdevice_many_notify+0x19d/0xa00
   unregister_netdevice_queue+0xf8/0x140
   unregister_netdev+0x1c/0x30
   tg3_remove_one+0xaa/0x150 [tg3]
   pci_device_remove+0x42/0xb0
   device_release_driver_internal+0x19c/0x200
   pci_stop_bus_device+0x85/0xb0
   pci_stop_bus_device+0x2c/0xb0
   pci_stop_bus_device+0x2c/0xb0
   pci_stop_and_remove_bus_device+0x12/0x20
   pciehp_unconfigure_device+0x9f/0x160
   pciehp_disable_slot+0x67/0x100
   pciehp_handle_presence_or_link_change+0x77/0x350

This is not expected by napi_disable and a thread can be locked in
napi_disable forever. We have pcierr_recovery to cover a similar issue,
but for fatal errors. We cannot reuse this flag because it is reset in
tg3_io_resume, but it is not called when AER recovery fails.

Similarly, if an AER error is reported and tg3_io_error_detected calls
pci_disable_device, a subsequent device removal via tg3_remove_one or
tg3_shutdown will call pci_disable_device again for the already-disabled
device.

Add a napi_enabled flag to struct tg3 to track whether napi_enable has
been called. Guard tg3_napi_disable() against being called before
tg3_napi_enable(), logging an error if that happens. Also guard
pci_disable_device() calls in tg3_remove_one() and tg3_shutdown() with
pci_is_enabled() to avoid disabling an already-disabled device.

Fixes: b45aa2f6192e ("tg3: Add EEH support")
Signed-off-by: Yury Murashka <yurypm@arista.com>
---
  drivers/net/ethernet/broadcom/tg3.c | 19 +++++++++++++++++--
  drivers/net/ethernet/broadcom/tg3.h |  1 +
  2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/tg3.c 
b/drivers/net/ethernet/broadcom/tg3.c
index 73a4b569b..500b6f7fa 100644
--- a/drivers/net/ethernet/broadcom/tg3.c
+++ b/drivers/net/ethernet/broadcom/tg3.c
@@ -7396,8 +7396,18 @@ static void tg3_napi_disable(struct tg3 *tp)
  	int txq_idx = tp->txq_cnt - 1;
  	int rxq_idx = tp->rxq_cnt - 1;
  	struct tg3_napi *tnapi;
+	struct net_device *netdev = tp->dev;
  	int i;
  +	if (!tp->napi_enabled) {
+		netdev_err(netdev, "%s() called when napi_enable wasn't called 
before, netif_running=%d, pci_enabled=%d\n",
+			   __func__, netif_running(netdev),
+			   pci_is_enabled(tp->pdev));
+		return;
+	}
+
+	tp->napi_enabled = false;
+
  	for (i = tp->irq_cnt - 1; i >= 0; i--) {
  		tnapi = &tp->napi[i];
  		if (tnapi->tx_buffers) {
@@ -7420,6 +7430,8 @@ static void tg3_napi_enable(struct tg3 *tp)
  	struct tg3_napi *tnapi;
  	int i;
  +	tp->napi_enabled = true;
+
  	for (i = 0; i < tp->irq_cnt; i++) {
  		tnapi = &tp->napi[i];
  		napi_enable_locked(&tnapi->napi);
@@ -17718,6 +17730,7 @@ static int tg3_init_one(struct pci_dev *pdev,
  	tp->tx_mode = TG3_DEF_TX_MODE;
  	tp->irq_sync = 1;
  	tp->pcierr_recovery = false;
+	tp->napi_enabled = false;
   	if (tg3_debug > 0)
  		tp->msg_enable = tg3_debug;
@@ -18099,7 +18112,8 @@ static void tg3_remove_one(struct pci_dev *pdev)
  		}
  		free_netdev(dev);
  		pci_release_regions(pdev);
-		pci_disable_device(pdev);
+		if (pci_is_enabled(pdev))
+			pci_disable_device(pdev);
  	}
  }
  @@ -18257,7 +18271,8 @@ static void tg3_shutdown(struct pci_dev *pdev)
   	rtnl_unlock();
  -	pci_disable_device(pdev);
+	if (pci_is_enabled(pdev))
+		pci_disable_device(pdev);
  }
   /**
diff --git a/drivers/net/ethernet/broadcom/tg3.h 
b/drivers/net/ethernet/broadcom/tg3.h
index a9e7f88fa..34fb771e8 100644
--- a/drivers/net/ethernet/broadcom/tg3.h
+++ b/drivers/net/ethernet/broadcom/tg3.h
@@ -3429,6 +3429,7 @@ struct tg3 {
  	struct device			*hwmon_dev;
  	bool				link_up;
  	bool				pcierr_recovery;
+	bool				napi_enabled;
   	u32                             ape_hb;
  	unsigned long                   ape_hb_interval;
-- 
2.51.0
Re: [PATCH net v2] net: tg3: guard napi_disable and pci_disable_device calls
Posted by Pavan Chebbi 3 days, 11 hours ago
On Wed, May 20, 2026 at 10:05 PM Yury M. <yurypm@arista.com> wrote:
>
> During PCIe hot-plug events, uncorrectable errors can be reported and
> AER recovery for the tg3 device is initiated by the AER kernel driver.
> The tg3_io_error_detected function is the AER error recovery handler.
>
>  From tg3_io_error_detected, we call tg3_netif_stop->tg3_napi_disable->
> napi_disable and return PCI_ERS_RESULT_NEED_RESET on non-fatal error.
> We expect that during AER recovery tg3_io_slot_reset and tg3_io_resume
> will be called. But AER error recovery can fail. For example, when one
> of PCIe devices on the same bus reports PCI_ERS_RESULT_NO_AER_DRIVER.
> As a result, tg3_io_slot_reset and tg3_io_resume are not called, PCIe
> device is disabled and NAPI is disabled (pci_disable_device and
> napi_disable are called from tg3_io_error_detected). Then we can try to
> disable PCIe link and napi_disable will be called again:
>
>    napi_disable+0x1b/0x1b0
>    tg3_napi_disable+0x89/0xa0 [tg3]
>    tg3_netif_stop+0x37/0xe3 [tg3]
>    tg3_stop+0x30/0x160 [tg3]
>    tg3_close+0x2a/0x60 [tg3]
>    __dev_close_many+0xad/0x130
>    dev_close_many+0xb2/0x190
>    unregister_netdevice_many_notify+0x19d/0xa00
>    unregister_netdevice_queue+0xf8/0x140
>    unregister_netdev+0x1c/0x30
>    tg3_remove_one+0xaa/0x150 [tg3]
>    pci_device_remove+0x42/0xb0
>    device_release_driver_internal+0x19c/0x200
>    pci_stop_bus_device+0x85/0xb0
>    pci_stop_bus_device+0x2c/0xb0
>    pci_stop_bus_device+0x2c/0xb0
>    pci_stop_and_remove_bus_device+0x12/0x20
>    pciehp_unconfigure_device+0x9f/0x160
>    pciehp_disable_slot+0x67/0x100
>    pciehp_handle_presence_or_link_change+0x77/0x350
>
> This is not expected by napi_disable and a thread can be locked in
> napi_disable forever. We have pcierr_recovery to cover a similar issue,
> but for fatal errors. We cannot reuse this flag because it is reset in
> tg3_io_resume, but it is not called when AER recovery fails.
>
> Similarly, if an AER error is reported and tg3_io_error_detected calls
> pci_disable_device, a subsequent device removal via tg3_remove_one or
> tg3_shutdown will call pci_disable_device again for the already-disabled
> device.
>
> Add a napi_enabled flag to struct tg3 to track whether napi_enable has
> been called. Guard tg3_napi_disable() against being called before
> tg3_napi_enable(), logging an error if that happens. Also guard
> pci_disable_device() calls in tg3_remove_one() and tg3_shutdown() with
> pci_is_enabled() to avoid disabling an already-disabled device.
>
> Fixes: b45aa2f6192e ("tg3: Add EEH support")
> Signed-off-by: Yury Murashka <yurypm@arista.com>
> ---

For some reason I cannot see this patch in the patchwork.

>   drivers/net/ethernet/broadcom/tg3.c | 19 +++++++++++++++++--
>   drivers/net/ethernet/broadcom/tg3.h |  1 +
>   2 files changed, 18 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/net/ethernet/broadcom/tg3.c
> b/drivers/net/ethernet/broadcom/tg3.c
> index 73a4b569b..500b6f7fa 100644
> --- a/drivers/net/ethernet/broadcom/tg3.c
> +++ b/drivers/net/ethernet/broadcom/tg3.c
> @@ -7396,8 +7396,18 @@ static void tg3_napi_disable(struct tg3 *tp)
>         int txq_idx = tp->txq_cnt - 1;
>         int rxq_idx = tp->rxq_cnt - 1;
>         struct tg3_napi *tnapi;
> +       struct net_device *netdev = tp->dev;

Should maintain reverse-xmas tree for declarations.
(Sorry I was focused more on the need for the patch than the actual
changes during v1. I could have given these comments then itself but
anyway here they are)

>         int i;
>   +     if (!tp->napi_enabled) {
> +               netdev_err(netdev, "%s() called when napi_enable wasn't called
> before, netif_running=%d, pci_enabled=%d\n",
> +                          __func__, netif_running(netdev),
> +                          pci_is_enabled(tp->pdev));

This trace looks more like a debug print. Can be removed.

> +               return;
> +       }
> +
> +       tp->napi_enabled = false;
> +
>         for (i = tp->irq_cnt - 1; i >= 0; i--) {
>                 tnapi = &tp->napi[i];
>                 if (tnapi->tx_buffers) {
> @@ -7420,6 +7430,8 @@ static void tg3_napi_enable(struct tg3 *tp)
>         struct tg3_napi *tnapi;
>         int i;

Leave a line. Please checkpatch.pl once before sending.

>   +     tp->napi_enabled = true;
> +
>         for (i = 0; i < tp->irq_cnt; i++) {
>                 tnapi = &tp->napi[i];
>                 napi_enable_locked(&tnapi->napi);
> @@ -17718,6 +17730,7 @@ static int tg3_init_one(struct pci_dev *pdev,
>         tp->tx_mode = TG3_DEF_TX_MODE;
>         tp->irq_sync = 1;
>         tp->pcierr_recovery = false;
> +       tp->napi_enabled = false;
>         if (tg3_debug > 0)
>                 tp->msg_enable = tg3_debug;
> @@ -18099,7 +18112,8 @@ static void tg3_remove_one(struct pci_dev *pdev)
>                 }
>                 free_netdev(dev);
>                 pci_release_regions(pdev);
> -               pci_disable_device(pdev);
> +               if (pci_is_enabled(pdev))
> +                       pci_disable_device(pdev);
>         }
>   }
>   @@ -18257,7 +18271,8 @@ static void tg3_shutdown(struct pci_dev *pdev)
>         rtnl_unlock();
>   -     pci_disable_device(pdev);
> +       if (pci_is_enabled(pdev))
> +               pci_disable_device(pdev);
>   }
>    /**
> diff --git a/drivers/net/ethernet/broadcom/tg3.h
> b/drivers/net/ethernet/broadcom/tg3.h
> index a9e7f88fa..34fb771e8 100644
> --- a/drivers/net/ethernet/broadcom/tg3.h
> +++ b/drivers/net/ethernet/broadcom/tg3.h
> @@ -3429,6 +3429,7 @@ struct tg3 {
>         struct device                   *hwmon_dev;
>         bool                            link_up;
>         bool                            pcierr_recovery;
> +       bool                            napi_enabled;
>         u32                             ape_hb;
>         unsigned long                   ape_hb_interval;
> --
> 2.51.0
>