From: Nicolas Pitre <npitre@baylibre.com>
Usage of devm_alloc_etherdev_mqs() conflicts with
am65_cpsw_nuss_cleanup_ndev() as the same struct net_device instances
get unregistered twice. Switch to alloc_etherdev_mqs() and make sure
am65_cpsw_nuss_cleanup_ndev() unregisters and frees those net_device
instances properly.
With this, it is finally possible to rmmod the driver without oopsing
the kernel.
Fixes: 93a76530316a ("net: ethernet: ti: introduce am65x/j721e gigabit eth subsystem driver")
Signed-off-by: Nicolas Pitre <npitre@baylibre.com>
---
drivers/net/ethernet/ti/am65-cpsw-nuss.c | 20 ++++++++++++--------
1 file changed, 12 insertions(+), 8 deletions(-)
diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c
index f6bc8a4dc6..e95457c988 100644
--- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c
+++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c
@@ -2744,10 +2744,9 @@ am65_cpsw_nuss_init_port_ndev(struct am65_cpsw_common *common, u32 port_idx)
return 0;
/* alloc netdev */
- port->ndev = devm_alloc_etherdev_mqs(common->dev,
- sizeof(struct am65_cpsw_ndev_priv),
- AM65_CPSW_MAX_QUEUES,
- AM65_CPSW_MAX_QUEUES);
+ port->ndev = alloc_etherdev_mqs(sizeof(struct am65_cpsw_ndev_priv),
+ AM65_CPSW_MAX_QUEUES,
+ AM65_CPSW_MAX_QUEUES);
if (!port->ndev) {
dev_err(dev, "error allocating slave net_device %u\n",
port->port_id);
@@ -2868,8 +2867,12 @@ static void am65_cpsw_nuss_cleanup_ndev(struct am65_cpsw_common *common)
for (i = 0; i < common->port_num; i++) {
port = &common->ports[i];
- if (port->ndev && port->ndev->reg_state == NETREG_REGISTERED)
+ if (!port->ndev)
+ continue;
+ if (port->ndev->reg_state == NETREG_REGISTERED)
unregister_netdev(port->ndev);
+ free_netdev(port->ndev);
+ port->ndev = NULL;
}
}
@@ -3613,16 +3616,17 @@ static int am65_cpsw_nuss_probe(struct platform_device *pdev)
ret = am65_cpsw_nuss_init_ndevs(common);
if (ret)
- goto err_free_phylink;
+ goto err_ndevs_clear;
ret = am65_cpsw_nuss_register_ndevs(common);
if (ret)
- goto err_free_phylink;
+ goto err_ndevs_clear;
pm_runtime_put(dev);
return 0;
-err_free_phylink:
+err_ndevs_clear:
+ am65_cpsw_nuss_cleanup_ndev(common);
am65_cpsw_nuss_phylink_cleanup(common);
am65_cpts_release(common->cpts);
err_of_clear:
--
2.46.1
On 04/10/2024 07:10, Nicolas Pitre wrote:
> From: Nicolas Pitre <npitre@baylibre.com>
>
> Usage of devm_alloc_etherdev_mqs() conflicts with
> am65_cpsw_nuss_cleanup_ndev() as the same struct net_device instances
> get unregistered twice. Switch to alloc_etherdev_mqs() and make sure
> am65_cpsw_nuss_cleanup_ndev() unregisters and frees those net_device
> instances properly.
>
> With this, it is finally possible to rmmod the driver without oopsing
> the kernel.
>
> Fixes: 93a76530316a ("net: ethernet: ti: introduce am65x/j721e gigabit eth subsystem driver")
> Signed-off-by: Nicolas Pitre <npitre@baylibre.com>
Reviewed-by: Roger Quadros <roger@kernel.org>
Hi Nicolas,
On 04/10/2024 07:10, Nicolas Pitre wrote:
> From: Nicolas Pitre <npitre@baylibre.com>
>
> Usage of devm_alloc_etherdev_mqs() conflicts with
> am65_cpsw_nuss_cleanup_ndev() as the same struct net_device instances
> get unregistered twice. Switch to alloc_etherdev_mqs() and make sure
Do we know why the same net device gets unregistered twice?
> am65_cpsw_nuss_cleanup_ndev() unregisters and frees those net_device
> instances properly.
>
> With this, it is finally possible to rmmod the driver without oopsing
> the kernel.
>
> Fixes: 93a76530316a ("net: ethernet: ti: introduce am65x/j721e gigabit eth subsystem driver")
> Signed-off-by: Nicolas Pitre <npitre@baylibre.com>
> ---
> drivers/net/ethernet/ti/am65-cpsw-nuss.c | 20 ++++++++++++--------
> 1 file changed, 12 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c
> index f6bc8a4dc6..e95457c988 100644
> --- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c
> +++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c
> @@ -2744,10 +2744,9 @@ am65_cpsw_nuss_init_port_ndev(struct am65_cpsw_common *common, u32 port_idx)
> return 0;
>
> /* alloc netdev */
> - port->ndev = devm_alloc_etherdev_mqs(common->dev,
> - sizeof(struct am65_cpsw_ndev_priv),
> - AM65_CPSW_MAX_QUEUES,
> - AM65_CPSW_MAX_QUEUES);
> + port->ndev = alloc_etherdev_mqs(sizeof(struct am65_cpsw_ndev_priv),
> + AM65_CPSW_MAX_QUEUES,
> + AM65_CPSW_MAX_QUEUES);
Can we solve this issue without doing this change as
there are many error cases relying on devm managed freeing of netdev.
> if (!port->ndev) {
> dev_err(dev, "error allocating slave net_device %u\n",
> port->port_id);
> @@ -2868,8 +2867,12 @@ static void am65_cpsw_nuss_cleanup_ndev(struct am65_cpsw_common *common)
>
> for (i = 0; i < common->port_num; i++) {
> port = &common->ports[i];
> - if (port->ndev && port->ndev->reg_state == NETREG_REGISTERED)
> + if (!port->ndev)
> + continue;
> + if (port->ndev->reg_state == NETREG_REGISTERED)
> unregister_netdev(port->ndev);
> + free_netdev(port->ndev);
> + port->ndev = NULL;
I still can't see what we are doing wrong in existing code.
> }
> }
>
> @@ -3613,16 +3616,17 @@ static int am65_cpsw_nuss_probe(struct platform_device *pdev)
>
> ret = am65_cpsw_nuss_init_ndevs(common);
> if (ret)
> - goto err_free_phylink;
> + goto err_ndevs_clear;
>
> ret = am65_cpsw_nuss_register_ndevs(common);
> if (ret)
> - goto err_free_phylink;
> + goto err_ndevs_clear;
>
> pm_runtime_put(dev);
> return 0;
>
> -err_free_phylink:
> +err_ndevs_clear:
> + am65_cpsw_nuss_cleanup_ndev(common);
> am65_cpsw_nuss_phylink_cleanup(common);
> am65_cpts_release(common->cpts);
> err_of_clear:
--
cheers,
-roger
On Fri, 4 Oct 2024, Roger Quadros wrote:
> Hi Nicolas,
>
> On 04/10/2024 07:10, Nicolas Pitre wrote:
> > From: Nicolas Pitre <npitre@baylibre.com>
> >
> > Usage of devm_alloc_etherdev_mqs() conflicts with
> > am65_cpsw_nuss_cleanup_ndev() as the same struct net_device instances
> > get unregistered twice. Switch to alloc_etherdev_mqs() and make sure
>
> Do we know why the same net device gets unregistered twice?
When using devm_alloc_etherdev_mqs() every successful allocation is put
in a resource list tied to the device. When the driver is removed,
there's a net device unregister from am65_cpsw_nuss_cleanup_ndev() and
another one from devm_free_netdev().
We established in patch #1 that net devices must be unregistered before
devlink_port_unregister() is invoked, meaning we can't rely on the
implicit devm_free_netdev() as it happens too late, hence the explicit
am65_cpsw_nuss_cleanup_ndev().
> > am65_cpsw_nuss_cleanup_ndev() unregisters and frees those net_device
> > instances properly.
> >
> > With this, it is finally possible to rmmod the driver without oopsing
> > the kernel.
> >
> > Fixes: 93a76530316a ("net: ethernet: ti: introduce am65x/j721e gigabit eth subsystem driver")
> > Signed-off-by: Nicolas Pitre <npitre@baylibre.com>
> > ---
> > drivers/net/ethernet/ti/am65-cpsw-nuss.c | 20 ++++++++++++--------
> > 1 file changed, 12 insertions(+), 8 deletions(-)
> >
> > diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c
> > index f6bc8a4dc6..e95457c988 100644
> > --- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c
> > +++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c
> > @@ -2744,10 +2744,9 @@ am65_cpsw_nuss_init_port_ndev(struct am65_cpsw_common *common, u32 port_idx)
> > return 0;
> >
> > /* alloc netdev */
> > - port->ndev = devm_alloc_etherdev_mqs(common->dev,
> > - sizeof(struct am65_cpsw_ndev_priv),
> > - AM65_CPSW_MAX_QUEUES,
> > - AM65_CPSW_MAX_QUEUES);
> > + port->ndev = alloc_etherdev_mqs(sizeof(struct am65_cpsw_ndev_priv),
> > + AM65_CPSW_MAX_QUEUES,
> > + AM65_CPSW_MAX_QUEUES);
>
> Can we solve this issue without doing this change as
> there are many error cases relying on devm managed freeing of netdev.
If you know of a way to do this differently I'm all ears.
About the many error cases needing the freeing of net devices, as far as
I know they're all covered with this patch.
> I still can't see what we are doing wrong in existing code.
Did you try to rmmod this driver lately?
Nicolas
On 04/10/2024 18:37, Nicolas Pitre wrote:
> On Fri, 4 Oct 2024, Roger Quadros wrote:
>
>> Hi Nicolas,
>>
>> On 04/10/2024 07:10, Nicolas Pitre wrote:
>>> From: Nicolas Pitre <npitre@baylibre.com>
>>>
>>> Usage of devm_alloc_etherdev_mqs() conflicts with
>>> am65_cpsw_nuss_cleanup_ndev() as the same struct net_device instances
>>> get unregistered twice. Switch to alloc_etherdev_mqs() and make sure
>>
>> Do we know why the same net device gets unregistered twice?
>
> When using devm_alloc_etherdev_mqs() every successful allocation is put
> in a resource list tied to the device. When the driver is removed,
> there's a net device unregister from am65_cpsw_nuss_cleanup_ndev() and
> another one from devm_free_netdev().
I couldn't find out where devm_free_netdev() calls unregister_netdev().
Also, we didn't use devm_register_netdev(), so the resource manager will
not call unregister_netdev().
>
> We established in patch #1 that net devices must be unregistered before
> devlink_port_unregister() is invoked, meaning we can't rely on the
> implicit devm_free_netdev() as it happens too late, hence the explicit
> am65_cpsw_nuss_cleanup_ndev().
>
>>> am65_cpsw_nuss_cleanup_ndev() unregisters and frees those net_device
>>> instances properly.
>>>
>>> With this, it is finally possible to rmmod the driver without oopsing
>>> the kernel.
>>>
>>> Fixes: 93a76530316a ("net: ethernet: ti: introduce am65x/j721e gigabit eth subsystem driver")
>>> Signed-off-by: Nicolas Pitre <npitre@baylibre.com>
>>> ---
>>> drivers/net/ethernet/ti/am65-cpsw-nuss.c | 20 ++++++++++++--------
>>> 1 file changed, 12 insertions(+), 8 deletions(-)
>>>
>>> diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c
>>> index f6bc8a4dc6..e95457c988 100644
>>> --- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c
>>> +++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c
>>> @@ -2744,10 +2744,9 @@ am65_cpsw_nuss_init_port_ndev(struct am65_cpsw_common *common, u32 port_idx)
>>> return 0;
>>>
>>> /* alloc netdev */
>>> - port->ndev = devm_alloc_etherdev_mqs(common->dev,
>>> - sizeof(struct am65_cpsw_ndev_priv),
>>> - AM65_CPSW_MAX_QUEUES,
>>> - AM65_CPSW_MAX_QUEUES);
>>> + port->ndev = alloc_etherdev_mqs(sizeof(struct am65_cpsw_ndev_priv),
>>> + AM65_CPSW_MAX_QUEUES,
>>> + AM65_CPSW_MAX_QUEUES);
>>
>> Can we solve this issue without doing this change as
>> there are many error cases relying on devm managed freeing of netdev.
>
> If you know of a way to do this differently I'm all ears.
I sent another approach already. Please check:
https://lore.kernel.org/all/67c9ede4-9751-4255-b752-27dd60495ff3@kernel.org/
>
> About the many error cases needing the freeing of net devices, as far as
> I know they're all covered with this patch.
No, they are not. You now have to explicitly call free_netdev() in the error paths of am65_cpsw_nuss_init_port_ndev().
I see 3 places directly returning an error code, namely:
default:
dev_err(dev, "selected phy-mode is not supported\n");
return -EOPNOTSUPP;
}
...
if (IS_ERR(phylink))
return PTR_ERR(phylink);
...
ndev_priv->stats = netdev_alloc_pcpu_stats(struct am65_cpsw_ndev_stats);
if (!ndev_priv->stats)
return -ENOMEM;
>
>> I still can't see what we are doing wrong in existing code.
>
> Did you try to rmmod this driver lately?
Yes and it throws an oops, so we do need a fix.
>
>
> Nicolas
--
cheers,
-roger
On Fri, 4 Oct 2024, Roger Quadros wrote:
> > If you know of a way to do this differently I'm all ears.
>
> I sent another approach already. please check.
> https://lore.kernel.org/all/67c9ede4-9751-4255-b752-27dd60495ff3@kernel.org/
Seems to work correctly.
Still... given this paragraph found in Documentation/process/maintainer-netdev.rst:
|Netdev remains skeptical about promises of all "auto-cleanup" APIs,
|including even ``devm_`` helpers, historically. They are not the preferred
|style of implementation, merely an acceptable one.
and given my solution is way simpler, I tend to also prefer it over yours.
But I'm not the maintainer nor even a significant contributor here so as
long as the issue is fixed I won't mind.
> > About the many error cases needing the freeing of net devices, as far as
> > I know they're all covered with this patch.
>
> No they are not.
As I said yesterday, I do still stand by my affirmation that they are.
Please look at the entire return path and you'll see that everything is
covered.
Nicolas
On 05/10/2024 23:26, Nicolas Pitre wrote:
> On Fri, 4 Oct 2024, Roger Quadros wrote:
>
>>> If you know of a way to do this differently I'm all ears.
>>
>> I sent another approach already. please check.
>> https://lore.kernel.org/all/67c9ede4-9751-4255-b752-27dd60495ff3@kernel.org/
>
> Seems to work correctly.
>
> Still... given this paragraph found in Documentation/process/maintainer-netdev.rst:
>
> |Netdev remains skeptical about promises of all "auto-cleanup" APIs,
> |including even ``devm_`` helpers, historically. They are not the preferred
> |style of implementation, merely an acceptable one.
>
> and given my solution is way simpler, I tend to also prefer it over yours.
OK. Let's go with yours as it makes the driver more compliant to netdev guidelines.
>
> But I'm not the maintainer nor even a significant contributor here so as
> long as the issue is fixed I won't mind.
>
>>> About the many error cases needing the freeing of net devices, as far as
>>> I know they're all covered with this patch.
>>
>> No they are not.
>
> As I said yesterday, I do still stand by my affirmation that they are.
> Please look at the entire return path and you'll see that everything is
> covered.
Indeed, my bad. It wasn't obvious by just looking at the patch but when looking at the code it is called via am65_cpsw_nuss_cleanup_ndev().
>
>
> Nicolas
--
cheers,
-roger
On Fri, 4 Oct 2024, Roger Quadros wrote:
>
>
> On 04/10/2024 18:37, Nicolas Pitre wrote:
> >>> diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c
> >>> index f6bc8a4dc6..e95457c988 100644
> >>> --- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c
> >>> +++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c
> >>> @@ -2744,10 +2744,9 @@ am65_cpsw_nuss_init_port_ndev(struct am65_cpsw_common *common, u32 port_idx)
> >>> return 0;
> >>>
> >>> /* alloc netdev */
> >>> - port->ndev = devm_alloc_etherdev_mqs(common->dev,
> >>> - sizeof(struct am65_cpsw_ndev_priv),
> >>> - AM65_CPSW_MAX_QUEUES,
> >>> - AM65_CPSW_MAX_QUEUES);
> >>> + port->ndev = alloc_etherdev_mqs(sizeof(struct am65_cpsw_ndev_priv),
> >>> + AM65_CPSW_MAX_QUEUES,
> >>> + AM65_CPSW_MAX_QUEUES);
> >>
> >> Can we solve this issue without doing this change as
> >> there are many error cases relying on devm managed freeing of netdev.
> >
> > If you know of a way to do this differently I'm all ears.
>
> I sent another approach already. please check.
Slowly being built.
> > About the many error cases needing the freeing of net devices, as far as
> > I know they're all covered with this patch.
>
> No they are not. you now have to explicitly call free_netdev() in error paths of am65_cpsw_nuss_init_port_ndev().
And it does. If am65_cpsw_nuss_init_ndevs() fails then it frees them all.
Same as with am65_cpsw_nuss_phylink_cleanup().
Nicolas
Hi Nicolas,
On 04/10/2024 12:09, Roger Quadros wrote:
> Hi Nicolas,
>
> On 04/10/2024 07:10, Nicolas Pitre wrote:
>> From: Nicolas Pitre <npitre@baylibre.com>
>>
>> Usage of devm_alloc_etherdev_mqs() conflicts with
>> am65_cpsw_nuss_cleanup_ndev() as the same struct net_device instances
>> get unregistered twice. Switch to alloc_etherdev_mqs() and make sure
>
> Do we know why the same net device gets unregistered twice?
On some boards there are 2 net devices per CPSW, so is it those 2
that are getting unregistered?
After some investigation I found that the issue has to do with napi_list.
I don't know exactly why, but it oopses in free_netdev() while iterating
over napi_list:
list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
netif_napi_del(p);
If we cleanup the napi list at remove then I don't see the oops anymore.
>
>> am65_cpsw_nuss_cleanup_ndev() unregisters and frees those net_device
>> instances properly.
>>
>> With this, it is finally possible to rmmod the driver without oopsing
>> the kernel.
>>
>> Fixes: 93a76530316a ("net: ethernet: ti: introduce am65x/j721e gigabit eth subsystem driver")
>> Signed-off-by: Nicolas Pitre <npitre@baylibre.com>
>> ---
Can you please try the below patch instead?
diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c
index f6bc8a4dc687..e214547aeba7 100644
--- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c
+++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c
@@ -2206,14 +2206,11 @@ static void am65_cpsw_nuss_free_tx_chns(void *data)
}
}
-static void am65_cpsw_nuss_remove_tx_chns(struct am65_cpsw_common *common)
+static void am65_cpsw_nuss_cleanup_tx_napi(struct am65_cpsw_common *common)
{
struct device *dev = common->dev;
int i;
- devm_remove_action(dev, am65_cpsw_nuss_free_tx_chns, common);
-
- common->tx_ch_rate_msk = 0;
for (i = 0; i < common->tx_ch_num; i++) {
struct am65_cpsw_tx_chn *tx_chn = &common->tx_chns[i];
@@ -2222,7 +2219,15 @@ static void am65_cpsw_nuss_remove_tx_chns(struct am65_cpsw_common *common)
netif_napi_del(&tx_chn->napi_tx);
}
+}
+
+static void am65_cpsw_nuss_remove_tx_chns(struct am65_cpsw_common *common)
+{
+ struct device *dev = common->dev;
+ devm_remove_action(dev, am65_cpsw_nuss_free_tx_chns, common);
+ common->tx_ch_rate_msk = 0;
+ am65_cpsw_nuss_cleanup_tx_napi(common);
am65_cpsw_nuss_free_tx_chns(common);
}
@@ -2355,25 +2360,27 @@ static void am65_cpsw_nuss_free_rx_chns(void *data)
k3_udma_glue_release_rx_chn(rx_chn->rx_chn);
}
-static void am65_cpsw_nuss_remove_rx_chns(struct am65_cpsw_common *common)
+static void am65_cpsw_nuss_cleanup_rx_napi(struct am65_cpsw_common *common)
{
struct device *dev = common->dev;
- struct am65_cpsw_rx_chn *rx_chn;
struct am65_cpsw_rx_flow *flows;
int i;
- rx_chn = &common->rx_chns;
- flows = rx_chn->flows;
- devm_remove_action(dev, am65_cpsw_nuss_free_rx_chns, common);
-
+ flows = common->rx_chns.flows;
for (i = 0; i < common->rx_ch_num_flows; i++) {
if (!(flows[i].irq < 0))
devm_free_irq(dev, flows[i].irq, &flows[i]);
netif_napi_del(&flows[i].napi_rx);
}
+}
- am65_cpsw_nuss_free_rx_chns(common);
+static void am65_cpsw_nuss_remove_rx_chns(struct am65_cpsw_common *common)
+{
+ struct device *dev = common->dev;
+ devm_remove_action(dev, am65_cpsw_nuss_free_rx_chns, common);
+ am65_cpsw_nuss_cleanup_rx_napi(common);
+ am65_cpsw_nuss_free_rx_chns(common);
common->rx_flow_id_base = -1;
}
@@ -2871,6 +2878,9 @@ static void am65_cpsw_nuss_cleanup_ndev(struct am65_cpsw_common *common)
if (port->ndev && port->ndev->reg_state == NETREG_REGISTERED)
unregister_netdev(port->ndev);
}
+
+ am65_cpsw_nuss_cleanup_rx_napi(common);
+ am65_cpsw_nuss_cleanup_tx_napi(common);
}
static void am65_cpsw_port_offload_fwd_mark_update(struct am65_cpsw_common *common)
© 2016 - 2026 Red Hat, Inc.