From: Jianbo Liu <jianbol@nvidia.com>
The function mlx5_uplink_netdev_get() gets the uplink netdevice
pointer from mdev->mlx5e_res.uplink_netdev. However, the netdevice can
be removed and its pointer cleared when unbound from the mlx5_core.eth
driver. This results in a NULL pointer, causing a kernel panic.
BUG: unable to handle page fault for address: 0000000000001300
#PF: supervisor read access in kernel mode
#PF: error_code(0x0000) - not-present page
PGD 0 P4D 0
Oops: Oops: 0000 [#1] SMP
CPU: 5 UID: 0 PID: 1420 Comm: devlink Not tainted 6.16.0-rc7+ #1 NONE
Hardware name: ...
RIP: 0010:mlx5e_vport_rep_load+0x22a/0x270 [mlx5_core]
Code: ...
RSP: 0018:ffff8881142b78f0 EFLAGS: 00010246
RAX: ffff888104652150 RBX: ffff88810215a6c0 RCX: 0000000000000000
RDX: ffff888104652000 RSI: ffffffffa02b0cc0 RDI: 0000000000000000
RBP: ffff888104652000 R08: ffff8884edaad5c0 R09: 0000000000000000
R10: ffff8881142b78f0 R11: ffff888100716480 R12: ffff88810f1e0000
R13: ffff88810215a6c0 R14: ffff888104652000 R15: ffff8881095301a0
FS: 00007fa008742740(0000) GS:ffff88856a9f6000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000001300 CR3: 0000000106fb6001 CR4: 0000000000372eb0
Call Trace:
<TASK>
mlx5_esw_offloads_rep_load+0x68/0xe0 [mlx5_core]
esw_offloads_enable+0x593/0x910 [mlx5_core]
mlx5_eswitch_enable_locked+0x341/0x420 [mlx5_core]
? is_mp_supported+0x4b/0xa0 [mlx5_core]
mlx5_devlink_eswitch_mode_set+0x17e/0x3a0 [mlx5_core]
devlink_nl_eswitch_set_doit+0x60/0xd0
genl_family_rcv_msg_doit+0xe0/0x130
genl_rcv_msg+0x183/0x290
? devlink_get_from_attrs_lock+0x180/0x180
? devlink_nl_eswitch_get_doit+0x290/0x290
? devlink_nl_pre_doit_port_optional+0x40/0x40
? genl_family_rcv_msg_dumpit+0xf0/0xf0
netlink_rcv_skb+0x4b/0xf0
genl_rcv+0x24/0x40
netlink_unicast+0x255/0x380
? __alloc_skb+0x120/0x1b0
netlink_sendmsg+0x1f3/0x420
__sock_sendmsg+0x38/0x60
__sys_sendto+0x119/0x180
? count_memcg_events+0xec/0x150
? __rseq_handle_notify_resume+0xa9/0x460
? handle_mm_fault+0x12e/0x260
? do_user_addr_fault+0x2a4/0x680
__x64_sys_sendto+0x20/0x30
do_syscall_64+0x53/0x1d0
entry_SYSCALL_64_after_hwframe+0x4b/0x53
Ensure the pointer is valid before use by checking it for NULL. If it
is valid, immediately call netdev_hold() to take a reference, and
preventing the netdevice from being freed while it is in use.
Fixes: 7a9fb35e8c3a ("net/mlx5e: Do not reload ethernet ports when changing eswitch mode")
Signed-off-by: Jianbo Liu <jianbol@nvidia.com>
Reviewed-by: Cosmin Ratiu <cratiu@nvidia.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Reviewed-by: Dragos Tatulea <dtatulea@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
---
.../net/ethernet/mellanox/mlx5/core/en_rep.c | 26 +++++++++++++++----
.../net/ethernet/mellanox/mlx5/core/esw/qos.c | 1 +
.../ethernet/mellanox/mlx5/core/lib/mlx5.h | 15 ++++++++++-
include/linux/mlx5/driver.h | 1 +
4 files changed, 37 insertions(+), 6 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index 63a7a788fb0d..912887376824 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -1506,12 +1506,20 @@ static const struct mlx5e_profile mlx5e_uplink_rep_profile = {
static int
mlx5e_vport_uplink_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep)
{
- struct mlx5e_priv *priv = netdev_priv(mlx5_uplink_netdev_get(dev));
struct mlx5e_rep_priv *rpriv = mlx5e_rep_to_rep_priv(rep);
+ struct net_device *netdev = mlx5_uplink_netdev_get(dev);
+ struct mlx5e_priv *priv;
+ int err;
+
+ if (!netdev)
+ return 0;
+ priv = netdev_priv(netdev);
rpriv->netdev = priv->netdev;
- return mlx5e_netdev_change_profile(priv, &mlx5e_uplink_rep_profile,
- rpriv);
+ err = mlx5e_netdev_change_profile(priv, &mlx5e_uplink_rep_profile,
+ rpriv);
+ mlx5_uplink_netdev_put(dev, netdev);
+ return err;
}
static void
@@ -1638,8 +1646,16 @@ mlx5e_vport_rep_unload(struct mlx5_eswitch_rep *rep)
{
struct mlx5e_rep_priv *rpriv = mlx5e_rep_to_rep_priv(rep);
struct net_device *netdev = rpriv->netdev;
- struct mlx5e_priv *priv = netdev_priv(netdev);
- void *ppriv = priv->ppriv;
+ struct mlx5e_priv *priv;
+ void *ppriv;
+
+ if (!netdev) {
+ ppriv = rpriv;
+ goto free_ppriv;
+ }
+
+ priv = netdev_priv(netdev);
+ ppriv = priv->ppriv;
if (rep->vport == MLX5_VPORT_UPLINK) {
mlx5e_vport_uplink_rep_unload(rpriv);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c
index 8b4977650183..5f2d6c35f1ad 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c
@@ -1515,6 +1515,7 @@ static u32 mlx5_esw_qos_lag_link_speed_get_locked(struct mlx5_core_dev *mdev)
speed = lksettings.base.speed;
out:
+ mlx5_uplink_netdev_put(mdev, slave);
return speed;
}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h
index b111ccd03b02..74ea5da58b7e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h
@@ -47,7 +47,20 @@ int mlx5_crdump_collect(struct mlx5_core_dev *dev, u32 *cr_data);
static inline struct net_device *mlx5_uplink_netdev_get(struct mlx5_core_dev *mdev)
{
- return mdev->mlx5e_res.uplink_netdev;
+ struct mlx5e_resources *mlx5e_res = &mdev->mlx5e_res;
+ struct net_device *netdev;
+
+ mutex_lock(&mlx5e_res->uplink_netdev_lock);
+ netdev = mlx5e_res->uplink_netdev;
+ netdev_hold(netdev, &mlx5e_res->tracker, GFP_KERNEL);
+ mutex_unlock(&mlx5e_res->uplink_netdev_lock);
+ return netdev;
+}
+
+static inline void mlx5_uplink_netdev_put(struct mlx5_core_dev *mdev,
+ struct net_device *netdev)
+{
+ netdev_put(netdev, &mdev->mlx5e_res.tracker);
}
struct mlx5_sd;
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 8c5fbfb85749..10fe492e1fed 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -663,6 +663,7 @@ struct mlx5e_resources {
bool tisn_valid;
} hw_objs;
struct net_device *uplink_netdev;
+ netdevice_tracker tracker;
struct mutex uplink_netdev_lock;
struct mlx5_crypto_dek_priv *dek_priv;
};
--
2.31.1
On Mon, 8 Sep 2025 13:07:04 +0300 Tariq Toukan wrote: > + struct net_device *netdev = mlx5_uplink_netdev_get(dev); > + struct mlx5e_priv *priv; > + int err; > + > + if (!netdev) > + return 0; Please don't call in variable init functions which require cleanup or error checking.
On 9/10/2025 9:23 AM, Jakub Kicinski wrote: > On Mon, 8 Sep 2025 13:07:04 +0300 Tariq Toukan wrote: >> + struct net_device *netdev = mlx5_uplink_netdev_get(dev); >> + struct mlx5e_priv *priv; >> + int err; >> + >> + if (!netdev) >> + return 0; > > Please don't call in variable init functions which require cleanup > or error checking. But in this function, a NULL return from mlx5_uplink_netdev_get is a valid condition where it should simply return 0. No cleanup or error check is needed. Thanks! Jianbo
On Wed, 10 Sep 2025 11:23:09 +0800 Jianbo Liu wrote: > On 9/10/2025 9:23 AM, Jakub Kicinski wrote: > > On Mon, 8 Sep 2025 13:07:04 +0300 Tariq Toukan wrote: > >> + struct net_device *netdev = mlx5_uplink_netdev_get(dev); > >> + struct mlx5e_priv *priv; > >> + int err; > >> + > >> + if (!netdev) > >> + return 0; > > > > Please don't call in variable init functions which require cleanup > > or error checking. > > But in this function, a NULL return from mlx5_uplink_netdev_get is a > valid condition where it should simply return 0. No cleanup or error > check is needed. You have to check if it succeeded, and if so, you need to clean up later. Do no hide meaningful code in variable init.
On 9/11/2025 8:45 AM, Jakub Kicinski wrote: > On Wed, 10 Sep 2025 11:23:09 +0800 Jianbo Liu wrote: >> On 9/10/2025 9:23 AM, Jakub Kicinski wrote: >>> On Mon, 8 Sep 2025 13:07:04 +0300 Tariq Toukan wrote: >>>> + struct net_device *netdev = mlx5_uplink_netdev_get(dev); >>>> + struct mlx5e_priv *priv; >>>> + int err; >>>> + >>>> + if (!netdev) >>>> + return 0; >>> >>> Please don't call in variable init functions which require cleanup >>> or error checking. >> >> But in this function, a NULL return from mlx5_uplink_netdev_get is a >> valid condition where it should simply return 0. No cleanup or error >> check is needed. > > You have to check if it succeeded, and if so, you need to clean up > later. Do no hide meaningful code in variable init. My focus was on the NULL case, but I see now that the real issue is ensuring the corresponding cleanup (_put) happens on the successful path. Hiding the _get call in the initializer makes that less clear. I will refactor the code to follow the correct pattern, like this: struct net_device *netdev; netdev = mlx5_uplink_netdev_get(dev); if (!netdev) return 0; Thank you for the explanation.
On 9/11/25 09:09, Jianbo Liu wrote: > > > On 9/11/2025 8:45 AM, Jakub Kicinski wrote: >> On Wed, 10 Sep 2025 11:23:09 +0800 Jianbo Liu wrote: >>> On 9/10/2025 9:23 AM, Jakub Kicinski wrote: >>>> On Mon, 8 Sep 2025 13:07:04 +0300 Tariq Toukan wrote: >>>>> + struct net_device *netdev = mlx5_uplink_netdev_get(dev); >>>>> + struct mlx5e_priv *priv; >>>>> + int err; >>>>> + >>>>> + if (!netdev) >>>>> + return 0; >>>> >>>> Please don't call in variable init functions which require cleanup >>>> or error checking. >>> >>> But in this function, a NULL return from mlx5_uplink_netdev_get is a >>> valid condition where it should simply return 0. No cleanup or error >>> check is needed. >> >> You have to check if it succeeded, and if so, you need to clean up >> later. Do no hide meaningful code in variable init. > > My focus was on the NULL case, but I see now that the real issue is > ensuring the corresponding cleanup (_put) happens on the successful > path. Hiding the _get call in the initializer makes that less clear. > > I will refactor the code to follow the correct pattern, like this: > > struct net_device *netdev; > > netdev = mlx5_uplink_netdev_get(dev); > if (!netdev) > return 0; > > Thank you for the explanation. > that would be much better, and make it obvious that there is matched get() and put() calls would be also great to minify stacktrace https://www.kernel.org/doc/html/latest/process/submitting-patches.html#backtraces-in-commit-messages
On 9/12/2025 7:07 PM, Przemek Kitszel wrote: > On 9/11/25 09:09, Jianbo Liu wrote: >> >> >> On 9/11/2025 8:45 AM, Jakub Kicinski wrote: >>> On Wed, 10 Sep 2025 11:23:09 +0800 Jianbo Liu wrote: >>>> On 9/10/2025 9:23 AM, Jakub Kicinski wrote: >>>>> On Mon, 8 Sep 2025 13:07:04 +0300 Tariq Toukan wrote: >>>>>> + struct net_device *netdev = mlx5_uplink_netdev_get(dev); >>>>>> + struct mlx5e_priv *priv; >>>>>> + int err; >>>>>> + >>>>>> + if (!netdev) >>>>>> + return 0; >>>>> >>>>> Please don't call in variable init functions which require cleanup >>>>> or error checking. >>>> >>>> But in this function, a NULL return from mlx5_uplink_netdev_get is a >>>> valid condition where it should simply return 0. No cleanup or error >>>> check is needed. >>> >>> You have to check if it succeeded, and if so, you need to clean up >>> later. Do no hide meaningful code in variable init. >> >> My focus was on the NULL case, but I see now that the real issue is >> ensuring the corresponding cleanup (_put) happens on the successful >> path. Hiding the _get call in the initializer makes that less clear. >> >> I will refactor the code to follow the correct pattern, like this: >> >> struct net_device *netdev; >> >> netdev = mlx5_uplink_netdev_get(dev); >> if (!netdev) >> return 0; >> >> Thank you for the explanation. >> > > that would be much better, and make it obvious that there is > matched get() and put() calls > > would be also great to minify stacktrace > https://www.kernel.org/doc/html/latest/process/submitting- > patches.html#backtraces-in-commit-messages > Got it. I'll minify the stack trace. Thanks for the tip.
© 2016 - 2025 Red Hat, Inc.