The commit being fixed ended up taking mutex_lock() inside a spin_lock_bh()
section, which triggers a warning:

  BUG: sleeping function called from invalid context at...

Fix this by moving the IPsec list deletion to bond_ipsec_free_sa(), which is
not called with spin_lock_bh() held.

Additionally, remove the entry from bond->ipsec_list in
bond_ipsec_del_sa_all() when the XFRM state is DEAD, to prevent
xdo_dev_state_free() from being triggered again in bond_ipsec_free_sa().

For bond_ipsec_free_sa(), there are now three conditions:

1. if (!slave): When no active device exists.
2. if (!xs->xso.real_dev): When xdo_dev_state_add() fails.
3. if (xs->xso.real_dev != real_dev): When an xs has already been freed
   by bond_ipsec_del_sa_all() due to migration, and the active slave has
   changed to a new device. At the same time, the xs is marked as DEAD
   because the XFRM entry was removed, triggering xfrm_state_gc_task() and
   bond_ipsec_free_sa().

In all three cases, xdo_dev_state_free() should not be called; only the xs
should be removed from the bond->ipsec list (the three checks are sketched
below).
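
A condensed, illustrative sketch of how the three checks line up (not a
literal copy of the hunk below, which folds the real_dev handling into the
bond->ipsec_list walk):

	static void bond_ipsec_free_sa(struct xfrm_state *xs)
	{
		...
		if (!slave)                         /* 1: no active device */
			goto out;
		...
		if (!xs->xso.real_dev)              /* 2: xdo_dev_state_add() failed */
			goto out;
		...
		if (xs->xso.real_dev != real_dev)   /* 3: already freed by        */
			goto out;                   /*    bond_ipsec_del_sa_all() */
		...
	out:
		netdev_put(real_dev, &tracker);
	}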

At the same time, protect bond_ipsec_del_sa_all() and bond_ipsec_add_sa_all()
with x->lock for each xs being processed. This prevents XFRM from
concurrently initiating add/delete operations on the managed states.

Fixes: 2aeeef906d5a ("bonding: change ipsec_lock from spin lock to mutex")
Reported-by: Jakub Kicinski <kuba@kernel.org>
Closes: https://lore.kernel.org/netdev/20241212062734.182a0164@kernel.org
Suggested-by: Cosmin Ratiu <cratiu@nvidia.com>
Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
---
drivers/net/bonding/bond_main.c | 53 +++++++++++++++++++++++----------
1 file changed, 37 insertions(+), 16 deletions(-)
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index e45bba240cbc..06b060d9b031 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -537,15 +537,22 @@ static void bond_ipsec_add_sa_all(struct bonding *bond)
}
list_for_each_entry(ipsec, &bond->ipsec_list, list) {
+ spin_lock_bh(&ipsec->xs->lock);
+ /* Skip dead xfrm states, they'll be freed later. */
+ if (ipsec->xs->km.state == XFRM_STATE_DEAD)
+ goto next;
+
/* If new state is added before ipsec_lock acquired */
if (ipsec->xs->xso.real_dev == real_dev)
- continue;
+ goto next;
ipsec->xs->xso.real_dev = real_dev;
if (real_dev->xfrmdev_ops->xdo_dev_state_add(ipsec->xs, NULL)) {
slave_warn(bond_dev, real_dev, "%s: failed to add SA\n", __func__);
ipsec->xs->xso.real_dev = NULL;
}
+next:
+ spin_unlock_bh(&ipsec->xs->lock);
}
out:
mutex_unlock(&bond->ipsec_lock);
@@ -560,7 +567,6 @@ static void bond_ipsec_del_sa(struct xfrm_state *xs)
struct net_device *bond_dev = xs->xso.dev;
struct net_device *real_dev;
netdevice_tracker tracker;
- struct bond_ipsec *ipsec;
struct bonding *bond;
struct slave *slave;
@@ -592,15 +598,6 @@ static void bond_ipsec_del_sa(struct xfrm_state *xs)
real_dev->xfrmdev_ops->xdo_dev_state_delete(xs);
out:
netdev_put(real_dev, &tracker);
- mutex_lock(&bond->ipsec_lock);
- list_for_each_entry(ipsec, &bond->ipsec_list, list) {
- if (ipsec->xs == xs) {
- list_del(&ipsec->list);
- kfree(ipsec);
- break;
- }
- }
- mutex_unlock(&bond->ipsec_lock);
}
static void bond_ipsec_del_sa_all(struct bonding *bond)
@@ -617,8 +614,18 @@ static void bond_ipsec_del_sa_all(struct bonding *bond)
mutex_lock(&bond->ipsec_lock);
list_for_each_entry(ipsec, &bond->ipsec_list, list) {
+ spin_lock_bh(&ipsec->xs->lock);
if (!ipsec->xs->xso.real_dev)
- continue;
+ goto next;
+
+ if (ipsec->xs->km.state == XFRM_STATE_DEAD) {
+ /* already dead no need to delete again */
+ if (real_dev->xfrmdev_ops->xdo_dev_state_free)
+ real_dev->xfrmdev_ops->xdo_dev_state_free(ipsec->xs);
+ list_del(&ipsec->list);
+ kfree(ipsec);
+ goto next;
+ }
if (!real_dev->xfrmdev_ops ||
!real_dev->xfrmdev_ops->xdo_dev_state_delete ||
@@ -631,6 +638,8 @@ static void bond_ipsec_del_sa_all(struct bonding *bond)
if (real_dev->xfrmdev_ops->xdo_dev_state_free)
real_dev->xfrmdev_ops->xdo_dev_state_free(ipsec->xs);
}
+next:
+ spin_unlock_bh(&ipsec->xs->lock);
}
mutex_unlock(&bond->ipsec_lock);
}
@@ -640,6 +649,7 @@ static void bond_ipsec_free_sa(struct xfrm_state *xs)
struct net_device *bond_dev = xs->xso.dev;
struct net_device *real_dev;
netdevice_tracker tracker;
+ struct bond_ipsec *ipsec;
struct bonding *bond;
struct slave *slave;
@@ -659,11 +669,22 @@ static void bond_ipsec_free_sa(struct xfrm_state *xs)
if (!xs->xso.real_dev)
goto out;
- WARN_ON(xs->xso.real_dev != real_dev);
+ mutex_lock(&bond->ipsec_lock);
+ list_for_each_entry(ipsec, &bond->ipsec_list, list) {
+ if (ipsec->xs == xs) {
+ /* do xdo_dev_state_free if real_dev matches,
+ * otherwise only remove the list
+ */
+ if (real_dev && real_dev->xfrmdev_ops &&
+ real_dev->xfrmdev_ops->xdo_dev_state_free)
+ real_dev->xfrmdev_ops->xdo_dev_state_free(xs);
+ list_del(&ipsec->list);
+ kfree(ipsec);
+ break;
+ }
+ }
+ mutex_unlock(&bond->ipsec_lock);
- if (real_dev && real_dev->xfrmdev_ops &&
- real_dev->xfrmdev_ops->xdo_dev_state_free)
- real_dev->xfrmdev_ops->xdo_dev_state_free(xs);
out:
netdev_put(real_dev, &tracker);
}
--
2.46.0
On 3/4/25 15:11, Hangbin Liu wrote:
> The fixed commit placed mutex_lock() inside spin_lock_bh(), which triggers
> a warning:
>
> BUG: sleeping function called from invalid context at...
>
> Fix this by moving the IPsec deletion operation to bond_ipsec_free_sa,
> which is not held by spin_lock_bh().
>
> Additionally, delete the IPsec list in bond_ipsec_del_sa_all() when the
> XFRM state is DEAD to prevent xdo_dev_state_free() from being triggered
> again in bond_ipsec_free_sa().
>
> For bond_ipsec_free_sa(), there are now three conditions:
>
> 1. if (!slave): When no active device exists.
> 2. if (!xs->xso.real_dev): When xdo_dev_state_add() fails.
> 3. if (xs->xso.real_dev != real_dev): When an xs has already been freed
> by bond_ipsec_del_sa_all() due to migration, and the active slave has
> changed to a new device. At the same time, the xs is marked as DEAD
> due to the XFRM entry is removed, triggering xfrm_state_gc_task() and
> bond_ipsec_free_sa().
>
> In all three cases, xdo_dev_state_free() should not be called, only xs
> should be removed from bond->ipsec list.
>
> At the same time, protect bond_ipsec_del_sa_all and bond_ipsec_add_sa_all
> with x->lock for each xs being processed. This prevents XFRM from
> concurrently initiating add/delete operations on the managed states.
>
> Fixes: 2aeeef906d5a ("bonding: change ipsec_lock from spin lock to mutex")
> Reported-by: Jakub Kicinski <kuba@kernel.org>
> Closes: https://lore.kernel.org/netdev/20241212062734.182a0164@kernel.org
> Suggested-by: Cosmin Ratiu <cratiu@nvidia.com>
> Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
> ---
> drivers/net/bonding/bond_main.c | 53 +++++++++++++++++++++++----------
> 1 file changed, 37 insertions(+), 16 deletions(-)
>
> diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
> index e45bba240cbc..06b060d9b031 100644
> --- a/drivers/net/bonding/bond_main.c
> +++ b/drivers/net/bonding/bond_main.c
> @@ -537,15 +537,22 @@ static void bond_ipsec_add_sa_all(struct bonding *bond)
> }
>
> list_for_each_entry(ipsec, &bond->ipsec_list, list) {
> + spin_lock_bh(&ipsec->xs->lock);
> + /* Skip dead xfrm states, they'll be freed later. */
> + if (ipsec->xs->km.state == XFRM_STATE_DEAD)
> + goto next;
> +
> /* If new state is added before ipsec_lock acquired */
> if (ipsec->xs->xso.real_dev == real_dev)
> - continue;
> + goto next;
>
> ipsec->xs->xso.real_dev = real_dev;
> if (real_dev->xfrmdev_ops->xdo_dev_state_add(ipsec->xs, NULL)) {
> slave_warn(bond_dev, real_dev, "%s: failed to add SA\n", __func__);
> ipsec->xs->xso.real_dev = NULL;
> }
> +next:
> + spin_unlock_bh(&ipsec->xs->lock);
> }
> out:
> mutex_unlock(&bond->ipsec_lock);
> @@ -560,7 +567,6 @@ static void bond_ipsec_del_sa(struct xfrm_state *xs)
> struct net_device *bond_dev = xs->xso.dev;
> struct net_device *real_dev;
> netdevice_tracker tracker;
> - struct bond_ipsec *ipsec;
> struct bonding *bond;
> struct slave *slave;
>
> @@ -592,15 +598,6 @@ static void bond_ipsec_del_sa(struct xfrm_state *xs)
> real_dev->xfrmdev_ops->xdo_dev_state_delete(xs);
> out:
> netdev_put(real_dev, &tracker);
> - mutex_lock(&bond->ipsec_lock);
> - list_for_each_entry(ipsec, &bond->ipsec_list, list) {
> - if (ipsec->xs == xs) {
> - list_del(&ipsec->list);
> - kfree(ipsec);
> - break;
> - }
> - }
> - mutex_unlock(&bond->ipsec_lock);
> }
>
> static void bond_ipsec_del_sa_all(struct bonding *bond)
> @@ -617,8 +614,18 @@ static void bond_ipsec_del_sa_all(struct bonding *bond)
>
> mutex_lock(&bond->ipsec_lock);
> list_for_each_entry(ipsec, &bond->ipsec_list, list) {
Second time - you should use list_for_each_entry_safe if you're walking and deleting
elements from the list.
> + spin_lock_bh(&ipsec->xs->lock);
> if (!ipsec->xs->xso.real_dev)
> - continue;
> + goto next;
> +
> + if (ipsec->xs->km.state == XFRM_STATE_DEAD) {
> + /* already dead no need to delete again */
> + if (real_dev->xfrmdev_ops->xdo_dev_state_free)
> + real_dev->xfrmdev_ops->xdo_dev_state_free(ipsec->xs);
Have you checked if .xdo_dev_state_free can sleep?
I see at least one that can: mlx5e_xfrm_free_state().
> + list_del(&ipsec->list);
> + kfree(ipsec);
> + goto next;
> + }
>
> if (!real_dev->xfrmdev_ops ||
> !real_dev->xfrmdev_ops->xdo_dev_state_delete ||
> @@ -631,6 +638,8 @@ static void bond_ipsec_del_sa_all(struct bonding *bond)
> if (real_dev->xfrmdev_ops->xdo_dev_state_free)
> real_dev->xfrmdev_ops->xdo_dev_state_free(ipsec->xs);
> }
> +next:
> + spin_unlock_bh(&ipsec->xs->lock);
> }
> mutex_unlock(&bond->ipsec_lock);
> }
> @@ -640,6 +649,7 @@ static void bond_ipsec_free_sa(struct xfrm_state *xs)
> struct net_device *bond_dev = xs->xso.dev;
> struct net_device *real_dev;
> netdevice_tracker tracker;
> + struct bond_ipsec *ipsec;
> struct bonding *bond;
> struct slave *slave;
>
> @@ -659,11 +669,22 @@ static void bond_ipsec_free_sa(struct xfrm_state *xs)
> if (!xs->xso.real_dev)
> goto out;
>
> - WARN_ON(xs->xso.real_dev != real_dev);
> + mutex_lock(&bond->ipsec_lock);
> + list_for_each_entry(ipsec, &bond->ipsec_list, list) {
> + if (ipsec->xs == xs) {
> + /* do xdo_dev_state_free if real_dev matches,
> + * otherwise only remove the list
> + */
> + if (real_dev && real_dev->xfrmdev_ops &&
> + real_dev->xfrmdev_ops->xdo_dev_state_free)
> + real_dev->xfrmdev_ops->xdo_dev_state_free(xs);
> + list_del(&ipsec->list);
> + kfree(ipsec);
> + break;
> + }
> + }
> + mutex_unlock(&bond->ipsec_lock);
>
> - if (real_dev && real_dev->xfrmdev_ops &&
> - real_dev->xfrmdev_ops->xdo_dev_state_free)
> - real_dev->xfrmdev_ops->xdo_dev_state_free(xs);
> out:
> netdev_put(real_dev, &tracker);
> }
On Wed, Mar 05, 2025 at 10:38:36AM +0200, Nikolay Aleksandrov wrote:
> > @@ -617,8 +614,18 @@ static void bond_ipsec_del_sa_all(struct bonding *bond)
> >
> > mutex_lock(&bond->ipsec_lock);
> > list_for_each_entry(ipsec, &bond->ipsec_list, list) {
>
> Second time - you should use list_for_each_entry_safe if you're walking and deleting
> elements from the list.

Sorry, I missed this comment. I will update it in the next version.

>
> > + spin_lock_bh(&ipsec->xs->lock);
> > if (!ipsec->xs->xso.real_dev)
> > - continue;
> > + goto next;
> > +
> > + if (ipsec->xs->km.state == XFRM_STATE_DEAD) {
> > + /* already dead no need to delete again */
> > + if (real_dev->xfrmdev_ops->xdo_dev_state_free)
> > + real_dev->xfrmdev_ops->xdo_dev_state_free(ipsec->xs);
>
> Have you checked if .xdo_dev_state_free can sleep?
> I see at least one that can: mlx5e_xfrm_free_state().

Hmm, this brings us back to the initial problem. We tried to avoid calling
sleeping functions while holding a spin lock (in bond_ipsec_del_sa()), but
now the new code runs into the same issue again.

With your reply, I also checked xdo_dev_state_add() in
bond_ipsec_add_sa_all(), which may also sleep, e.g. mlx5e_xfrm_add_state().

If we unlock the spin lock, the race comes back again.

Any idea about this?
thanks
Hangbin
On Wed, 2025-03-05 at 14:13 +0000, Hangbin Liu wrote:
> On Wed, Mar 05, 2025 at 10:38:36AM +0200, Nikolay Aleksandrov wrote:
> > > @@ -617,8 +614,18 @@ static void bond_ipsec_del_sa_all(struct bonding *bond)
> > >
> > > mutex_lock(&bond->ipsec_lock);
> > > list_for_each_entry(ipsec, &bond->ipsec_list, list) {
> >
> > Second time - you should use list_for_each_entry_safe if you're
> > walking and deleting
> > elements from the list.
>
> Sorry, I missed this comment. I will update in next version.
>
> >
> > > + spin_lock_bh(&ipsec->xs->lock);
> > > if (!ipsec->xs->xso.real_dev)
> > > - continue;
> > > + goto next;
> > > +
> > > + if (ipsec->xs->km.state == XFRM_STATE_DEAD) {
> > > +		/* already dead no need to delete again */
> > > +		if (real_dev->xfrmdev_ops->xdo_dev_state_free)
> > > +			real_dev->xfrmdev_ops->xdo_dev_state_free(ipsec->xs);
> >
> > Have you checked if .xdo_dev_state_free can sleep?
> > I see at least one that can: mlx5e_xfrm_free_state().
>
> Hmm, This brings us back to the initial problem. We tried to avoid
> calling
> a spin lock in a sleep context (bond_ipsec_del_sa), but now the new
> code
> encounters this issue again.
The reason the mutex was added (instead of the spinlock used before)
was exactly because the add and free offload operations could sleep.
> With your reply, I also checked the xdo_dev_state_add() in
> bond_ipsec_add_sa_all(), which may also sleep, e.g.
> mlx5e_xfrm_add_state(),
>
> If we unlock the spin lock, then the race came back again.
>
> Any idea about this?
The race is between bond_ipsec_del_sa_all and bond_ipsec_del_sa (plus
bond_ipsec_free_sa). The issue is that when bond_ipsec_del_sa_all
releases x->lock, bond_ipsec_del_sa can immediately be called, followed
by bond_ipsec_free_sa.
Maybe dropping x->lock after setting real_dev to NULL? I checked,
real_dev is not used anywhere on the free calls, I think. I have
another series refactoring things around real_dev, I hope to be able to
send it soon.
Here's a sketch of this idea:

--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -613,8 +613,11 @@ static void bond_ipsec_del_sa_all(struct bonding *bond)

 	mutex_lock(&bond->ipsec_lock);
 	list_for_each_entry(ipsec, &bond->ipsec_list, list) {
-		if (!ipsec->xs->xso.real_dev)
+		spin_lock(&ipsec->x->lock);
+		if (!ipsec->xs->xso.real_dev) {
+			spin_unlock(&ipsec->x->lock);
 			continue;
+		}

 		if (!real_dev->xfrmdev_ops ||
 		    !real_dev->xfrmdev_ops->xdo_dev_state_delete ||
@@ -622,12 +625,16 @@ static void bond_ipsec_del_sa_all(struct bonding *bond)
 			slave_warn(bond_dev, real_dev,
 				   "%s: no slave xdo_dev_state_delete\n",
 				   __func__);
-		} else {
-			real_dev->xfrmdev_ops->xdo_dev_state_delete(real_dev, ipsec->xs);
-			if (real_dev->xfrmdev_ops->xdo_dev_state_free)
-				real_dev->xfrmdev_ops->xdo_dev_state_free(ipsec->xs);
-			ipsec->xs->xso.real_dev = NULL;
+			spin_unlock(&ipsec->x->lock);
+			continue;
 		}
+
+		real_dev->xfrmdev_ops->xdo_dev_state_delete(real_dev, ipsec->xs);
+		ipsec->xs->xso.real_dev = NULL;
+		/* Unlock before freeing device state, it could sleep. */
+		spin_unlock(&ipsec->x->lock);
+		if (real_dev->xfrmdev_ops->xdo_dev_state_free)
+			real_dev->xfrmdev_ops->xdo_dev_state_free(ipsec->xs);

Cosmin.
On Wed, Mar 05, 2025 at 04:12:18PM +0000, Cosmin Ratiu wrote:
> +++ b/drivers/net/bonding/bond_main.c
> @@ -613,8 +613,11 @@ static void bond_ipsec_del_sa_all(struct bonding *bond)
>
>  	mutex_lock(&bond->ipsec_lock);
>  	list_for_each_entry(ipsec, &bond->ipsec_list, list) {
> -		if (!ipsec->xs->xso.real_dev)
> +		spin_lock(&ipsec->x->lock);
> +		if (!ipsec->xs->xso.real_dev) {
> +			spin_unlock(&ipsec->x->lock);
>  			continue;
> +		}
>
>  		if (!real_dev->xfrmdev_ops ||
>  		    !real_dev->xfrmdev_ops->xdo_dev_state_delete ||
> @@ -622,12 +625,16 @@ static void bond_ipsec_del_sa_all(struct bonding *bond)
>  			slave_warn(bond_dev, real_dev,
>  				   "%s: no slave xdo_dev_state_delete\n",
>  				   __func__);
> -		} else {
> -			real_dev->xfrmdev_ops->xdo_dev_state_delete(real_dev, ipsec->xs);
> -			if (real_dev->xfrmdev_ops->xdo_dev_state_free)
> -				real_dev->xfrmdev_ops->xdo_dev_state_free(ipsec->xs);
> -			ipsec->xs->xso.real_dev = NULL;
> +			spin_unlock(&ipsec->x->lock);
> +			continue;
>  		}
> +
> +		real_dev->xfrmdev_ops->xdo_dev_state_delete(real_dev, ipsec->xs);
> +		ipsec->xs->xso.real_dev = NULL;
> +		/* Unlock before freeing device state, it could sleep. */
> +		spin_unlock(&ipsec->x->lock);
> +		if (real_dev->xfrmdev_ops->xdo_dev_state_free)
> +			real_dev->xfrmdev_ops->xdo_dev_state_free(ipsec->xs);

BTW, with setting real_dev = NULL here, I think
> To fix that, these entries should be freed here and the WARN_ON in
> bond_ipsec_free_sa() should be converted to an if...goto out, so that
> bond_ipsec_free_sa() calls would hit one of these conditions:
> 1. "if (!slave)", when no active device exists.
> 2. "if (!xs->xso.real_dev)", when xdo_dev_state_add() failed.
> 3. "if (xs->xso.real_dev != real_dev)", when a DEAD xs was already
> freed by bond_ipsec_del_sa_all() migration to a new device.
> In all 3 cases, xdo_dev_state_free() shouldn't be called, only xs
> removed from the bond->ipsec list.
The if (xs->xso.real_dev != real_dev) condition should never happen again.
As real_dev is set to NULL, it will hit case 2, "if (!xs->xso.real_dev)",
directly.

And bond_ipsec_add_sa_all() will set ipsec->xs->xso.real_dev = real_dev
again, at which point the active slave change has already finished.
Thanks
Hangbin
On Wed, Mar 05, 2025 at 04:12:18PM +0000, Cosmin Ratiu wrote:
> On Wed, 2025-03-05 at 14:13 +0000, Hangbin Liu wrote:
> > On Wed, Mar 05, 2025 at 10:38:36AM +0200, Nikolay Aleksandrov wrote:
> > > > @@ -617,8 +614,18 @@ static void bond_ipsec_del_sa_all(struct bonding *bond)
> > > >
> > > > mutex_lock(&bond->ipsec_lock);
> > > > list_for_each_entry(ipsec, &bond->ipsec_list, list) {
> > >
> > > Second time - you should use list_for_each_entry_safe if you're
> > > walking and deleting
> > > elements from the list.
> >
> > Sorry, I missed this comment. I will update in next version.
> >
> > >
> > > > + spin_lock_bh(&ipsec->xs->lock);
> > > > if (!ipsec->xs->xso.real_dev)
> > > > - continue;
> > > > + goto next;
> > > > +
> > > > + if (ipsec->xs->km.state == XFRM_STATE_DEAD) {
> > > > +		/* already dead no need to delete again */
> > > > +		if (real_dev->xfrmdev_ops->xdo_dev_state_free)
> > > > +			real_dev->xfrmdev_ops->xdo_dev_state_free(ipsec->xs);
> > >
> > > Have you checked if .xdo_dev_state_free can sleep?
> > > I see at least one that can: mlx5e_xfrm_free_state().
> >
> > Hmm, This brings us back to the initial problem. We tried to avoid
> > calling
> > a spin lock in a sleep context (bond_ipsec_del_sa), but now the new
> > code
> > encounters this issue again.
>
> The reason the mutex was added (instead of the spinlock used before)
> was exactly because the add and free offload operations could sleep.
>
> > With your reply, I also checked the xdo_dev_state_add() in
> > bond_ipsec_add_sa_all(), which may also sleep, e.g.
> > mlx5e_xfrm_add_state(),
> >
> > If we unlock the spin lock, then the race came back again.
> >
> > Any idea about this?
>
> The race is between bond_ipsec_del_sa_all and bond_ipsec_del_sa (plus
> bond_ipsec_free_sa). The issue is that when bond_ipsec_del_sa_all
> releases x->lock, bond_ipsec_del_sa can immediately be called, followed
> by bond_ipsec_free_sa.
> Maybe dropping x->lock after setting real_dev to NULL? I checked,
> real_dev is not used anywhere on the free calls, I think. I have
> another series refactoring things around real_dev, I hope to be able to
> send it soon.
>
> Here's a sketch of this idea:
>
> --- a/drivers/net/bonding/bond_main.c
> +++ b/drivers/net/bonding/bond_main.c
> @@ -613,8 +613,11 @@ static void bond_ipsec_del_sa_all(struct bonding *bond)
>
>  	mutex_lock(&bond->ipsec_lock);
>  	list_for_each_entry(ipsec, &bond->ipsec_list, list) {
> -		if (!ipsec->xs->xso.real_dev)
> +		spin_lock(&ipsec->x->lock);
> +		if (!ipsec->xs->xso.real_dev) {
> +			spin_unlock(&ipsec->x->lock);
>  			continue;
> +		}
>
>  		if (!real_dev->xfrmdev_ops ||
>  		    !real_dev->xfrmdev_ops->xdo_dev_state_delete ||
> @@ -622,12 +625,16 @@ static void bond_ipsec_del_sa_all(struct bonding *bond)
>  			slave_warn(bond_dev, real_dev,
>  				   "%s: no slave xdo_dev_state_delete\n",
>  				   __func__);
> -		} else {
> -			real_dev->xfrmdev_ops->xdo_dev_state_delete(real_dev, ipsec->xs);
> -			if (real_dev->xfrmdev_ops->xdo_dev_state_free)
> -				real_dev->xfrmdev_ops->xdo_dev_state_free(ipsec->xs);
> -			ipsec->xs->xso.real_dev = NULL;
> +			spin_unlock(&ipsec->x->lock);
> +			continue;
>  		}
> +
> +		real_dev->xfrmdev_ops->xdo_dev_state_delete(real_dev, ipsec->xs);
> +		ipsec->xs->xso.real_dev = NULL;

Setting xs->xso.real_dev = NULL is a good idea, as we will then bail out in
bond_ipsec_del_sa()/bond_ipsec_free_sa() when there is no xs->xso.real_dev.

For bond_ipsec_add_sa_all(), I will move the xso.real_dev = real_dev
assignment after .xdo_dev_state_add() to handle the following situation:

bond_ipsec_add_sa_all()
    spin_unlock(&ipsec->x->lock);
    ipsec->xs->xso.real_dev = real_dev;
                                        __xfrm_state_delete x->state = DEAD
                                         - bond_ipsec_del_sa()
                                           - .xdo_dev_state_delete()
    .xdo_dev_state_add()

Thanks
Hangbin

> +		/* Unlock before freeing device state, it could sleep. */
> +		spin_unlock(&ipsec->x->lock);
> +		if (real_dev->xfrmdev_ops->xdo_dev_state_free)
> +			real_dev->xfrmdev_ops->xdo_dev_state_free(ipsec->xs);
>
> Cosmin.
On Thu, Mar 06, 2025 at 09:37:53AM +0000, Hangbin Liu wrote:
> >
> > The reason the mutex was added (instead of the spinlock used before)
> > was exactly because the add and free offload operations could sleep.
> >
> > > With your reply, I also checked the xdo_dev_state_add() in
> > > bond_ipsec_add_sa_all(), which may also sleep, e.g.
> > > mlx5e_xfrm_add_state(),
> > >
> > > If we unlock the spin lock, then the race came back again.
> > >
> > > Any idea about this?
> >
> > The race is between bond_ipsec_del_sa_all and bond_ipsec_del_sa (plus
> > bond_ipsec_free_sa). The issue is that when bond_ipsec_del_sa_all
> > releases x->lock, bond_ipsec_del_sa can immediately be called, followed
> > by bond_ipsec_free_sa.
> > Maybe dropping x->lock after setting real_dev to NULL? I checked,
> > real_dev is not used anywhere on the free calls, I think. I have
> > another series refactoring things around real_dev, I hope to be able to
> > send it soon.
> >
> > Here's a sketch of this idea:
> >
> > --- a/drivers/net/bonding/bond_main.c
> > +++ b/drivers/net/bonding/bond_main.c
> > @@ -613,8 +613,11 @@ static void bond_ipsec_del_sa_all(struct bonding *bond)
> >
> >  	mutex_lock(&bond->ipsec_lock);
> >  	list_for_each_entry(ipsec, &bond->ipsec_list, list) {
> > -		if (!ipsec->xs->xso.real_dev)
> > +		spin_lock(&ipsec->x->lock);
> > +		if (!ipsec->xs->xso.real_dev) {
> > +			spin_unlock(&ipsec->x->lock);
> >  			continue;
> > +		}
> >
> >  		if (!real_dev->xfrmdev_ops ||
> >  		    !real_dev->xfrmdev_ops->xdo_dev_state_delete ||
> > @@ -622,12 +625,16 @@ static void bond_ipsec_del_sa_all(struct bonding *bond)
> >  			slave_warn(bond_dev, real_dev,
> >  				   "%s: no slave xdo_dev_state_delete\n",
> >  				   __func__);
> > -		} else {
> > -			real_dev->xfrmdev_ops->xdo_dev_state_delete(real_dev, ipsec->xs);
> > -			if (real_dev->xfrmdev_ops->xdo_dev_state_free)
> > -				real_dev->xfrmdev_ops->xdo_dev_state_free(ipsec->xs);
> > -			ipsec->xs->xso.real_dev = NULL;
> > +			spin_unlock(&ipsec->x->lock);
> > +			continue;
> >  		}
> > +
> > +		real_dev->xfrmdev_ops->xdo_dev_state_delete(real_dev, ipsec->xs);
> > +		ipsec->xs->xso.real_dev = NULL;
>
> Set xs->xso.real_dev = NULL is a good idea. As we will break
> in bond_ipsec_del_sa()/bond_ipsec_free_sa() when there is no
> xs->xso.real_dev.
>
> For bond_ipsec_add_sa_all(), I will move the xso.real_dev = real_dev
> after .xdo_dev_state_add() in case the following situation.
>
> bond_ipsec_add_sa_all()
> spin_unlock(&ipsec->x->lock);
> ipsec->xs->xso.real_dev = real_dev;
> __xfrm_state_delete x->state = DEAD
> - bond_ipsec_del_sa()
> - .xdo_dev_state_delete()
> .xdo_dev_state_add()

Hmm, do we still need the spin_lock in bond_ipsec_add_sa_all()? With
xs->xso.real_dev set to NULL after bond_ipsec_del_sa_all(), it looks like
there is no need for the spin_lock in bond_ipsec_add_sa_all(), e.g.:

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 04b677d0c45b..3ada51c63207 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -537,15 +537,27 @@ static void bond_ipsec_add_sa_all(struct bonding *bond)
}
list_for_each_entry(ipsec, &bond->ipsec_list, list) {
+ spin_lock_bh(&ipsec->xs->lock);
+ /* Skip dead xfrm states, they'll be freed later. */
+ if (ipsec->xs->km.state == XFRM_STATE_DEAD) {
+ spin_unlock_bh(&ipsec->xs->lock);
+ continue;
+ }
+
/* If new state is added before ipsec_lock acquired */
- if (ipsec->xs->xso.real_dev == real_dev)
+ if (ipsec->xs->xso.real_dev == real_dev) {
+ spin_unlock_bh(&ipsec->xs->lock);
continue;
+ }
- ipsec->xs->xso.real_dev = real_dev;
if (real_dev->xfrmdev_ops->xdo_dev_state_add(ipsec->xs, NULL)) {
slave_warn(bond_dev, real_dev, "%s: failed to add SA\n", __func__);
ipsec->xs->xso.real_dev = NULL;
}
+ /* Set real_dev after .xdo_dev_state_add in case
+ * __xfrm_state_delete() is called in parallel
+ */
+ ipsec->xs->xso.real_dev = real_dev;
}
The spin_lock here seems useless now. What do you think?
Thanks
Hangbin

On Thu, 2025-03-06 at 10:02 +0000, Hangbin Liu wrote:
> > For bond_ipsec_add_sa_all(), I will move the xso.real_dev = real_dev
> > after .xdo_dev_state_add() in case the following situation.

xso.real_dev needs to be initialized before the call to
xdo_dev_state_add, since many of the implementations look in
xso.real_dev to determine on which device to operate on.

So the ordering should be:
- get the lock
- set xso.real_dev to real_dev
- release the lock
- call xdo_dev_state_add
- if it fails, reacquire the lock and set the device to NULL.

Unfortunately, this doesn't seem to protect against the scenario below,
as after dropping the spinlock from bond_ipsec_add_sa_all,
bond_ipsec_del_sa can freely call xdo_dev_state_delete() on real_dev
before xdo_dev_state_add happens.

I don't know what to do in this case...

> >
> > bond_ipsec_add_sa_all()
> >     spin_unlock(&ipsec->x->lock);
> >     ipsec->xs->xso.real_dev = real_dev;
> >                                         __xfrm_state_delete x->state = DEAD
> >                                          - bond_ipsec_del_sa()
> >                                            - .xdo_dev_state_delete()
> >     .xdo_dev_state_add()

Cosmin.
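
In code, the ordering described above might look roughly like this inside
bond_ipsec_add_sa_all() (a sketch only, assuming the spin_lock_bh flavor
used elsewhere in the thread; as noted, it still leaves the window before
.xdo_dev_state_add() open):

	list_for_each_entry(ipsec, &bond->ipsec_list, list) {
		struct xfrm_state *xs = ipsec->xs;

		spin_lock_bh(&xs->lock);
		/* Skip dead states and states already on the new real_dev. */
		if (xs->km.state == XFRM_STATE_DEAD ||
		    xs->xso.real_dev == real_dev) {
			spin_unlock_bh(&xs->lock);
			continue;
		}
		/* 1) publish the new device under x->lock */
		xs->xso.real_dev = real_dev;
		spin_unlock_bh(&xs->lock);

		/* 2) call the driver op outside x->lock, it may sleep */
		if (real_dev->xfrmdev_ops->xdo_dev_state_add(xs, NULL)) {
			slave_warn(bond_dev, real_dev, "%s: failed to add SA\n",
				   __func__);
			/* 3) on failure, undo the assignment under x->lock */
			spin_lock_bh(&xs->lock);
			xs->xso.real_dev = NULL;
			spin_unlock_bh(&xs->lock);
		}
	}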

On Thu, Mar 06, 2025 at 01:37:15PM +0000, Cosmin Ratiu wrote:
> On Thu, 2025-03-06 at 10:02 +0000, Hangbin Liu wrote:
> > > For bond_ipsec_add_sa_all(), I will move the xso.real_dev = real_dev
> > > after .xdo_dev_state_add() in case the following situation.
>
> xso.real_dev needs to be initialized before the call to
> xdo_dev_state_add, since many of the implementations look in
> xso.real_dev to determine on which device to operate on.
>
> So the ordering should be:
> - get the lock
> - set xso.real_dev to real_dev
> - release the lock
> - call xdo_dev_state_add
> - if it fails, reacquire the lock and set the device to NULL.
>
> Unfortunately, this doesn't seem to protect against the scenario below,
> as after dropping the spinlock from bond_ipsec_add_sa_all,
> bond_ipsec_del_sa can freely call xdo_dev_state_delete() on real_dev
> before xdo_dev_state_add happens.
>
> I don't know what to do in this case...

Yes, me neither. How about adding a note and leaving it there until we
have a solution?

Regards
Hangbin

> > >
> > > bond_ipsec_add_sa_all()
> > >     spin_unlock(&ipsec->x->lock);
> > >     ipsec->xs->xso.real_dev = real_dev;
> > >                                         __xfrm_state_delete x->state = DEAD
> > >                                          - bond_ipsec_del_sa()
> > >                                            - .xdo_dev_state_delete()
> > >     .xdo_dev_state_add()
>
> Cosmin.

On Thu, Mar 06, 2025 at 10:02:34AM +0000, Hangbin Liu wrote:
> > Set xs->xso.real_dev = NULL is a good idea. As we will break
> > in bond_ipsec_del_sa()/bond_ipsec_free_sa() when there is no
> > xs->xso.real_dev.
> >
> > For bond_ipsec_add_sa_all(), I will move the xso.real_dev = real_dev
> > after .xdo_dev_state_add() in case the following situation.
> >
> Hmm, do we still need to the spin_lock in bond_ipsec_add_sa_all()? With
> xs->xso.real_dev = NULL after bond_ipsec_del_sa_all(), it looks there is
> no need the spin_lock in bond_ipsec_add_sa_all(). e.g.
>
>
> diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
> index 04b677d0c45b..3ada51c63207 100644
> --- a/drivers/net/bonding/bond_main.c
> +++ b/drivers/net/bonding/bond_main.c
> @@ -537,15 +537,27 @@ static void bond_ipsec_add_sa_all(struct bonding *bond)
> }
>
> list_for_each_entry(ipsec, &bond->ipsec_list, list) {
> + spin_lock_bh(&ipsec->xs->lock);
> + /* Skip dead xfrm states, they'll be freed later. */
> + if (ipsec->xs->km.state == XFRM_STATE_DEAD) {
> + spin_unlock_bh(&ipsec->xs->lock);
> + continue;
> + }
> +
> /* If new state is added before ipsec_lock acquired */
> - if (ipsec->xs->xso.real_dev == real_dev)
> + if (ipsec->xs->xso.real_dev == real_dev) {
> + spin_unlock_bh(&ipsec->xs->lock);
> continue;
> + }
>
> - ipsec->xs->xso.real_dev = real_dev;
> if (real_dev->xfrmdev_ops->xdo_dev_state_add(ipsec->xs, NULL)) {
> slave_warn(bond_dev, real_dev, "%s: failed to add SA\n", __func__);
> ipsec->xs->xso.real_dev = NULL;
> }
> + /* Set real_dev after .xdo_dev_state_add in case
> + * __xfrm_state_delete() is called in parallel
> + */
> + ipsec->xs->xso.real_dev = real_dev;
> }

OK, please ignore this, the .xdo_dev_state_add() needs xso.real_dev to
be set first. Then I still wonder how to avoid the race before
.xdo_dev_state_add() is called, e.g.:

bond_ipsec_add_sa_all()
    spin_lock_bh(&ipsec->xs->lock);
    ipsec->xs->xso.real_dev = real_dev;
    spin_unlock(&ipsec->x->lock);
                                        __xfrm_state_delete
                                         - bond_ipsec_del_sa()
                                           - .xdo_dev_state_delete()
                                         - bond_ipsec_free_sa()
                                           - .xdo_dev_state_free()
    .xdo_dev_state_add()

Thanks
Hangbin