[PATCH net v2] ip6mr: Fix lockdep and sparse RCU warnings

Stefan Wiehler posted 1 patch 1 month, 4 weeks ago
net/ipv6/ip6mr.c | 103 +++++++++++++++++++++++++++++------------------
1 file changed, 64 insertions(+), 39 deletions(-)
[PATCH net v2] ip6mr: Fix lockdep and sparse RCU warnings
Posted by Stefan Wiehler 1 month, 4 weeks ago
ip6mr_vif_seq_start() must lock RCU even in a case of error, because
stop callback is called unconditionally.

When IPV6_MROUTE_MULTIPLE_TABLES is enabled, calls to ip6mr_get_table()
should be done under RCU or RTNL lock. Lock RCU before the call unless
it's done already or RTNL lock is held.

Signed-off-by: Petr Malat <oss@malat.biz>
Signed-off-by: Stefan Wiehler <stefan.wiehler@nokia.com>
Fixes: d1db275dd3f6 ("ipv6: ip6mr: support multiple tables")
---
v2:
  - rebase on top of net tree
  - add Fixes tag
  - refactor out paths
v1: https://patchwork.kernel.org/project/netdevbpf/patch/20240605195355.363936-1-oss@malat.biz/
---
 net/ipv6/ip6mr.c | 103 +++++++++++++++++++++++++++++------------------
 1 file changed, 64 insertions(+), 39 deletions(-)

diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 2ce4ae0d8dc3..9a72928861ac 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -411,13 +411,13 @@ static void *ip6mr_vif_seq_start(struct seq_file *seq, loff_t *pos)
 	struct net *net = seq_file_net(seq);
 	struct mr_table *mrt;
 
+	rcu_read_lock();
 	mrt = ip6mr_get_table(net, RT6_TABLE_DFLT);
 	if (!mrt)
 		return ERR_PTR(-ENOENT);
 
 	iter->mrt = mrt;
 
-	rcu_read_lock();
 	return mr_vif_seq_start(seq, pos);
 }
 
@@ -1884,18 +1884,23 @@ int ip6mr_ioctl(struct sock *sk, int cmd, void *arg)
 	struct mfc6_cache *c;
 	struct net *net = sock_net(sk);
 	struct mr_table *mrt;
+	int err;
 
+	rcu_read_lock();
 	mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT);
-	if (!mrt)
-		return -ENOENT;
+	if (!mrt) {
+		err = -ENOENT;
+		goto out;
+	}
 
 	switch (cmd) {
 	case SIOCGETMIFCNT_IN6:
 		vr = (struct sioc_mif_req6 *)arg;
-		if (vr->mifi >= mrt->maxvif)
-			return -EINVAL;
+		if (vr->mifi >= mrt->maxvif) {
+			err = -EINVAL;
+			goto out;
+		}
 		vr->mifi = array_index_nospec(vr->mifi, mrt->maxvif);
-		rcu_read_lock();
 		vif = &mrt->vif_table[vr->mifi];
 		if (VIF_EXISTS(mrt, vr->mifi)) {
 			vr->icount = READ_ONCE(vif->pkt_in);
@@ -1905,12 +1910,11 @@ int ip6mr_ioctl(struct sock *sk, int cmd, void *arg)
 			rcu_read_unlock();
 			return 0;
 		}
-		rcu_read_unlock();
-		return -EADDRNOTAVAIL;
+		err = -EADDRNOTAVAIL;
+		goto out;
 	case SIOCGETSGCNT_IN6:
 		sr = (struct sioc_sg_req6 *)arg;
 
-		rcu_read_lock();
 		c = ip6mr_cache_find(mrt, &sr->src.sin6_addr,
 				     &sr->grp.sin6_addr);
 		if (c) {
@@ -1920,11 +1924,16 @@ int ip6mr_ioctl(struct sock *sk, int cmd, void *arg)
 			rcu_read_unlock();
 			return 0;
 		}
-		rcu_read_unlock();
-		return -EADDRNOTAVAIL;
+		err = -EADDRNOTAVAIL;
+		goto out;
 	default:
-		return -ENOIOCTLCMD;
+		err = -ENOIOCTLCMD;
+		goto out;
 	}
+
+out:
+	rcu_read_unlock();
+	return err;
 }
 
 #ifdef CONFIG_COMPAT
@@ -1952,19 +1961,35 @@ int ip6mr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
 	struct mfc6_cache *c;
 	struct net *net = sock_net(sk);
 	struct mr_table *mrt;
-
-	mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT);
-	if (!mrt)
-		return -ENOENT;
+	int err;
 
 	switch (cmd) {
 	case SIOCGETMIFCNT_IN6:
 		if (copy_from_user(&vr, arg, sizeof(vr)))
 			return -EFAULT;
-		if (vr.mifi >= mrt->maxvif)
-			return -EINVAL;
+		break;
+	case SIOCGETSGCNT_IN6:
+		if (copy_from_user(&sr, arg, sizeof(sr)))
+			return -EFAULT;
+		break;
+	default:
+		return -ENOIOCTLCMD;
+	}
+
+	rcu_read_lock();
+	mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT);
+	if (!mrt) {
+		err = -ENOENT;
+		goto out;
+	}
+
+	switch (cmd) {
+	case SIOCGETMIFCNT_IN6:
+		if (vr.mifi >= mrt->maxvif) {
+			err = -EINVAL;
+			goto out;
+		}
 		vr.mifi = array_index_nospec(vr.mifi, mrt->maxvif);
-		rcu_read_lock();
 		vif = &mrt->vif_table[vr.mifi];
 		if (VIF_EXISTS(mrt, vr.mifi)) {
 			vr.icount = READ_ONCE(vif->pkt_in);
@@ -1977,13 +2002,9 @@ int ip6mr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
 				return -EFAULT;
 			return 0;
 		}
-		rcu_read_unlock();
-		return -EADDRNOTAVAIL;
+		err = -EADDRNOTAVAIL;
+		goto out;
 	case SIOCGETSGCNT_IN6:
-		if (copy_from_user(&sr, arg, sizeof(sr)))
-			return -EFAULT;
-
-		rcu_read_lock();
 		c = ip6mr_cache_find(mrt, &sr.src.sin6_addr, &sr.grp.sin6_addr);
 		if (c) {
 			sr.pktcnt = c->_c.mfc_un.res.pkt;
@@ -1995,11 +2016,13 @@ int ip6mr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
 				return -EFAULT;
 			return 0;
 		}
-		rcu_read_unlock();
-		return -EADDRNOTAVAIL;
-	default:
-		return -ENOIOCTLCMD;
+		err = -EADDRNOTAVAIL;
+		goto out;
 	}
+
+out:
+	rcu_read_unlock();
+	return err;
 }
 #endif
 
@@ -2275,11 +2298,13 @@ int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm,
 	struct mfc6_cache *cache;
 	struct rt6_info *rt = dst_rt6_info(skb_dst(skb));
 
+	rcu_read_lock();
 	mrt = ip6mr_get_table(net, RT6_TABLE_DFLT);
-	if (!mrt)
-		return -ENOENT;
+	if (!mrt) {
+		err = -ENOENT;
+		goto out;
+	}
 
-	rcu_read_lock();
 	cache = ip6mr_cache_find(mrt, &rt->rt6i_src.addr, &rt->rt6i_dst.addr);
 	if (!cache && skb->dev) {
 		int vif = ip6mr_find_vif(mrt, skb->dev);
@@ -2297,15 +2322,15 @@ int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm,
 
 		dev = skb->dev;
 		if (!dev || (vif = ip6mr_find_vif(mrt, dev)) < 0) {
-			rcu_read_unlock();
-			return -ENODEV;
+			err = -ENODEV;
+			goto out;
 		}
 
 		/* really correct? */
 		skb2 = alloc_skb(sizeof(struct ipv6hdr), GFP_ATOMIC);
 		if (!skb2) {
-			rcu_read_unlock();
-			return -ENOMEM;
+			err = -ENOMEM;
+			goto out;
 		}
 
 		NETLINK_CB(skb2).portid = portid;
@@ -2327,12 +2352,12 @@ int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm,
 		iph->daddr = rt->rt6i_dst.addr;
 
 		err = ip6mr_cache_unresolved(mrt, vif, skb2, dev);
-		rcu_read_unlock();
-
-		return err;
+		goto out;
 	}
 
 	err = mr_fill_mroute(mrt, skb, &cache->_c, rtm);
+
+out:
 	rcu_read_unlock();
 	return err;
 }
-- 
2.42.0
Re: [PATCH net v2] ip6mr: Fix lockdep and sparse RCU warnings
Posted by Eric Dumazet 1 month, 4 weeks ago
On Tue, Oct 1, 2024 at 12:05 PM Stefan Wiehler <stefan.wiehler@nokia.com> wrote:
>
> ip6mr_vif_seq_start() must lock RCU even in a case of error, because
> stop callback is called unconditionally.
>

OK, but RT6_TABLE_DFLT always exists, ip6mr_get_table(net, RT6_TABLE_DFLT)
can never fail.

This is ensured at netns creation, from ip6mr_rules_init()

This complex patch adds some code churn, with no clear bug fix ?
Re: [PATCH net v2] ip6mr: Fix lockdep and sparse RCU warnings
Posted by Stefan Wiehler 1 month, 3 weeks ago
> OK, but RT6_TABLE_DFLT always exists, ip6mr_get_table(net, RT6_TABLE_DFLT)
> can never fail.
> 
> This is ensured at netns creation, from ip6mr_rules_init()

OK, but nevertheless we need to enter a RCU read-side critical section before
ip6mr_get_table() is called.

> This complex patch adds some code churn, with no clear bug fix ?

We should have probably mentioned the Lockdep-RCU splat that brought this topic
up:

[   48.834645] WARNING: suspicious RCU usage
[   48.834647] 6.1.103-584209f6d5-nokia_sm_x86 #1 Tainted: G S         O
[   48.834649] -----------------------------
[   48.834651] /net/ipv6/ip6mr.c:132 RCU-list traversed in non-reader section!!
[   48.834654]
               other info that might help us debug this:

[   48.834656]
               rcu_scheduler_active = 2, debug_locks = 1
[   48.834658] no locks held by radvd/5777.
[   48.834660]
               stack backtrace:
[   48.834663] CPU: 0 PID: 5777 Comm: radvd Tainted: G S         O       6.1.103-584209f6d5-nokia_sm_x86 #1
[   48.834666] Hardware name: Nokia Asil/Default string, BIOS 0ACNA113 06/07/2024
[   48.834673] Call Trace:
[   48.834674]  <TASK>
[   48.834677]  dump_stack_lvl+0xb7/0xe9
[   48.834687]  lockdep_rcu_suspicious.cold+0x2d/0x64
[   48.834697]  ip6mr_get_table+0x9f/0xb0
[   48.834704]  ip6mr_ioctl+0x50/0x360
[   48.834713]  ? sk_ioctl+0x5f/0x1c0
[   48.834719]  sk_ioctl+0x5f/0x1c0
[   48.834723]  ? find_held_lock+0x2b/0x80
[   48.834731]  sock_do_ioctl+0x7b/0x140
[   48.834737]  ? proc_nr_files+0x30/0x30
[   48.834744]  sock_ioctl+0x1f5/0x360
[   48.834754]  __x64_sys_ioctl+0x8d/0xd0
[   48.834760]  do_syscall_64+0x3c/0x90
[   48.834765]  entry_SYSCALL_64_after_hwframe+0x6e/0xd8
[   48.834769] RIP: 0033:0x7fda5f56e2ac
[   48.834773] Code: 1e fa 48 8d 44 24 08 48 89 54 24 e0 48 89 44 24 c0 48 8d 44 24 d0 48 89 44 24 c8 b8 1 0 00 00 00 c7 44 24 b8 10 00 00 00 0f 05 <3d> 00 f0 ff ff 89 c2 77 0b 89 d0 c3 0f 1f 84 00 00 00 00 00 48 8b
[   48.834776] RSP: 002b:00007fff52d4bda8 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
[   48.834782] RAX: ffffffffffffffda RBX: 000000000171cd80 RCX: 00007fda5f56e2ac
[   48.834784] RDX: 00007fff52d4bdb0 RSI: 0000000000008913 RDI: 0000000000000003
[   48.834787] RBP: 000000000171cd30 R08: 0000000000000007 R09: 000000000000003c
[   48.834789] R10: 0000000000000004 R11: 0000000000000246 R12: 0000000000000003
[   48.834791] R13: 0000000000000000 R14: 0000000000000004 R15: 000000000040d43c
[   48.834802]  </TASK>

Kind regards,

Stefan
Re: [PATCH net v2] ip6mr: Fix lockdep and sparse RCU warnings
Posted by Eric Dumazet 1 month, 3 weeks ago
On Tue, Oct 1, 2024 at 2:50 PM Stefan Wiehler <stefan.wiehler@nokia.com> wrote:
>
> > OK, but RT6_TABLE_DFLT always exists, ip6mr_get_table(net, RT6_TABLE_DFLT)
> > can never fail.
> >
> > This is ensured at netns creation, from ip6mr_rules_init()
>
> OK, but nevertheless we need to enter a RCU read-side critical section before
> ip6mr_get_table() is called.

This could be a lockdep annotation error then, at least for
RT6_TABLE_DFLT, oh well.

Note that net/ipv4/ipmr.c would have a similar issue.

Please split your patch in small units, their Fixes: tags are likely
different, and if some code breaks something,
fixing the issue will be easier.

The changelog seemed to only address the first ip6mr_vif_seq_start() part.
Re: [PATCH net v2] ip6mr: Fix lockdep and sparse RCU warnings
Posted by Stefan Wiehler 1 month, 3 weeks ago
> This could be a lockdep annotation error then, at least for
> RT6_TABLE_DFLT, oh well.

As you have already explained, we can ignore the ip6mr_vif_seq_start() error
path, so the issue boils down to ip6mr_get_table() being called without
entering a RCU read-side critical section from these 4 functions:
ipmr_vif_seq_start(), ip6mr_ioctl(), ip6mr_compat_ioctl() and
ip6mr_get_route(). It is my understanding that in none of these four cases the
RTNL lock is held either; at least according to the RCU-lockdep splat we
clearly see that this is not the case in ip6mr_ioctl()  – but please correct me
if I'm wrong.

> Note that net/ipv4/ipmr.c would have a similar issue.

Yes, looks indeed like that :-/

> Please split your patch in small units, their Fixes: tags are likely
> different, and if some code breaks something,
> fixing the issue will be easier.
> 
> The changelog seemed to only address the first ip6mr_vif_seq_start() part.

If you prefer that I can split the change into 4 commits addressing each of the
4 functions mentioned before.

Kind regards,

Stefan
Re: [PATCH net v2] ip6mr: Fix lockdep and sparse RCU warnings
Posted by ericnetdev dumazet 1 month, 3 weeks ago
Le mar. 1 oct. 2024 à 16:36, Stefan Wiehler <stefan.wiehler@nokia.com> a écrit :
>
> > This could be a lockdep annotation error then, at least for
> > RT6_TABLE_DFLT, oh well.
>
> As you have already explained, we can ignore the ip6mr_vif_seq_start() error
> path, so the issue boils down to ip6mr_get_table() being called without
> entering a RCU read-side critical section from these 4 functions:
> ipmr_vif_seq_start(), ip6mr_ioctl(), ip6mr_compat_ioctl() and
> ip6mr_get_route(). It is my understanding that in none of these four cases the
> RTNL lock is held either; at least according to the RCU-lockdep splat we
> clearly see that this is not the case in ip6mr_ioctl()  – but please correct me
> if I'm wrong.
>
> > Note that net/ipv4/ipmr.c would have a similar issue.
>
> Yes, looks indeed like that :-/
>
> > Please split your patch in small units, their Fixes: tags are likely
> > different, and if some code breaks something,
> > fixing the issue will be easier.
> >
> > The changelog seemed to only address the first ip6mr_vif_seq_start() part.
>
> If you prefer that I can split the change into 4 commits addressing each of the
> 4 functions mentioned before.

Yes please. Extending rcu_read_lock() sections needs inspection,
because we can not sleep there.