From: Dmitry Safonov <dima@arista.com>
Currently there are a couple of minor issues with destroying the keys
tcp_v4_destroy_sock():
1. The socket is yet in TCP bind buckets, making it reachable for
incoming segments [on another CPU core], potentially available to send
late FIN/ACK/RST replies.
2. There is at least one code path, where tcp_done() is called before
sending RST [kudos to Bob for investigation]. This is a case of
a server, that finished sending its data and just called close().
The socket is in TCP_FIN_WAIT2 and has RCV_SHUTDOWN (set by
__tcp_close())
tcp_v4_do_rcv()/tcp_v6_do_rcv()
tcp_rcv_state_process() /* LINUX_MIB_TCPABORTONDATA */
tcp_reset()
tcp_done_with_error()
tcp_done()
inet_csk_destroy_sock() /* Destroys AO/MD5 keys */
/* tcp_rcv_state_process() returns SKB_DROP_REASON_TCP_ABORT_ON_DATA */
tcp_v4_send_reset() /* Sends an unsigned RST segment */
tcpdump:
> 22:53:15.399377 00:00:b2:1f:00:00 > 00:00:01:01:00:00, ethertype IPv4 (0x0800), length 74: (tos 0x0, ttl 64, id 33929, offset 0, flags [DF], proto TCP (6), length 60)
> 1.0.0.1.34567 > 1.0.0.2.49848: Flags [F.], seq 2185658590, ack 3969644355, win 502, options [nop,nop,md5 valid], length 0
> 22:53:15.399396 00:00:01:01:00:00 > 00:00:b2:1f:00:00, ethertype IPv4 (0x0800), length 86: (tos 0x0, ttl 64, id 51951, offset 0, flags [DF], proto TCP (6), length 72)
> 1.0.0.2.49848 > 1.0.0.1.34567: Flags [.], seq 3969644375, ack 2185658591, win 128, options [nop,nop,md5 valid,nop,nop,sack 1 {2185658590:2185658591}], length 0
> 22:53:16.429588 00:00:b2:1f:00:00 > 00:00:01:01:00:00, ethertype IPv4 (0x0800), length 60: (tos 0x0, ttl 64, id 0, offset 0, flags [DF], proto TCP (6), length 40)
> 1.0.0.1.34567 > 1.0.0.2.49848: Flags [R], seq 2185658590, win 0, length 0
> 22:53:16.664725 00:00:b2:1f:00:00 > 00:00:01:01:00:00, ethertype IPv4 (0x0800), length 74: (tos 0x0, ttl 64, id 0, offset 0, flags [DF], proto TCP (6), length 60)
> 1.0.0.1.34567 > 1.0.0.2.49848: Flags [R], seq 2185658591, win 0, options [nop,nop,md5 valid], length 0
> 22:53:17.289832 00:00:b2:1f:00:00 > 00:00:01:01:00:00, ethertype IPv4 (0x0800), length 74: (tos 0x0, ttl 64, id 0, offset 0, flags [DF], proto TCP (6), length 60)
> 1.0.0.1.34567 > 1.0.0.2.49848: Flags [R], seq 2185658591, win 0, options [nop,nop,md5 valid], length 0
Note the signed RSTs later in the dump - those are sent by the server
when the fin-wait socket gets removed from hash buckets, by
the listener socket.
Instead of destroying AO/MD5 info and their keys in inet_csk_destroy_sock(),
slightly delay it until the actual socket .sk_destruct(). As shutdown'ed
socket can yet send non-data replies, they should be signed in order for
the peer to process them. Now it also matches how AO/MD5 gets destructed
for TIME-WAIT sockets (in tcp_twsk_destructor()).
This seems optimal for TCP-MD5, while for TCP-AO it seems to have an
open problem: once RST get sent and socket gets actually destructed,
there is no information on the initial sequence numbers. So, in case
this last RST gets lost in the network, the server's listener socket
won't be able to properly sign another RST. Nothing in RFC 1122
prescribes keeping any local state after non-graceful reset.
Luckily, BGP are known to use keep alive(s).
While the issue is quite minor/cosmetic, these days monitoring network
counters is a common practice and getting invalid signed segments from
a trusted BGP peer can get customers worried.
Investigated-by: Bob Gilligan <gilligan@arista.com>
Signed-off-by: Dmitry Safonov <dima@arista.com>
---
net/ipv4/tcp.c | 31 +++++++++++++++++++++++++++++++
net/ipv4/tcp_ipv4.c | 25 -------------------------
2 files changed, 31 insertions(+), 25 deletions(-)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 71a956fbfc5533224ee00e792de2cfdccd4d40aa..4e996e937e8e5f0e75764caa24240e25006deece 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -412,6 +412,36 @@ static u64 tcp_compute_delivery_rate(const struct tcp_sock *tp)
return rate64;
}
+#ifdef CONFIG_TCP_MD5SIG
+static void tcp_md5sig_info_free_rcu(struct rcu_head *head)
+{
+ struct tcp_md5sig_info *md5sig;
+
+ md5sig = container_of(head, struct tcp_md5sig_info, rcu);
+ kfree(md5sig);
+ static_branch_slow_dec_deferred(&tcp_md5_needed);
+ tcp_md5_release_sigpool();
+}
+#endif
+
+static void tcp_destruct_sock(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+#ifdef CONFIG_TCP_MD5SIG
+ if (tp->md5sig_info) {
+ struct tcp_md5sig_info *md5sig;
+
+ md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
+ tcp_clear_md5_list(sk);
+ call_rcu(&md5sig->rcu, tcp_md5sig_info_free_rcu);
+ rcu_assign_pointer(tp->md5sig_info, NULL);
+ }
+#endif
+ tcp_ao_destroy_sock(sk, false);
+ inet_sock_destruct(sk);
+}
+
/* Address-family independent initialization for a tcp_sock.
*
* NOTE: A lot of things set to zero explicitly by call to
@@ -464,6 +494,7 @@ void tcp_init_sock(struct sock *sk)
tp->tsoffset = 0;
tp->rack.reo_wnd_steps = 1;
+ sk->sk_destruct = tcp_destruct_sock;
sk->sk_write_space = sk_stream_write_space;
sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 84d3d556ed8062d07fe7019bc0dadd90d3b80d96..32814f205fdfdcbd4be4765a4e127c8f175d3b14 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2521,18 +2521,6 @@ static int tcp_v4_init_sock(struct sock *sk)
return 0;
}
-#ifdef CONFIG_TCP_MD5SIG
-static void tcp_md5sig_info_free_rcu(struct rcu_head *head)
-{
- struct tcp_md5sig_info *md5sig;
-
- md5sig = container_of(head, struct tcp_md5sig_info, rcu);
- kfree(md5sig);
- static_branch_slow_dec_deferred(&tcp_md5_needed);
- tcp_md5_release_sigpool();
-}
-#endif
-
static void tcp_release_user_frags(struct sock *sk)
{
#ifdef CONFIG_PAGE_POOL
@@ -2569,19 +2557,6 @@ void tcp_v4_destroy_sock(struct sock *sk)
/* Cleans up our, hopefully empty, out_of_order_queue. */
skb_rbtree_purge(&tp->out_of_order_queue);
-#ifdef CONFIG_TCP_MD5SIG
- /* Clean up the MD5 key list, if any */
- if (tp->md5sig_info) {
- struct tcp_md5sig_info *md5sig;
-
- md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
- tcp_clear_md5_list(sk);
- call_rcu(&md5sig->rcu, tcp_md5sig_info_free_rcu);
- rcu_assign_pointer(tp->md5sig_info, NULL);
- }
-#endif
- tcp_ao_destroy_sock(sk, false);
-
/* Clean up a referenced TCP bind bucket. */
if (inet_csk(sk)->icsk_bind_hash)
inet_put_port(sk);
--
2.42.2
Hi Dmitry, kernel test robot noticed the following build warnings: [auto build test WARNING on a7bd72158063740212344fad5d99dcef45bc70d6] url: https://github.com/intel-lab-lkp/linux/commits/Dmitry-Safonov-via-B4-Relay/tcp-Destroy-TCP-AO-TCP-MD5-keys-in-sk_destruct/20250822-125815 base: a7bd72158063740212344fad5d99dcef45bc70d6 patch link: https://lore.kernel.org/r/20250822-b4-tcp-ao-md5-rst-finwait2-v1-1-25825d085dcb%40arista.com patch subject: [PATCH net-next 1/2] tcp: Destroy TCP-AO, TCP-MD5 keys in .sk_destruct() config: x86_64-buildonly-randconfig-001-20250823 (https://download.01.org/0day-ci/archive/20250823/202508230331.PEHBlmSk-lkp@intel.com/config) compiler: gcc-12 (Debian 12.2.0-14+deb12u1) 12.2.0 reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250823/202508230331.PEHBlmSk-lkp@intel.com/reproduce) If you fix the issue in a separate patch/commit (i.e. not just a new version of the same patch/commit), kindly add following tags | Reported-by: kernel test robot <lkp@intel.com> | Closes: https://lore.kernel.org/oe-kbuild-all/202508230331.PEHBlmSk-lkp@intel.com/ All warnings (new ones prefixed by >>): net/ipv4/tcp.c: In function 'tcp_destruct_sock': >> net/ipv4/tcp.c:429:26: warning: unused variable 'tp' [-Wunused-variable] 429 | struct tcp_sock *tp = tcp_sk(sk); | ^~ vim +/tp +429 net/ipv4/tcp.c 426 427 static void tcp_destruct_sock(struct sock *sk) 428 { > 429 struct tcp_sock *tp = tcp_sk(sk); 430 -- 0-DAY CI Kernel Test Service https://github.com/intel/lkp-tests/wiki
On 8/22/25 01:55, Dmitry Safonov via B4 Relay wrote: > From: Dmitry Safonov <dima@arista.com> > > Currently there are a couple of minor issues with destroying the keys > tcp_v4_destroy_sock(): > > 1. The socket is yet in TCP bind buckets, making it reachable for > incoming segments [on another CPU core], potentially available to send > late FIN/ACK/RST replies. > > 2. There is at least one code path, where tcp_done() is called before > sending RST [kudos to Bob for investigation]. This is a case of > a server, that finished sending its data and just called close(). > > The socket is in TCP_FIN_WAIT2 and has RCV_SHUTDOWN (set by > __tcp_close()) > > tcp_v4_do_rcv()/tcp_v6_do_rcv() > tcp_rcv_state_process() /* LINUX_MIB_TCPABORTONDATA */ > tcp_reset() > tcp_done_with_error() > tcp_done() > inet_csk_destroy_sock() /* Destroys AO/MD5 keys */ > /* tcp_rcv_state_process() returns SKB_DROP_REASON_TCP_ABORT_ON_DATA */ > tcp_v4_send_reset() /* Sends an unsigned RST segment */ > > tcpdump: >> 22:53:15.399377 00:00:b2:1f:00:00 > 00:00:01:01:00:00, ethertype IPv4 (0x0800), length 74: (tos 0x0, ttl 64, id 33929, offset 0, flags [DF], proto TCP (6), length 60) >> 1.0.0.1.34567 > 1.0.0.2.49848: Flags [F.], seq 2185658590, ack 3969644355, win 502, options [nop,nop,md5 valid], length 0 >> 22:53:15.399396 00:00:01:01:00:00 > 00:00:b2:1f:00:00, ethertype IPv4 (0x0800), length 86: (tos 0x0, ttl 64, id 51951, offset 0, flags [DF], proto TCP (6), length 72) >> 1.0.0.2.49848 > 1.0.0.1.34567: Flags [.], seq 3969644375, ack 2185658591, win 128, options [nop,nop,md5 valid,nop,nop,sack 1 {2185658590:2185658591}], length 0 >> 22:53:16.429588 00:00:b2:1f:00:00 > 00:00:01:01:00:00, ethertype IPv4 (0x0800), length 60: (tos 0x0, ttl 64, id 0, offset 0, flags [DF], proto TCP (6), length 40) >> 1.0.0.1.34567 > 1.0.0.2.49848: Flags [R], seq 2185658590, win 0, length 0 >> 22:53:16.664725 00:00:b2:1f:00:00 > 00:00:01:01:00:00, ethertype IPv4 (0x0800), length 74: (tos 0x0, ttl 64, id 0, offset 0, flags [DF], proto TCP (6), length 60) >> 1.0.0.1.34567 > 1.0.0.2.49848: Flags [R], seq 2185658591, win 0, options [nop,nop,md5 valid], length 0 >> 22:53:17.289832 00:00:b2:1f:00:00 > 00:00:01:01:00:00, ethertype IPv4 (0x0800), length 74: (tos 0x0, ttl 64, id 0, offset 0, flags [DF], proto TCP (6), length 60) >> 1.0.0.1.34567 > 1.0.0.2.49848: Flags [R], seq 2185658591, win 0, options [nop,nop,md5 valid], length 0 > > Note the signed RSTs later in the dump - those are sent by the server > when the fin-wait socket gets removed from hash buckets, by > the listener socket. > > Instead of destroying AO/MD5 info and their keys in inet_csk_destroy_sock(), > slightly delay it until the actual socket .sk_destruct(). As shutdown'ed > socket can yet send non-data replies, they should be signed in order for > the peer to process them. Now it also matches how AO/MD5 gets destructed > for TIME-WAIT sockets (in tcp_twsk_destructor()). > > This seems optimal for TCP-MD5, while for TCP-AO it seems to have an > open problem: once RST get sent and socket gets actually destructed, > there is no information on the initial sequence numbers. So, in case > this last RST gets lost in the network, the server's listener socket > won't be able to properly sign another RST. Nothing in RFC 1122 > prescribes keeping any local state after non-graceful reset. > Luckily, BGP are known to use keep alive(s). > > While the issue is quite minor/cosmetic, these days monitoring network > counters is a common practice and getting invalid signed segments from > a trusted BGP peer can get customers worried. > > Investigated-by: Bob Gilligan <gilligan@arista.com> > Signed-off-by: Dmitry Safonov <dima@arista.com> > --- > net/ipv4/tcp.c | 31 +++++++++++++++++++++++++++++++ > net/ipv4/tcp_ipv4.c | 25 ------------------------- > 2 files changed, 31 insertions(+), 25 deletions(-) > > diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c > index 71a956fbfc5533224ee00e792de2cfdccd4d40aa..4e996e937e8e5f0e75764caa24240e25006deece 100644 > --- a/net/ipv4/tcp.c > +++ b/net/ipv4/tcp.c > @@ -412,6 +412,36 @@ static u64 tcp_compute_delivery_rate(const struct tcp_sock *tp) > return rate64; > } > [...] > + > +static void tcp_destruct_sock(struct sock *sk) > +{ > + struct tcp_sock *tp = tcp_sk(sk); It looks like this variable is unused when CONFIG_TCP_MD5SIG is not set and this is causing the test CI build to fail. net/ipv4/tcp.c: In function ‘tcp_destruct_sock’: net/ipv4/tcp.c:417:26: error: unused variable ‘tp’ [-Werror=unused-variable] 417 | struct tcp_sock *tp = tcp_sk(sk); | ^~ cc1: all warnings being treated as errors make[4]: *** [scripts/Makefile.build:287: net/ipv4/tcp. cheers, Victor
Hi Victor, Thanks, going to correct that in v2. On Fri, Aug 22, 2025 at 1:40 PM Victor Nogueira <victor@mojatatu.com> wrote: > > On 8/22/25 01:55, Dmitry Safonov via B4 Relay wrote: > > From: Dmitry Safonov <dima@arista.com> > > > > Currently there are a couple of minor issues with destroying the keys > > tcp_v4_destroy_sock(): > > > > 1. The socket is yet in TCP bind buckets, making it reachable for > > incoming segments [on another CPU core], potentially available to send > > late FIN/ACK/RST replies. > > > > 2. There is at least one code path, where tcp_done() is called before > > sending RST [kudos to Bob for investigation]. This is a case of > > a server, that finished sending its data and just called close(). > > > > The socket is in TCP_FIN_WAIT2 and has RCV_SHUTDOWN (set by > > __tcp_close()) > > > > tcp_v4_do_rcv()/tcp_v6_do_rcv() > > tcp_rcv_state_process() /* LINUX_MIB_TCPABORTONDATA */ > > tcp_reset() > > tcp_done_with_error() > > tcp_done() > > inet_csk_destroy_sock() /* Destroys AO/MD5 keys */ > > /* tcp_rcv_state_process() returns SKB_DROP_REASON_TCP_ABORT_ON_DATA */ > > tcp_v4_send_reset() /* Sends an unsigned RST segment */ > > > > tcpdump: > >> 22:53:15.399377 00:00:b2:1f:00:00 > 00:00:01:01:00:00, ethertype IPv4 (0x0800), length 74: (tos 0x0, ttl 64, id 33929, offset 0, flags [DF], proto TCP (6), length 60) > >> 1.0.0.1.34567 > 1.0.0.2.49848: Flags [F.], seq 2185658590, ack 3969644355, win 502, options [nop,nop,md5 valid], length 0 > >> 22:53:15.399396 00:00:01:01:00:00 > 00:00:b2:1f:00:00, ethertype IPv4 (0x0800), length 86: (tos 0x0, ttl 64, id 51951, offset 0, flags [DF], proto TCP (6), length 72) > >> 1.0.0.2.49848 > 1.0.0.1.34567: Flags [.], seq 3969644375, ack 2185658591, win 128, options [nop,nop,md5 valid,nop,nop,sack 1 {2185658590:2185658591}], length 0 > >> 22:53:16.429588 00:00:b2:1f:00:00 > 00:00:01:01:00:00, ethertype IPv4 (0x0800), length 60: (tos 0x0, ttl 64, id 0, offset 0, flags [DF], proto TCP (6), length 40) > >> 1.0.0.1.34567 > 1.0.0.2.49848: Flags [R], seq 2185658590, win 0, length 0 > >> 22:53:16.664725 00:00:b2:1f:00:00 > 00:00:01:01:00:00, ethertype IPv4 (0x0800), length 74: (tos 0x0, ttl 64, id 0, offset 0, flags [DF], proto TCP (6), length 60) > >> 1.0.0.1.34567 > 1.0.0.2.49848: Flags [R], seq 2185658591, win 0, options [nop,nop,md5 valid], length 0 > >> 22:53:17.289832 00:00:b2:1f:00:00 > 00:00:01:01:00:00, ethertype IPv4 (0x0800), length 74: (tos 0x0, ttl 64, id 0, offset 0, flags [DF], proto TCP (6), length 60) > >> 1.0.0.1.34567 > 1.0.0.2.49848: Flags [R], seq 2185658591, win 0, options [nop,nop,md5 valid], length 0 > > > > Note the signed RSTs later in the dump - those are sent by the server > > when the fin-wait socket gets removed from hash buckets, by > > the listener socket. > > > > Instead of destroying AO/MD5 info and their keys in inet_csk_destroy_sock(), > > slightly delay it until the actual socket .sk_destruct(). As shutdown'ed > > socket can yet send non-data replies, they should be signed in order for > > the peer to process them. Now it also matches how AO/MD5 gets destructed > > for TIME-WAIT sockets (in tcp_twsk_destructor()). > > > > This seems optimal for TCP-MD5, while for TCP-AO it seems to have an > > open problem: once RST get sent and socket gets actually destructed, > > there is no information on the initial sequence numbers. So, in case > > this last RST gets lost in the network, the server's listener socket > > won't be able to properly sign another RST. Nothing in RFC 1122 > > prescribes keeping any local state after non-graceful reset. > > Luckily, BGP are known to use keep alive(s). > > > > While the issue is quite minor/cosmetic, these days monitoring network > > counters is a common practice and getting invalid signed segments from > > a trusted BGP peer can get customers worried. > > > > Investigated-by: Bob Gilligan <gilligan@arista.com> > > Signed-off-by: Dmitry Safonov <dima@arista.com> > > --- > > net/ipv4/tcp.c | 31 +++++++++++++++++++++++++++++++ > > net/ipv4/tcp_ipv4.c | 25 ------------------------- > > 2 files changed, 31 insertions(+), 25 deletions(-) > > > > diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c > > index 71a956fbfc5533224ee00e792de2cfdccd4d40aa..4e996e937e8e5f0e75764caa24240e25006deece 100644 > > --- a/net/ipv4/tcp.c > > +++ b/net/ipv4/tcp.c > > @@ -412,6 +412,36 @@ static u64 tcp_compute_delivery_rate(const struct tcp_sock *tp) > > return rate64; > > } > > [...] > > + > > +static void tcp_destruct_sock(struct sock *sk) > > +{ > > + struct tcp_sock *tp = tcp_sk(sk); > > It looks like this variable is unused when CONFIG_TCP_MD5SIG is not set > > and this is causing the test CI build to fail. > > net/ipv4/tcp.c: In function ‘tcp_destruct_sock’: > net/ipv4/tcp.c:417:26: error: unused variable ‘tp’ [-Werror=unused-variable] > 417 | struct tcp_sock *tp = tcp_sk(sk); > | ^~ > cc1: all warnings being treated as errors > make[4]: *** [scripts/Makefile.build:287: net/ipv4/tcp. > > cheers, > Victor
© 2016 - 2025 Red Hat, Inc.