[PATCH net-next] tcp: drop tcp_v{4,6}_restore_cb

Stanislav Fomichev posted 1 patch 10 months ago
include/net/tcp.h   | 46 +++++++++++++++++++++++++--------------------
net/ipv4/tcp_ipv4.c | 16 ----------------
net/ipv6/tcp_ipv6.c | 25 ------------------------
3 files changed, 26 insertions(+), 61 deletions(-)
[PATCH net-next] tcp: drop tcp_v{4,6}_restore_cb
Posted by Stanislav Fomichev 10 months ago
Instead of moving and restoring IP[6]CB, reorder tcp_skb_cb
to alias with inet[6]_skb_parm. Add static asserts to make
sure tcp_skb_cb fits into skb.cb and that inet[6]_skb_parm is
at the proper offset.

Signed-off-by: Stanislav Fomichev <sdf@fomichev.me>
---
 include/net/tcp.h   | 46 +++++++++++++++++++++++++--------------------
 net/ipv4/tcp_ipv4.c | 16 ----------------
 net/ipv6/tcp_ipv6.c | 25 ------------------------
 3 files changed, 26 insertions(+), 61 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 4450c384ef17..e80fd505f139 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1010,6 +1010,27 @@ enum tcp_skb_cb_sacked_flags {
  * If this grows please adjust skbuff.h:skbuff->cb[xxx] size appropriately.
  */
 struct tcp_skb_cb {
+	union {
+		struct {
+#define TCPCB_DELIVERED_CE_MASK ((1U<<20) - 1)
+			/* There is space for up to 24 bytes */
+			__u32 is_app_limited:1, /* cwnd not fully used? */
+			      delivered_ce:20,
+			      unused:11;
+			/* pkts S/ACKed so far upon tx of skb, incl retrans: */
+			__u32 delivered;
+			/* start of send pipeline phase */
+			u64 first_tx_mstamp;
+			/* when we reached the "delivered" count */
+			u64 delivered_mstamp;
+		} tx;   /* only used for outgoing skbs */
+		union {
+			struct inet_skb_parm	h4;
+#if IS_ENABLED(CONFIG_IPV6)
+			struct inet6_skb_parm	h6;
+#endif
+		} header;	/* For incoming skbs */
+	};
 	__u32		seq;		/* Starting sequence number	*/
 	__u32		end_seq;	/* SEQ + FIN + SYN + datalen	*/
 	union {
@@ -1033,28 +1054,13 @@ struct tcp_skb_cb {
 			has_rxtstamp:1,	/* SKB has a RX timestamp	*/
 			unused:4;
 	__u32		ack_seq;	/* Sequence number ACK'd	*/
-	union {
-		struct {
-#define TCPCB_DELIVERED_CE_MASK ((1U<<20) - 1)
-			/* There is space for up to 24 bytes */
-			__u32 is_app_limited:1, /* cwnd not fully used? */
-			      delivered_ce:20,
-			      unused:11;
-			/* pkts S/ACKed so far upon tx of skb, incl retrans: */
-			__u32 delivered;
-			/* start of send pipeline phase */
-			u64 first_tx_mstamp;
-			/* when we reached the "delivered" count */
-			u64 delivered_mstamp;
-		} tx;   /* only used for outgoing skbs */
-		union {
-			struct inet_skb_parm	h4;
+};
+
+static_assert(sizeof(struct tcp_skb_cb) <= sizeof_field(struct sk_buff, cb));
+static_assert(offsetof(struct tcp_skb_cb, header.h4) == 0);
 #if IS_ENABLED(CONFIG_IPV6)
-			struct inet6_skb_parm	h6;
+static_assert(offsetof(struct tcp_skb_cb, header.h6) == 0);
 #endif
-		} header;	/* For incoming skbs */
-	};
-};
 
 #define TCP_SKB_CB(__skb)	((struct tcp_skb_cb *)&((__skb)->cb[0]))
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 8cce0d5489da..9654f663fd0d 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2153,22 +2153,9 @@ int tcp_filter(struct sock *sk, struct sk_buff *skb)
 }
 EXPORT_IPV6_MOD(tcp_filter);
 
-static void tcp_v4_restore_cb(struct sk_buff *skb)
-{
-	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
-		sizeof(struct inet_skb_parm));
-}
-
 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
 			   const struct tcphdr *th)
 {
-	/* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
-	 * barrier() makes sure compiler wont play fool^Waliasing games.
-	 */
-	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
-		sizeof(struct inet_skb_parm));
-	barrier();
-
 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
 				    skb->len - th->doff * 4);
@@ -2293,7 +2280,6 @@ int tcp_v4_rcv(struct sk_buff *skb)
 				 * Try to feed this packet to this socket
 				 * instead of discarding it.
 				 */
-				tcp_v4_restore_cb(skb);
 				sock_put(sk);
 				goto lookup;
 			}
@@ -2302,7 +2288,6 @@ int tcp_v4_rcv(struct sk_buff *skb)
 		nf_reset_ct(skb);
 		if (nsk == sk) {
 			reqsk_put(req);
-			tcp_v4_restore_cb(skb);
 		} else {
 			drop_reason = tcp_child_process(sk, nsk, skb);
 			if (drop_reason) {
@@ -2430,7 +2415,6 @@ int tcp_v4_rcv(struct sk_buff *skb)
 		if (sk2) {
 			inet_twsk_deschedule_put(inet_twsk(sk));
 			sk = sk2;
-			tcp_v4_restore_cb(skb);
 			refcounted = false;
 			__this_cpu_write(tcp_tw_isn, isn);
 			goto process;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index b03c223eda4f..f7734ba7f3e6 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1342,16 +1342,6 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 	return 0; /* don't send reset */
 }
 
-static void tcp_v6_restore_cb(struct sk_buff *skb)
-{
-	/* We need to move header back to the beginning if xfrm6_policy_check()
-	 * and tcp_v6_fill_cb() are going to be called again.
-	 * ip6_datagram_recv_specific_ctl() also expects IP6CB to be there.
-	 */
-	memmove(IP6CB(skb), &TCP_SKB_CB(skb)->header.h6,
-		sizeof(struct inet6_skb_parm));
-}
-
 static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
 					 struct request_sock *req,
 					 struct dst_entry *dst,
@@ -1552,8 +1542,6 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
 			newnp->pktoptions = skb_clone_and_charge_r(ireq->pktopts, newsk);
 			consume_skb(ireq->pktopts);
 			ireq->pktopts = NULL;
-			if (newnp->pktoptions)
-				tcp_v6_restore_cb(newnp->pktoptions);
 		}
 	} else {
 		if (!req_unhash && found_dup_sk) {
@@ -1710,7 +1698,6 @@ int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
 		if (inet6_test_bit(REPFLOW, sk))
 			np->flow_label = ip6_flowlabel(ipv6_hdr(opt_skb));
 		if (ipv6_opt_accepted(sk, opt_skb, &TCP_SKB_CB(opt_skb)->header.h6)) {
-			tcp_v6_restore_cb(opt_skb);
 			opt_skb = xchg(&np->pktoptions, opt_skb);
 		} else {
 			__kfree_skb(opt_skb);
@@ -1725,15 +1712,6 @@ int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
 static void tcp_v6_fill_cb(struct sk_buff *skb, const struct ipv6hdr *hdr,
 			   const struct tcphdr *th)
 {
-	/* This is tricky: we move IP6CB at its correct location into
-	 * TCP_SKB_CB(). It must be done after xfrm6_policy_check(), because
-	 * _decode_session6() uses IP6CB().
-	 * barrier() makes sure compiler won't play aliasing games.
-	 */
-	memmove(&TCP_SKB_CB(skb)->header.h6, IP6CB(skb),
-		sizeof(struct inet6_skb_parm));
-	barrier();
-
 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
 				    skb->len - th->doff*4);
@@ -1849,7 +1827,6 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb)
 				 * Try to feed this packet to this socket
 				 * instead of discarding it.
 				 */
-				tcp_v6_restore_cb(skb);
 				sock_put(sk);
 				goto lookup;
 			}
@@ -1858,7 +1835,6 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb)
 		nf_reset_ct(skb);
 		if (nsk == sk) {
 			reqsk_put(req);
-			tcp_v6_restore_cb(skb);
 		} else {
 			drop_reason = tcp_child_process(sk, nsk, skb);
 			if (drop_reason) {
@@ -1987,7 +1963,6 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb)
 			struct inet_timewait_sock *tw = inet_twsk(sk);
 			inet_twsk_deschedule_put(tw);
 			sk = sk2;
-			tcp_v6_restore_cb(skb);
 			refcounted = false;
 			__this_cpu_write(tcp_tw_isn, isn);
 			goto process;
-- 
2.49.0
Re: [PATCH net-next] tcp: drop tcp_v{4,6}_restore_cb
Posted by Eric Dumazet 10 months ago
On Thu, Apr 10, 2025 at 6:16 PM Stanislav Fomichev <sdf@fomichev.me> wrote:
>
> Instead of moving and restoring IP[6]CB, reorder tcp_skb_cb
> to alias with inet[6]_skb_parm. Add static asserts to make
> sure tcp_skb_cb fits into skb.cb and that inet[6]_skb_parm is
> at the proper offset.

May I ask : why ?

I think you are simply reverting 971f10eca18 ("tcp: better TCP_SKB_CB
layout to reduce cache line misses")
without any performance measurements.
Re: [PATCH net-next] tcp: drop tcp_v{4,6}_restore_cb
Posted by Stanislav Fomichev 10 months ago
On 04/10, Eric Dumazet wrote:
> On Thu, Apr 10, 2025 at 6:16 PM Stanislav Fomichev <sdf@fomichev.me> wrote:
> >
> > Instead of moving and restoring IP[6]CB, reorder tcp_skb_cb
> > to alias with inet[6]_skb_parm. Add static asserts to make
> > sure tcp_skb_cb fits into skb.cb and that inet[6]_skb_parm is
> > at the proper offset.
> 
> May I ask : why ?
> 
> I think you are simply reverting 971f10eca18 ("tcp: better TCP_SKB_CB
> layout to reduce cache line misses")
> without any performance measurements.

Oh, wow, I did not go that far back into the history, thanks for the
pointer! Let me see if there is any perf impact form this...
Re: [PATCH net-next] tcp: drop tcp_v{4,6}_restore_cb
Posted by Eric Dumazet 10 months ago
On Thu, Apr 10, 2025 at 6:45 PM Stanislav Fomichev <stfomichev@gmail.com> wrote:
>
> On 04/10, Eric Dumazet wrote:
> > On Thu, Apr 10, 2025 at 6:16 PM Stanislav Fomichev <sdf@fomichev.me> wrote:
> > >
> > > Instead of moving and restoring IP[6]CB, reorder tcp_skb_cb
> > > to alias with inet[6]_skb_parm. Add static asserts to make
> > > sure tcp_skb_cb fits into skb.cb and that inet[6]_skb_parm is
> > > at the proper offset.
> >
> > May I ask : why ?
> >
> > I think you are simply reverting 971f10eca18 ("tcp: better TCP_SKB_CB
> > layout to reduce cache line misses")
> > without any performance measurements.
>
> Oh, wow, I did not go that far back into the history, thanks for the
> pointer! Let me see if there is any perf impact form this...

To be fair, we now have RB-tree for the out of order queue, we no
longer of O(N) costs
when trying to insert an skb in this queue. Also we try to coalesce
skbs together.

Tests would require thousands of skbs in the out-of-order queue.

Think of long-distance flows (rtt > 100ms), and big tcp_rmem[] and
tcp_wmem[] limits for the sender/receiver.