The MPTCP protocol uses a significantly different CB layout WRT TCP, as it
includes different information and use 64 bits for the sequence numbers.
As the msk-level rcvbuf buffer size is limited by the core socket code the
INT_MAX, we can safely use 32 bits for MPTCP-level sequence number. This
allow updating the MPTCP CB layout so that fields with a corresponding TCP-level
data use the same area inside the CB itself.
Add build time check the unsure the latter invariant.
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
rfc -> v1:
- keep `ack_seq` up2date
---
net/mptcp/protocol.c | 81 ++++++++++++++++++++++++++------------------
net/mptcp/protocol.h | 6 ++--
2 files changed, 52 insertions(+), 35 deletions(-)
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index c0b77d77c268..49e62f817fd6 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -28,7 +28,7 @@
#include "protocol.h"
#include "mib.h"
-static unsigned int mptcp_inq_hint(const struct sock *sk);
+static int mptcp_inq_hint(const struct sock *sk);
#define CREATE_TRACE_POINTS
#include <trace/events/mptcp.h>
@@ -165,7 +165,7 @@ static bool __mptcp_try_coalesce(struct sock *sk, struct sk_buff *to,
!skb_try_coalesce(to, from, fragstolen, delta))
return false;
- pr_debug("colesced seq %llx into %llx new len %d new end seq %llx\n",
+ pr_debug("colesced seq %x into %x new len %d new end seq %x\n",
MPTCP_SKB_CB(from)->map_seq, MPTCP_SKB_CB(to)->map_seq,
to->len, MPTCP_SKB_CB(from)->end_seq);
MPTCP_SKB_CB(to)->end_seq = MPTCP_SKB_CB(from)->end_seq;
@@ -235,20 +235,20 @@ static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb)
{
struct sock *sk = (struct sock *)msk;
struct rb_node **p, *parent;
- u64 seq, end_seq, max_seq;
+ u32 seq, end_seq, max_seq;
struct sk_buff *skb1;
seq = MPTCP_SKB_CB(skb)->map_seq;
end_seq = MPTCP_SKB_CB(skb)->end_seq;
max_seq = atomic64_read(&msk->rcv_wnd_sent);
- pr_debug("msk=%p seq=%llx limit=%llx empty=%d\n", msk, seq, max_seq,
+ pr_debug("msk=%p seq=%x limit=%x empty=%d\n", msk, seq, max_seq,
RB_EMPTY_ROOT(&msk->out_of_order_queue));
- if (after64(end_seq, max_seq)) {
+ if (after(end_seq, max_seq)) {
/* out of window */
mptcp_drop(sk, skb);
- pr_debug("oow by %lld, rcv_wnd_sent %llu\n",
- (unsigned long long)end_seq - (unsigned long)max_seq,
+ pr_debug("oow by %d, rcv_wnd_sent %llu\n",
+ end_seq - max_seq,
(unsigned long long)atomic64_read(&msk->rcv_wnd_sent));
MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_NODSSWINDOW);
return;
@@ -273,7 +273,7 @@ static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb)
}
/* Can avoid an rbtree lookup if we are adding skb after ooo_last_skb */
- if (!before64(seq, MPTCP_SKB_CB(msk->ooo_last_skb)->end_seq)) {
+ if (!before(seq, MPTCP_SKB_CB(msk->ooo_last_skb)->end_seq)) {
MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOQUEUETAIL);
parent = &msk->ooo_last_skb->rbnode;
p = &parent->rb_right;
@@ -285,18 +285,18 @@ static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb)
while (*p) {
parent = *p;
skb1 = rb_to_skb(parent);
- if (before64(seq, MPTCP_SKB_CB(skb1)->map_seq)) {
+ if (before(seq, MPTCP_SKB_CB(skb1)->map_seq)) {
p = &parent->rb_left;
continue;
}
- if (before64(seq, MPTCP_SKB_CB(skb1)->end_seq)) {
- if (!after64(end_seq, MPTCP_SKB_CB(skb1)->end_seq)) {
+ if (before(seq, MPTCP_SKB_CB(skb1)->end_seq)) {
+ if (!after(end_seq, MPTCP_SKB_CB(skb1)->end_seq)) {
/* All the bits are present. Drop. */
mptcp_drop(sk, skb);
MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA);
return;
}
- if (after64(seq, MPTCP_SKB_CB(skb1)->map_seq)) {
+ if (after(seq, MPTCP_SKB_CB(skb1)->map_seq)) {
/* partial overlap:
* | skb |
* | skb1 |
@@ -327,7 +327,7 @@ static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb)
merge_right:
/* Remove other segments covered by skb. */
while ((skb1 = skb_rb_next(skb)) != NULL) {
- if (before64(end_seq, MPTCP_SKB_CB(skb1)->end_seq))
+ if (before(end_seq, MPTCP_SKB_CB(skb1)->end_seq))
break;
rb_erase(&skb1->rbnode, &msk->out_of_order_queue);
mptcp_drop(sk, skb1);
@@ -349,10 +349,11 @@ static void mptcp_init_skb(struct sock *ssk, struct sk_buff *skb, int offset)
/* the skb map_seq accounts for the skb offset:
* mptcp_subflow_get_mapped_dsn() is based on the current tp->copied_seq
- * value
+ * value; note that seq numbers are truncated to 32bits
*/
MPTCP_SKB_CB(skb)->map_seq = mptcp_subflow_get_mapped_dsn(subflow) - offset;
MPTCP_SKB_CB(skb)->end_seq = MPTCP_SKB_CB(skb)->map_seq + skb->len;
+ MPTCP_SKB_CB(skb)->flags = 0;
MPTCP_SKB_CB(skb)->has_rxtstamp = has_rxtstamp;
MPTCP_SKB_CB(skb)->cant_coalesce = 0;
@@ -364,13 +365,14 @@ static void mptcp_init_skb(struct sock *ssk, struct sk_buff *skb, int offset)
static bool __mptcp_move_skb(struct sock *sk, struct sk_buff *skb)
{
- u64 copy_len = MPTCP_SKB_CB(skb)->end_seq - MPTCP_SKB_CB(skb)->map_seq;
+ u32 copy_len = MPTCP_SKB_CB(skb)->end_seq - MPTCP_SKB_CB(skb)->map_seq;
struct mptcp_sock *msk = mptcp_sk(sk);
+ u32 ack_seq = msk->ack_seq;
struct sk_buff *tail;
mptcp_borrow_fwdmem(sk, skb);
- if (MPTCP_SKB_CB(skb)->map_seq == msk->ack_seq) {
+ if (MPTCP_SKB_CB(skb)->map_seq == ack_seq) {
/* in sequence */
msk->bytes_received += copy_len;
WRITE_ONCE(msk->ack_seq, msk->ack_seq + copy_len);
@@ -381,7 +383,7 @@ static bool __mptcp_move_skb(struct sock *sk, struct sk_buff *skb)
skb_set_owner_r(skb, sk);
__skb_queue_tail(&sk->sk_receive_queue, skb);
return true;
- } else if (after64(MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq)) {
+ } else if (after(MPTCP_SKB_CB(skb)->map_seq, ack_seq)) {
mptcp_data_queue_ofo(msk, skb);
return false;
}
@@ -762,40 +764,40 @@ static bool __mptcp_ofo_queue(struct mptcp_sock *msk)
{
struct sock *sk = (struct sock *)msk;
struct sk_buff *skb, *tail;
+ u32 seq_delta, ack_seq;
bool moved = false;
struct rb_node *p;
- u64 end_seq;
p = rb_first(&msk->out_of_order_queue);
pr_debug("msk=%p empty=%d\n", msk, RB_EMPTY_ROOT(&msk->out_of_order_queue));
while (p) {
+ ack_seq = msk->ack_seq;
skb = rb_to_skb(p);
- if (after64(MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq))
+ if (after(MPTCP_SKB_CB(skb)->map_seq, ack_seq))
break;
p = rb_next(p);
rb_erase(&skb->rbnode, &msk->out_of_order_queue);
- if (unlikely(!after64(MPTCP_SKB_CB(skb)->end_seq,
- msk->ack_seq))) {
+ if (unlikely(!after(MPTCP_SKB_CB(skb)->end_seq, ack_seq))) {
mptcp_drop(sk, skb);
MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA);
continue;
}
- end_seq = MPTCP_SKB_CB(skb)->end_seq;
+ seq_delta = MPTCP_SKB_CB(skb)->end_seq - ack_seq;
tail = skb_peek_tail(&sk->sk_receive_queue);
if (!tail || !mptcp_try_coalesce(sk, tail, skb)) {
- int delta = msk->ack_seq - MPTCP_SKB_CB(skb)->map_seq;
+ int delta = ack_seq - MPTCP_SKB_CB(skb)->map_seq;
/* skip overlapping data, if any */
- pr_debug("uncoalesced seq=%llx ack seq=%llx delta=%d\n",
- MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq,
+ pr_debug("uncoalesced seq=%x ack seq=%x delta=%d\n",
+ MPTCP_SKB_CB(skb)->map_seq, ack_seq,
delta);
__skb_queue_tail(&sk->sk_receive_queue, skb);
}
- msk->bytes_received += end_seq - msk->ack_seq;
- WRITE_ONCE(msk->ack_seq, end_seq);
+ msk->bytes_received += seq_delta;
+ WRITE_ONCE(msk->ack_seq, msk->ack_seq + seq_delta);
moved = true;
}
return moved;
@@ -2243,19 +2245,20 @@ static bool mptcp_move_skbs(struct sock *sk)
return enqueued;
}
-static unsigned int mptcp_inq_hint(const struct sock *sk)
+static int mptcp_inq_hint(const struct sock *sk)
{
const struct mptcp_sock *msk = mptcp_sk(sk);
const struct sk_buff *skb;
skb = skb_peek(&sk->sk_receive_queue);
if (skb) {
- u64 hint_val = READ_ONCE(msk->ack_seq) - MPTCP_SKB_CB(skb)->map_seq;
+ int hint_val = (u32)READ_ONCE(msk->ack_seq) -
+ MPTCP_SKB_CB(skb)->map_seq;
- if (hint_val >= INT_MAX)
- return INT_MAX;
+ if (hint_val < 0)
+ return -hint_val;
- return (unsigned int)hint_val;
+ return hint_val;
}
if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
@@ -2363,7 +2366,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
tcp_recv_timestamp(msg, sk, &tss);
if (cmsg_flags & MPTCP_CMSG_INQ) {
- unsigned int inq = mptcp_inq_hint(sk);
+ int inq = mptcp_inq_hint(sk);
put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(inq), &inq);
}
@@ -4583,11 +4586,23 @@ static int mptcp_napi_poll(struct napi_struct *napi, int budget)
return work_done;
}
+#define CHK_CB_FIELD(mptcp_field, tcp_field) \
+ ({ \
+ BUILD_BUG_ON(offsetof(struct mptcp_skb_cb, mptcp_field) != \
+ offsetof(struct tcp_skb_cb, tcp_field)); \
+ BUILD_BUG_ON(offsetofend(struct mptcp_skb_cb, mptcp_field) != \
+ offsetofend(struct tcp_skb_cb, tcp_field)); \
+ })
+
void __init mptcp_proto_init(void)
{
struct mptcp_delegated_action *delegated;
int cpu;
+ CHK_CB_FIELD(map_seq, seq);
+ CHK_CB_FIELD(end_seq, end_seq);
+ CHK_CB_FIELD(flags, tcp_flags);
+
mptcp_prot.h.hashinfo = tcp_prot.h.hashinfo;
if (percpu_counter_init(&mptcp_sockets_allocated, 0, GFP_KERNEL))
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index dd437643e604..e541f42fca25 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -126,8 +126,10 @@
#define MPTCP_SYNC_SNDBUF 7
struct mptcp_skb_cb {
- u64 map_seq;
- u64 end_seq;
+ u32 map_seq;
+ u32 end_seq;
+ u32 unused;
+ u16 flags;
u8 has_rxtstamp;
u8 cant_coalesce;
};
--
2.53.0
© 2016 - 2026 Red Hat, Inc.