From: Gang Yan <yangang@kylinos.cn>
Extend MPTCP's sendmsg handling to recognize and honor the MSG_EOR flag,
which marks the end of a record for application-level message boundaries.
Data fragments tagged with MSG_EOR are explicitly marked in the
mptcp_data_frag structure and skb context to prevent unintended
coalescing with subsequent data chunks. This ensures the intent of
applications using MSG_EOR is preserved across MPTCP subflows,
maintaining consistent message segmentation behavior.
Signed-off-by: Gang Yan <yangang@kylinos.cn>
---
net/mptcp/protocol.c | 24 +++++++++++++++++++++---
net/mptcp/protocol.h | 3 ++-
2 files changed, 23 insertions(+), 4 deletions(-)
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 01690a84ea6d..dafa178f43c5 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -1032,7 +1032,8 @@ static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk,
const struct page_frag *pfrag,
const struct mptcp_data_frag *df)
{
- return df && pfrag->page == df->page &&
+ return df && !df->eor &&
+ pfrag->page == df->page &&
pfrag->size - pfrag->offset > 0 &&
pfrag->offset == (df->offset + df->data_len) &&
df->data_seq + df->data_len == msk->write_seq;
@@ -1174,6 +1175,7 @@ mptcp_carve_data_frag(const struct mptcp_sock *msk, struct page_frag *pfrag,
dfrag->offset = offset + sizeof(struct mptcp_data_frag);
dfrag->already_sent = 0;
dfrag->page = pfrag->page;
+ dfrag->eor = 0;
return dfrag;
}
@@ -1435,6 +1437,13 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
mptcp_update_infinite_map(msk, ssk, mpext);
trace_mptcp_sendmsg_frag(mpext);
mptcp_subflow_ctx(ssk)->rel_write_seq += copy;
+
+ /* if this is the last chunk of a dfrag with MSG_EOR set,
+ * mark the skb to prevent coalescing with subsequent data.
+ */
+ if (dfrag->eor && info->sent + copy >= dfrag->data_len)
+ TCP_SKB_CB(skb)->eor = 1;
+
return copy;
}
@@ -1895,7 +1904,8 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
long timeo;
/* silently ignore everything else */
- msg->msg_flags &= MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | MSG_FASTOPEN;
+ msg->msg_flags &= MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL |
+ MSG_FASTOPEN | MSG_EOR;
lock_sock(sk);
@@ -2002,8 +2012,16 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
goto do_error;
}
- if (copied)
+ if (copied) {
+ /* mark the last dfrag with EOR if MSG_EOR was set */
+ if (msg->msg_flags & MSG_EOR) {
+ struct mptcp_data_frag *dfrag = mptcp_pending_tail(sk);
+
+ if (dfrag)
+ dfrag->eor = 1;
+ }
__mptcp_push_pending(sk, msg->msg_flags);
+ }
out:
release_sock(sk);
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index afdead91a4d7..db96f2945cbd 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -265,7 +265,8 @@ struct mptcp_data_frag {
u16 data_len;
u16 offset;
u8 overhead;
- u8 __unused;
+ u8 eor:1,
+ __unused:7;
u16 already_sent;
struct page *page;
};
--
2.43.0
On Tue, 31 Mar 2026 17:08:09 +0800, Gang Yan <gang.yan@linux.dev> wrote:
> diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
> index afdead91a4d7..db96f2945cbd 100644
> --- a/net/mptcp/protocol.h
> +++ b/net/mptcp/protocol.h
> @@ -265,7 +265,8 @@ struct mptcp_data_frag {
> u16 data_len;
> u16 offset;
> u8 overhead;
> - u8 __unused;
> + u8 eor:1,
> + __unused:7;
Here, we could also use the whole u8 rather than a single bit: a bitfield is
a bit more costly to read/write, and using the full byte would also avoid
potential KMSAN warnings.
I can also do this modification when applying the patch if that's OK:
u8 eor; /* currently using 1 bit */
WDYT?
Reviewed-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
--
Matthieu Baerts (NGI0) <matttbe@kernel.org>
© 2016 - 2026 Red Hat, Inc.