From: Geliang Tang <tanggeliang@kylinos.cn>
Introduce an MPTCP-specific helper, mptcp_sock_set_nodelay(), which sets
the TCP_NODELAY option on every subflow socket within an MPTCP
connection. Use it on both the target and host sides in the
'NVMe over MPTCP' implementation.
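tcp_sock_set_nodelay() treats its argument as a plain TCP socket, so
calling it on an MPTCP socket writes into memory that MPTCP uses for
other state and will cause list corruption: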
nvmet: adding nsid 1 to subsystem nqn.2014-08.org.nvmexpress.mptcpdev
nvmet_tcp: enabling port 1234 (127.0.0.1:4420)
slab MPTCP start ffff8880108f0b80 pointer offset 2480 size 2816
list_add corruption. prev->next should be next (ffff8880108f1530), but
was ffff8885108f1530. (prev=ffff8880108f1530).
------------[ cut here ]------------
kernel BUG at lib/list_debug.c:32!
Oops: invalid opcode: 0000 [#1] SMP KASAN NOPTI
CPU: 1 UID: 0 PID: 182 Comm: nvme Not tainted 6.16.0-rc3+ #1 PREEMPT(full)
Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
Co-developed-by: zhenwei pi <zhenwei.pi@linux.dev>
Signed-off-by: zhenwei pi <zhenwei.pi@linux.dev>
Co-developed-by: Hui Zhu <zhuhui@kylinos.cn>
Signed-off-by: Hui Zhu <zhuhui@kylinos.cn>
Co-developed-by: Gang Yan <yangang@kylinos.cn>
Signed-off-by: Gang Yan <yangang@kylinos.cn>
Signed-off-by: Geliang Tang <tanggeliang@kylinos.cn>
---
drivers/nvme/host/tcp.c | 2 ++
drivers/nvme/target/tcp.c | 2 ++
include/net/mptcp.h | 4 ++++
net/mptcp/protocol.c | 17 +++++++++++++++++
4 files changed, 25 insertions(+)
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 8446630cceca..dc5b3ecdd885 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -1810,6 +1810,8 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid,
tcp_sock_set_syncnt(queue->sock->sk, 1);
/* Set TCP no delay */
+ sk_is_msk(queue->sock->sk) ?
+ mptcp_sock_set_nodelay(queue->sock->sk) :
tcp_sock_set_nodelay(queue->sock->sk);
/*
diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
index 5a58b544f258..8452d38614a6 100644
--- a/drivers/nvme/target/tcp.c
+++ b/drivers/nvme/target/tcp.c
@@ -2087,6 +2087,8 @@ static int nvmet_tcp_add_port(struct nvmet_port *nport)
port->data_ready = port->sock->sk->sk_data_ready;
port->sock->sk->sk_data_ready = nvmet_tcp_listen_data_ready;
sock_set_reuseaddr(port->sock->sk);
+ sk_is_msk(port->sock->sk) ?
+ mptcp_sock_set_nodelay(port->sock->sk) :
tcp_sock_set_nodelay(port->sock->sk);
if (so_priority > 0)
sock_set_priority(port->sock->sk, so_priority);
diff --git a/include/net/mptcp.h b/include/net/mptcp.h
index 82660374859a..60cbf29448b0 100644
--- a/include/net/mptcp.h
+++ b/include/net/mptcp.h
@@ -244,6 +244,8 @@ static inline __be32 mptcp_reset_option(const struct sk_buff *skb)
}
void mptcp_active_detect_blackhole(struct sock *sk, bool expired);
+
+void mptcp_sock_set_nodelay(struct sock *sk);
#else
static inline void mptcp_init(void)
@@ -335,6 +337,8 @@ static inline struct request_sock *mptcp_subflow_reqsk_alloc(const struct reques
static inline __be32 mptcp_reset_option(const struct sk_buff *skb) { return htonl(0u); }
static inline void mptcp_active_detect_blackhole(struct sock *sk, bool expired) { }
+
+static inline void mptcp_sock_set_nodelay(struct sock *sk) { } /* no-op stub for builds without CONFIG_MPTCP */
#endif /* CONFIG_MPTCP */
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index c8fcc46ed042..451bc4df4fa4 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -3806,6 +3806,23 @@ static void mptcp_sock_check_graft(struct sock *sk, struct sock *ssk)
}
}
+void mptcp_sock_set_nodelay(struct sock *sk) /* sk must be an MPTCP socket; callers check sk_is_msk() first */
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ struct mptcp_subflow_context *subflow;
+
+ lock_sock(sk); /* the msk socket lock is held across the whole subflow walk */
+ mptcp_for_each_subflow(msk, subflow) { /* NOTE(review): nothing is stored on msk, so only subflows listed now are touched - confirm later ones get TCP_NODELAY */
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+ lock_sock(ssk); /* each subflow socket is locked on its own, nested inside the msk lock */
+ __tcp_sock_set_nodelay(ssk, true); /* enable TCP_NODELAY on this subflow's TCP socket */
+ release_sock(ssk);
+ }
+ release_sock(sk);
+}
+EXPORT_SYMBOL(mptcp_sock_set_nodelay);
+
bool mptcp_finish_join(struct sock *ssk)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
--
2.53.0