[RFC mptcp-next 6/6] nvmet-tcp: clear MSG_DONTWAIT for MPTCP (TODO: HELP WANTED)

Posted by Geliang Tang 2 weeks, 6 days ago
From: Geliang Tang <tanggeliang@kylinos.cn>

When running the tests from a previous commit in a loop, the following
"timeout" error is reported roughly one time in ten:

'''
[   32.867710] nvme nvme0: I/O tag 1 (0001) type 4 opcode 0x2 (I/O Cmd) QID 2 timeout
[   32.867929] nvme nvme0: starting error recovery
[   32.867994] nvme nvme0: I/O tag 2 (0002) type 4 opcode 0x2 (I/O Cmd) QID 2 timeout
[   32.868112] nvme nvme0: I/O tag 3 (0003) type 4 opcode 0x2 (I/O Cmd) QID 2 timeout
[   32.868359] nvme nvme0: I/O tag 4 (0004) type 4 opcode 0x2 (I/O Cmd) QID 2 timeout
[   32.868446] nvme nvme0: I/O tag 5 (0005) type 4 opcode 0x2 (I/O Cmd) QID 2 timeout
[   32.868592] nvme0c0n1: I/O Cmd(0x2) @ LBA 1046528, 8 blocks, I/O Error (sct 0x3 / sc 0x70)
[   32.868817] recoverable transport error, dev nvme0c0n1, sector 1046528 op 0x0:(READ) flags 0x2080700 phys_seg 1 prio class 0
[   32.868976] block nvme0n1: no usable path - requeuing I/O
[   32.869038] nvme nvme0: I/O tag 6 (0006) type 4 opcode 0x2 (I/O Cmd) QID 2 timeout
[   32.869119] nvme nvme0: I/O tag 7 (0007) type 4 opcode 0x2 (I/O Cmd) QID 2 timeout
... ...
[   32.877945] nvme nvme0: I/O tag 122 (107a) type 4 opcode 0x2 (I/O Cmd) QID 2 timeout
[   32.878025] block nvme0n1: no usable path - requeuing I/O
[   32.878079] block nvme0n1: no usable path - requeuing I/O
[   32.878128] block nvme0n1: no usable path - requeuing I/O
[   32.878180] block nvme0n1: no usable path - requeuing I/O
[   32.878238] block nvme0n1: no usable path - requeuing I/O
[   32.878296] block nvme0n1: no usable path - requeuing I/O
[   32.878350] block nvme0n1: no usable path - requeuing I/O
[   32.878403] block nvme0n1: no usable path - requeuing I/O
[   32.878455] block nvme0n1: no usable path - requeuing I/O
[   32.883603] nvme nvme0: Reconnecting in 10 seconds...
'''

Through debugging, I found that setting the MSG_DONTWAIT flag causes
mptcp_sendmsg() to return -EAGAIN.
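
To illustrate, the path being hit looks roughly like this (simplified
from mptcp_sendmsg() in net/mptcp/protocol.c; surrounding checks are
trimmed, so this is a sketch rather than a verbatim copy):

	/* non-blocking: timeo is 0 when MSG_DONTWAIT is set */
	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
	...
wait_for_memory:
	...
	ret = sk_stream_wait_memory(sk, &timeo);
	/* with a zero timeout sk_stream_wait_memory() gives up
	 * immediately and returns -EAGAIN
	 */
	if (ret)
		goto do_error;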

This patch works around the problem by dropping the MSG_DONTWAIT flag
when the socket is MPTCP. I know this isn't an ideal fix; suggestions
are welcome.

Signed-off-by: Geliang Tang <tanggeliang@kylinos.cn>
---
 drivers/nvme/target/tcp.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
index 854b70b4a6f4..683a451bcb08 100644
--- a/drivers/nvme/target/tcp.c
+++ b/drivers/nvme/target/tcp.c
@@ -645,12 +645,15 @@ static int nvmet_try_send_data(struct nvmet_tcp_cmd *cmd, bool last_in_batch)
 
 	while (cmd->cur_sg) {
 		struct msghdr msg = {
-			.msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES,
+			.msg_flags = MSG_SPLICE_PAGES,
 		};
 		struct page *page = sg_page(cmd->cur_sg);
 		struct bio_vec bvec;
 		u32 left = cmd->cur_sg->length - cmd->offset;
 
+		if (cmd->queue->sock->sk->sk_protocol != IPPROTO_MPTCP)
+			msg.msg_flags |= MSG_DONTWAIT;
+
 		if ((!last_in_batch && cmd->queue->send_list_len) ||
 		    cmd->wbytes_done + left < cmd->req.transfer_len ||
 		    queue->data_digest || !queue->nvme_sq.sqhd_disabled)
@@ -694,12 +697,15 @@ static int nvmet_try_send_data(struct nvmet_tcp_cmd *cmd, bool last_in_batch)
 static int nvmet_try_send_response(struct nvmet_tcp_cmd *cmd,
 		bool last_in_batch)
 {
-	struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES, };
+	struct msghdr msg = { .msg_flags = MSG_SPLICE_PAGES, };
 	struct bio_vec bvec;
 	u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
 	int left = sizeof(*cmd->rsp_pdu) - cmd->offset + hdgst;
 	int ret;
 
+	if (cmd->queue->sock->sk->sk_protocol != IPPROTO_MPTCP)
+		msg.msg_flags |= MSG_DONTWAIT;
+
 	if (!last_in_batch && cmd->queue->send_list_len)
 		msg.msg_flags |= MSG_MORE;
 	else
-- 
2.43.0
Re: [RFC mptcp-next 6/6] nvmet-tcp: clear MSG_DONTWAIT for MPTCP (TODO: HELP WANTED)
Posted by Geliang Tang 1 day, 2 hours ago
On Fri, 2025-11-07 at 11:37 +0800, Geliang Tang wrote:
> From: Geliang Tang <tanggeliang@kylinos.cn>
> 
> [...]
> 
> Through debugging, I found that setting the MSG_DONTWAIT flag causes
> mptcp_sendmsg() to return -EAGAIN.
> 
> This patch works around the problem by dropping the MSG_DONTWAIT flag
> when the socket is MPTCP. I know this isn't an ideal fix; suggestions
> are welcome.
Good news! With help from Paolo and Gang Yan, I have finally tracked
this down. It turned out that the write_space callback overridden by
nvmet-tcp was never invoked under MPTCP: mptcp_write_space() calls
sk_stream_write_space() directly instead of going through the socket's
sk_write_space pointer.
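
For context, nvmet-tcp saves and overrides the socket callbacks when it
sets up a queue, roughly like this (simplified from
nvmet_tcp_set_queue_sock() in drivers/nvme/target/tcp.c; the other
callback overrides are omitted):

	write_lock_bh(&sock->sk->sk_callback_lock);
	/* save the original callback and install our own;
	 * nvmet_tcp_write_space() requeues io_work so that sending
	 * resumes once buffer space becomes available again.
	 */
	queue->write_space = sock->sk->sk_write_space;
	sock->sk->sk_write_space = nvmet_tcp_write_space;
	write_unlock_bh(&sock->sk->sk_callback_lock);

Since mptcp_write_space() never goes through sk->sk_write_space, this
override is simply skipped on MPTCP sockets.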

This can be fixed with the following change to mptcp_write_space():

--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -973,7 +973,7 @@ static inline void mptcp_write_space(struct sock *sk)
 	/* pairs with memory barrier in mptcp_poll */
 	smp_mb();
 	if (mptcp_stream_memory_free(sk, 1))
-		sk_stream_write_space(sk);
+		INDIRECT_CALL_1(sk->sk_write_space, sk_stream_write_space, sk);

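For reference, INDIRECT_CALL_1() (include/linux/indirect_call_wrapper.h)
keeps the common case a direct call while still honoring an overridden
callback; with retpolines enabled it is roughly:

	/* If sk_write_space is still the default, call
	 * sk_stream_write_space() directly; otherwise go through the
	 * pointer, which is what lets nvmet_tcp_write_space() run.
	 */
	#define INDIRECT_CALL_1(f, f1, ...)				\
		(likely(f == f1) ? f1(__VA_ARGS__) : f(__VA_ARGS__))

Without retpolines it collapses to a plain indirect call through f.
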
I'll send out v2 of this set soon.

Thanks,
-Geliang
