From: Geliang Tang <tanggeliang@kylinos.cn>
When running the tests from a previous commit in a loop, there is roughly
a one-in-ten chance of hitting the following "timeout" error:
'''
[ 32.867710] nvme nvme0: I/O tag 1 (0001) type 4 opcode 0x2 (I/O Cmd) QID 2 timeout
[ 32.867929] nvme nvme0: starting error recovery
[ 32.867994] nvme nvme0: I/O tag 2 (0002) type 4 opcode 0x2 (I/O Cmd) QID 2 timeout
[ 32.868112] nvme nvme0: I/O tag 3 (0003) type 4 opcode 0x2 (I/O Cmd) QID 2 timeout
[ 32.868359] nvme nvme0: I/O tag 4 (0004) type 4 opcode 0x2 (I/O Cmd) QID 2 timeout
[ 32.868446] nvme nvme0: I/O tag 5 (0005) type 4 opcode 0x2 (I/O Cmd) QID 2 timeout
[ 32.868592] nvme0c0n1: I/O Cmd(0x2) @ LBA 1046528, 8 blocks, I/O Error (sct 0x3 / sc 0x70)
[ 32.868817] recoverable transport error, dev nvme0c0n1, sector 1046528 op 0x0:(READ) flags 0x2080700 phys_seg 1 prio class 0
[ 32.868976] block nvme0n1: no usable path - requeuing I/O
[ 32.869038] nvme nvme0: I/O tag 6 (0006) type 4 opcode 0x2 (I/O Cmd) QID 2 timeout
[ 32.869119] nvme nvme0: I/O tag 7 (0007) type 4 opcode 0x2 (I/O Cmd) QID 2 timeout
... ...
[ 32.877945] nvme nvme0: I/O tag 122 (107a) type 4 opcode 0x2 (I/O Cmd) QID 2 timeout
[ 32.878025] block nvme0n1: no usable path - requeuing I/O
[ 32.878079] block nvme0n1: no usable path - requeuing I/O
[ 32.878128] block nvme0n1: no usable path - requeuing I/O
[ 32.878180] block nvme0n1: no usable path - requeuing I/O
[ 32.878238] block nvme0n1: no usable path - requeuing I/O
[ 32.878296] block nvme0n1: no usable path - requeuing I/O
[ 32.878350] block nvme0n1: no usable path - requeuing I/O
[ 32.878403] block nvme0n1: no usable path - requeuing I/O
[ 32.878455] block nvme0n1: no usable path - requeuing I/O
[ 32.883603] nvme nvme0: Reconnecting in 10 seconds...
'''
Through debugging, I discovered that setting the MSG_DONTWAIT flag causes
MPTCP to return EAGAIN from mptcp_sendmsg().
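For context, this is a simplified sketch of the generic non-blocking
stream-send pattern (not the exact mptcp_sendmsg() code), showing where
the EAGAIN comes from:

	long timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
	int ret;

	/* MSG_DONTWAIT means timeo == 0, so when the send buffer is full
	 * sk_stream_wait_memory() returns -EAGAIN immediately instead of
	 * sleeping until ->sk_write_space() reports free space.
	 */
	if (!sk_stream_memory_free(sk)) {
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		ret = sk_stream_wait_memory(sk, &timeo);
		if (ret)
			return ret;	/* -EAGAIN with MSG_DONTWAIT */
	}
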
This patch works around the issue by dropping the MSG_DONTWAIT flag for
MPTCP sockets. I know this isn't an ideal fix; suggestions are welcome.
Signed-off-by: Geliang Tang <tanggeliang@kylinos.cn>
---
drivers/nvme/target/tcp.c | 10 ++++++++--
1 file changed, 8 insertions(+), 2 deletions(-)
diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
index 854b70b4a6f4..683a451bcb08 100644
--- a/drivers/nvme/target/tcp.c
+++ b/drivers/nvme/target/tcp.c
@@ -645,12 +645,15 @@ static int nvmet_try_send_data(struct nvmet_tcp_cmd *cmd, bool last_in_batch)
while (cmd->cur_sg) {
struct msghdr msg = {
- .msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES,
+ .msg_flags = MSG_SPLICE_PAGES,
};
struct page *page = sg_page(cmd->cur_sg);
struct bio_vec bvec;
u32 left = cmd->cur_sg->length - cmd->offset;
+ if (cmd->queue->sock->sk->sk_protocol != IPPROTO_MPTCP)
+ msg.msg_flags |= MSG_DONTWAIT;
+
if ((!last_in_batch && cmd->queue->send_list_len) ||
cmd->wbytes_done + left < cmd->req.transfer_len ||
queue->data_digest || !queue->nvme_sq.sqhd_disabled)
@@ -694,12 +697,15 @@ static int nvmet_try_send_data(struct nvmet_tcp_cmd *cmd, bool last_in_batch)
static int nvmet_try_send_response(struct nvmet_tcp_cmd *cmd,
bool last_in_batch)
{
- struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES, };
+ struct msghdr msg = { .msg_flags = MSG_SPLICE_PAGES, };
struct bio_vec bvec;
u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
int left = sizeof(*cmd->rsp_pdu) - cmd->offset + hdgst;
int ret;
+ if (cmd->queue->sock->sk->sk_protocol != IPPROTO_MPTCP)
+ msg.msg_flags |= MSG_DONTWAIT;
+
if (!last_in_batch && cmd->queue->send_list_len)
msg.msg_flags |= MSG_MORE;
else
--
2.43.0
On Fri, 2025-11-07 at 11:37 +0800, Geliang Tang wrote:
> From: Geliang Tang <tanggeliang@kylinos.cn>
>
> Through debugging, I discovered that setting the MSG_DONTWAIT flag causes
> MPTCP to return EAGAIN from mptcp_sendmsg().
>
> This patch works around the issue by dropping the MSG_DONTWAIT flag for
> MPTCP sockets. I know this isn't an ideal fix; suggestions are welcome.

Good news! With help from Paolo and Gang Yan, I have finally solved this
issue. It turned out that the write_space callback, which the NVMe target
overrides on its socket, was never invoked by MPTCP.

This can be fixed with the following modification:
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -973,7 +973,7 @@ static inline void mptcp_write_space(struct sock *sk)
/* pairs with memory barrier in mptcp_poll */
smp_mb();
if (mptcp_stream_memory_free(sk, 1))
- sk_stream_write_space(sk);
+ INDIRECT_CALL_1(sk->sk_write_space, sk_stream_write_space, sk);
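
For reference, this is roughly how the NVMe TCP target hooks the socket's
write_space callback; the snippet below is a simplified sketch from memory
of drivers/nvme/target/tcp.c, so the exact names may differ slightly:

	/* nvmet_tcp_set_queue_sock() (sketch): save and replace
	 * ->sk_write_space so the target can resume sending once the
	 * socket send buffer drains.
	 */
	queue->write_space = sock->sk->sk_write_space;
	sock->sk->sk_write_space = nvmet_tcp_write_space;

	/* nvmet_tcp_write_space() (sketch): re-kick the queue's io_work
	 * when the socket becomes writeable again.
	 */
	if (sk_stream_is_writeable(sk)) {
		clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		queue_work_on(queue->io_cpu, nvmet_tcp_wq, &queue->io_work);
	}

Because mptcp_write_space() called sk_stream_write_space() directly rather
than going through ->sk_write_space, the override above was bypassed, so
the target never learned that send-buffer space had become available after
an -EAGAIN and the stalled commands eventually hit the I/O timeout.
INDIRECT_CALL_1() keeps the common sk_stream_write_space() case fast while
still honouring an overridden callback.
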
I'll send out v2 of this set soon.
Thanks,
-Geliang