From nobody Mon May 25 18:05:07 2026 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id BAC8F3FB7D1 for ; Mon, 18 May 2026 12:07:55 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1779106075; cv=none; b=TzT/JWh6Ritu8ihxLvPMBjizmXR7GbUQZqB4cjxmhEOevJ42LilDCr5gevfCC2RPVBDWL2lVg5IN6BN7X/HeNOWrHstvVS1W1NKd9Jkb84TteeOGMP1CZe3oHjj7hZKEA8DnxOerxH7eCCdXALSxqVrytWkVTKRmrsOynSfNhw0= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1779106075; c=relaxed/simple; bh=rwnGRzDABlIFb9UDOeW/VgtQjT7t1yElAKQ20huo3fI=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=JnEbwyxdV/XS/J+BVdCFpdw7i27FgZNMvnOqg8YURRjLyZXwtqbFZo2GO2Cjbclb279VEF2fjdmPFgGCUYQsIfYGfTVWj1vizeHRGjjH9V1MkXYFabrFkp4qTxRrYmMAadYX5wZB4YS2aLRHcsQ6hcaZEKsowbu5zLzsCom+vYc= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=nuyewlO4; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="nuyewlO4" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 3F297C2BCC6; Mon, 18 May 2026 12:07:52 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1779106075; bh=rwnGRzDABlIFb9UDOeW/VgtQjT7t1yElAKQ20huo3fI=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=nuyewlO4xnnL/i6Z0RhdW6RsFQYCLMUP2UjDS6+O8fZRqmSpCRMyTAnydKBUM1cbs 7aCBJa0Ed2VpVvStwIRClW4Yx60Z49T0r6/ycv82Q6JdqHh03ZGLNHuMoyq3hFkegl OSD5rjBavcVKgoXmC1xfOYOVLU6zKZON7dkbqwxKPRNMEXFF5c0rLm2sfDWDxRN7ih 
Y4qYKZKapKYsSXBZB3/OICN0czOr7bAvz3gle8PJGlz+YLAmS4rOjLpLicPvc6mMBo zlDLuwtKTfYVW9oV/Zn+5IT8GH7HvwVDRMqlEF4tUP/yY5Dyt5WNdpAC33E/JA6DOn cjgBooaGd16hQ== From: Geliang Tang To: mptcp@lists.linux.dev Cc: Geliang Tang , zhenwei pi , Hui Zhu , Gang Yan Subject: [RFC mptcp-next v13 01/12] nvmet-tcp: use tr_ops from nport and add port refcounting Date: Mon, 18 May 2026 20:07:26 +0800 Message-ID: X-Mailer: git-send-email 2.53.0 In-Reply-To: References: Precedence: bulk X-Mailing-List: mptcp@lists.linux.dev List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" From: Geliang Tang Replace hardcoded nvmet_tcp_ops with queue->port->nport->tr_ops in nvmet_tcp_done_recv_pdu() to support multiple transport types. Add kref reference counting to struct nvmet_tcp_port to prevent use-after-free when a port is removed while queues still hold references to it. Cc: zhenwei pi Cc: Hui Zhu Cc: Gang Yan Signed-off-by: Geliang Tang --- drivers/nvme/target/tcp.c | 35 +++++++++++++++++++++++++++++++---- 1 file changed, 31 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 164a564ba3b4..7fc37ce2c050 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -202,6 +202,7 @@ struct nvmet_tcp_port { struct socket *sock; struct work_struct accept_work; struct nvmet_port *nport; + struct kref kref; struct sockaddr_storage addr; void (*data_ready)(struct sock *); }; @@ -211,7 +212,6 @@ static LIST_HEAD(nvmet_tcp_queue_list); static DEFINE_MUTEX(nvmet_tcp_queue_mutex); =20 static struct workqueue_struct *nvmet_tcp_wq; -static const struct nvmet_fabrics_ops nvmet_tcp_ops; static void nvmet_tcp_free_cmd(struct nvmet_tcp_cmd *c); static void nvmet_tcp_free_cmd_buffers(struct nvmet_tcp_cmd *cmd); =20 @@ -1081,7 +1081,8 @@ static int nvmet_tcp_done_recv_pdu(struct nvmet_tcp_q= ueue *queue) req =3D &queue->cmd->req; memcpy(req->cmd, nvme_cmd, 
sizeof(*nvme_cmd)); =20 - if (unlikely(!nvmet_req_init(req, &queue->nvme_sq, &nvmet_tcp_ops))) { + if (unlikely(!nvmet_req_init(req, &queue->nvme_sq, + queue->port->nport->tr_ops))) { pr_err("failed cmd %p id %d opcode %d, data_len: %d, status: %04x\n", req->cmd, req->cmd->common.command_id, req->cmd->common.opcode, @@ -1597,6 +1598,21 @@ static void nvmet_tcp_free_cmd_data_in_buffers(struc= t nvmet_tcp_queue *queue) nvmet_tcp_free_cmd_buffers(&queue->connect); } =20 +static void nvmet_tcp_port_release(struct kref *kref) +{ + struct nvmet_tcp_port *port =3D container_of(kref, + struct nvmet_tcp_port, + kref); + + kfree(port); +} + +static void nvmet_tcp_port_put(struct nvmet_tcp_port *port) +{ + if (port) + kref_put(&port->kref, nvmet_tcp_port_release); +} + static void nvmet_tcp_release_queue_work(struct work_struct *w) { struct nvmet_tcp_queue *queue =3D @@ -1623,6 +1639,8 @@ static void nvmet_tcp_release_queue_work(struct work_= struct *w) nvmet_tcp_free_cmds(queue); ida_free(&nvmet_tcp_queue_ida, queue->idx); page_frag_cache_drain(&queue->pf_cache); + nvmet_tcp_port_put(queue->port); + queue->port =3D NULL; kfree(queue); } =20 @@ -1904,6 +1922,13 @@ static int nvmet_tcp_tls_handshake(struct nvmet_tcp_= queue *queue) static void nvmet_tcp_tls_handshake_timeout(struct work_struct *w) {} #endif =20 +static struct nvmet_tcp_port *nvmet_tcp_port_get(struct nvmet_tcp_port *po= rt) +{ + if (port) + kref_get(&port->kref); + return port; +} + static void nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port, struct socket *newsock) { @@ -1921,7 +1946,7 @@ static void nvmet_tcp_alloc_queue(struct nvmet_tcp_po= rt *port, INIT_WORK(&queue->io_work, nvmet_tcp_io_work); kref_init(&queue->kref); queue->sock =3D newsock; - queue->port =3D port; + queue->port =3D nvmet_tcp_port_get(port); queue->nr_cmds =3D 0; spin_lock_init(&queue->state_lock); if (queue->port->nport->disc_addr.tsas.tcp.sectype =3D=3D @@ -2051,6 +2076,8 @@ static int nvmet_tcp_add_port(struct nvmet_port *npor= t) 
if (!port) return -ENOMEM; =20 + kref_init(&port->kref); + switch (nport->disc_addr.adrfam) { case NVMF_ADDR_FAMILY_IP4: af =3D AF_INET; @@ -2146,7 +2173,7 @@ static void nvmet_tcp_remove_port(struct nvmet_port *= nport) nvmet_tcp_destroy_port_queues(port); =20 sock_release(port->sock); - kfree(port); + nvmet_tcp_port_put(port); } =20 static void nvmet_tcp_delete_ctrl(struct nvmet_ctrl *ctrl) --=20 2.53.0 From nobody Mon May 25 18:05:07 2026 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 93D5D3FBB46 for ; Mon, 18 May 2026 12:07:58 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1779106078; cv=none; b=QOvxisFDwQC7n+9jzKqMsvpBwG4jAW9fhHndkKUAVmILq1W3yiVMl3aVaBR23roscAM7blk9/RxXIogiHui1j84vys0uM3L8tcXTFvdXWd3d/+wDVySpdXx2sHbjb41Y+WcrxjNhiNWu8eto0ecXntqY5hSxHX1keYylyl8xdSs= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1779106078; c=relaxed/simple; bh=3YUspg6uIbaeDumRMWuDrGn+dKwQhPe3oM9mXxiQdys=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=CQ5Q7LmL1f4C6EA5w40VtGggls+qrX4W/FwPRB3Yb0IMQ0qlIrjHdPJECi0vpcAEf19qI+Y1aJPbAccIorsUPEfLykkNGH7zyPG3h5M3OG3eeDBhvOZLDLlXdZvuVkisu0zcV/xYCCpUKVmhaAhUS0pjBPOv1YvU2Hur47npNB4= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=U32YwxIs; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="U32YwxIs" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 37F8BC2BCB8; Mon, 18 May 2026 12:07:55 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; 
d=kernel.org; s=k20201202; t=1779106078; bh=3YUspg6uIbaeDumRMWuDrGn+dKwQhPe3oM9mXxiQdys=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=U32YwxIsnZ/pp3rIAmXu8kK0bQGKiEaptAHRH1aOUvi7mMMj86gKXBkIo0kQ4RFhy L/NmWYqRn67/E2zb+kN12PP7xrukXBUEM+G8CyYxz+qT9J8FRMF/MOEkp2IaeMtpnK I2n9GM/+9LHlovZYAstBjWtoGP9LWaaNXH5maLxhLCJ78muBTDNMxFNF5f+5vLM2wv /9ipD1PlFhWk9sopksJ0XY3HjHMX9an6AIiaImK70ZaDj1q1tKJ2tPIL3qTqo7PT51 CIRTwMTzbn7lYN+EwwK6KknwQvTLqRTlFepQ6MNFStPZnu3TawZxQXlOgktXf/P4SD nAwCIpXshDGNg== From: Geliang Tang To: mptcp@lists.linux.dev Cc: Geliang Tang , zhenwei pi , Hui Zhu , Gang Yan Subject: [RFC mptcp-next v13 02/12] nvmet-tcp: define target tcp_proto struct Date: Mon, 18 May 2026 20:07:27 +0800 Message-ID: <157ffe3dbecf3779c2633b493241180753f5e9e8.1779104752.git.tanggeliang@kylinos.cn> X-Mailer: git-send-email 2.53.0 In-Reply-To: References: Precedence: bulk X-Mailing-List: mptcp@lists.linux.dev List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" From: Geliang Tang To add MPTCP support in "NVMe over TCP", the target side needs to pass IPPROTO_MPTCP to sock_create() instead of IPPROTO_TCP to create an MPTCP socket. Additionally, the setsockopt operations for this socket need to be switched to a set of MPTCP-specific functions. This patch defines the nvmet_tcp_proto structure, which contains the protocol of the socket and a set of function pointers for these socket operations. A TCP-specific version of struct nvmet_tcp_proto is defined. In nvmet_tcp_add_port(), a local proto variable is set based on trtype. All locations that previously called TCP setsockopt functions are updated to call the corresponding function pointers in the nvmet_tcp_proto structure. A proto field is added to struct nvmet_tcp_port, which points to the appropriate protocol structure. The proto pointer is set during port creation and remains valid for the lifetime of the port. 
Co-developed-by: zhenwei pi Signed-off-by: zhenwei pi Co-developed-by: Hui Zhu Signed-off-by: Hui Zhu Co-developed-by: Gang Yan Signed-off-by: Gang Yan Signed-off-by: Geliang Tang --- drivers/nvme/target/tcp.c | 50 ++++++++++++++++++++++++++++++++------- 1 file changed, 41 insertions(+), 9 deletions(-) diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 7fc37ce2c050..4ba6bc9480af 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -198,6 +198,15 @@ struct nvmet_tcp_queue { void (*write_space)(struct sock *); }; =20 +struct nvmet_tcp_proto { + int protocol; + void (*set_reuseaddr)(struct sock *sk); + void (*set_nodelay)(struct sock *sk); + void (*set_priority)(struct sock *sk, u32 priority); + void (*no_linger)(struct sock *sk); + void (*set_tos)(struct sock *sk); +}; + struct nvmet_tcp_port { struct socket *sock; struct work_struct accept_work; @@ -205,6 +214,7 @@ struct nvmet_tcp_port { struct kref kref; struct sockaddr_storage addr; void (*data_ready)(struct sock *); + const struct nvmet_tcp_proto *proto; }; =20 static DEFINE_IDA(nvmet_tcp_queue_ida); @@ -1714,7 +1724,6 @@ static void nvmet_tcp_state_change(struct sock *sk) static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue) { struct socket *sock =3D queue->sock; - struct inet_sock *inet =3D inet_sk(sock->sk); int ret; =20 ret =3D kernel_getsockname(sock, @@ -1732,14 +1741,13 @@ static int nvmet_tcp_set_queue_sock(struct nvmet_tc= p_queue *queue) * close. This is done to prevent stale data from being sent should * the network connection be restored before TCP times out. 
*/ - sock_no_linger(sock->sk); + queue->port->proto->no_linger(sock->sk); =20 if (so_priority > 0) - sock_set_priority(sock->sk, so_priority); + queue->port->proto->set_priority(sock->sk, so_priority); =20 /* Set socket type of service */ - if (inet->rcv_tos > 0) - ip_sock_set_tos(sock->sk, inet->rcv_tos); + queue->port->proto->set_tos(sock->sk); =20 ret =3D 0; write_lock_bh(&sock->sk->sk_callback_lock); @@ -2066,6 +2074,23 @@ static void nvmet_tcp_listen_data_ready(struct sock = *sk) read_unlock_bh(&sk->sk_callback_lock); } =20 +static void tcp_sock_set_tos(struct sock *sk) +{ + struct inet_sock *inet =3D inet_sk(sk); + + if (inet->rcv_tos > 0) + ip_sock_set_tos(sk, inet->rcv_tos); +} + +static const struct nvmet_tcp_proto nvmet_tcp_proto =3D { + .protocol =3D IPPROTO_TCP, + .set_reuseaddr =3D sock_set_reuseaddr, + .set_nodelay =3D tcp_sock_set_nodelay, + .set_priority =3D sock_set_priority, + .no_linger =3D sock_no_linger, + .set_tos =3D tcp_sock_set_tos, +}; + static int nvmet_tcp_add_port(struct nvmet_port *nport) { struct nvmet_tcp_port *port; @@ -2092,6 +2117,13 @@ static int nvmet_tcp_add_port(struct nvmet_port *npo= rt) goto err_port; } =20 + if (nport->disc_addr.trtype =3D=3D NVMF_TRTYPE_TCP) { + port->proto =3D &nvmet_tcp_proto; + } else { + ret =3D -EINVAL; + goto err_port; + } + ret =3D inet_pton_with_scope(&init_net, af, nport->disc_addr.traddr, nport->disc_addr.trsvcid, &port->addr); if (ret) { @@ -2106,7 +2138,7 @@ static int nvmet_tcp_add_port(struct nvmet_port *npor= t) port->nport->inline_data_size =3D NVMET_TCP_DEF_INLINE_DATA_SIZE; =20 ret =3D sock_create(port->addr.ss_family, SOCK_STREAM, - IPPROTO_TCP, &port->sock); + port->proto->protocol, &port->sock); if (ret) { pr_err("failed to create a socket\n"); goto err_port; @@ -2115,10 +2147,10 @@ static int nvmet_tcp_add_port(struct nvmet_port *np= ort) port->sock->sk->sk_user_data =3D port; port->data_ready =3D port->sock->sk->sk_data_ready; port->sock->sk->sk_data_ready =3D 
nvmet_tcp_listen_data_ready; - sock_set_reuseaddr(port->sock->sk); - tcp_sock_set_nodelay(port->sock->sk); + port->proto->set_reuseaddr(port->sock->sk); + port->proto->set_nodelay(port->sock->sk); if (so_priority > 0) - sock_set_priority(port->sock->sk, so_priority); + port->proto->set_priority(port->sock->sk, so_priority); =20 ret =3D kernel_bind(port->sock, (struct sockaddr_unsized *)&port->addr, sizeof(port->addr)); --=20 2.53.0 From nobody Mon May 25 18:05:07 2026 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 027D33502AA for ; Mon, 18 May 2026 12:08:01 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1779106081; cv=none; b=GNC3zLRMCjPvqOJw2RseYV5KsMViQ/cpbmWP7qKrf/kM+VGBwXLbSlazWMnnS/REZUcWAHEWqMFimEcaaAH1dhynrA8HlByiRFiz2bTyYynekFv/MquIKkkFoeDe3A4BCDDGsmKVwKkEdunCRo06k/IlAbvBNVDv+JQgUV/yR5E= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1779106081; c=relaxed/simple; bh=pCG/yoaHGCNW3lzROBi1LcHwCo8c6BLgvHyRuyXVkNY=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=tLycSH3XZXbivLdhalEue9EqdfcQnWdBOq7eJS1IafNtM/MhnUv023rF+a+jZjzj392IMTpxPYseAiSmSsd44A2Ux5TzyipZjrT0eh5tAhFDc6OOfGklrpbAF9MDxioIgr8BF2WKwFo8YQT7OjozchV7okEtwWyYywuy1PJpxw0= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=Cxth8IwA; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="Cxth8IwA" Received: by smtp.kernel.org (Postfix) with ESMTPSA id D9097C2BCB8; Mon, 18 May 2026 12:07:58 +0000 (UTC) DKIM-Signature: v=1; 
a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1779106080; bh=pCG/yoaHGCNW3lzROBi1LcHwCo8c6BLgvHyRuyXVkNY=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=Cxth8IwAos/Et0DukaeoHI4eUWfRRD0Bmmw1+28rjY0k3uK3iRWVBz7ogy5NbegJV ykpMJUUU54qTZQB6p7vzrG9qPVaC8SH+4WhBcOaB/vOxL0G+K9EBpEYQHn97UmO9b6 ac48mnYGBgEoVULx622AImKWvMi0jaX9eUoN1NSZsF58Dn3rmdxt0XtYlQ5Za1w8Av NgO5bnnCcN9c+xpFjlKWjCTA2Y0f3yTaqrZF23bZJJx5QGaTayCii9rsX77MfhPI7m 7k53WrSlMRTV1bA6tOe3xv+xxJ/Q1NVjGjAMTmitMEDpLRrONPOwUmTG078Ympenz1 gqFNrtza/zxbQ== From: Geliang Tang To: mptcp@lists.linux.dev Cc: Geliang Tang , zhenwei pi , Hui Zhu , Gang Yan Subject: [RFC mptcp-next v13 03/12] nvmet-tcp: register target mptcp transport Date: Mon, 18 May 2026 20:07:28 +0800 Message-ID: X-Mailer: git-send-email 2.53.0 In-Reply-To: References: Precedence: bulk X-Mailing-List: mptcp@lists.linux.dev List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" From: Geliang Tang This patch adds a new nvme target transport type NVMF_TRTYPE_MPTCP for MPTCP. And defines a new nvmet_fabrics_ops named nvmet_mptcp_ops, which is almost the same as nvmet_tcp_ops except .type. It is registered in nvmet_tcp_init() and unregistered in nvmet_tcp_exit(). A MODULE_ALIAS for "nvmet-transport-4" is also added. Note: NVMF_TRTYPE_MPTCP is temporarily assigned 4, a value currently reserved in the NVMe over Fabrics specification. A request will be submitted to the NVMe working group to officially allocate this value for MPTCP. v2: - use trtype instead of tsas (Hannes). v3: - check mptcp protocol from disc_addr.trtype instead of passing a parameter (Hannes). v4: - check CONFIG_MPTCP. 
Co-developed-by: zhenwei pi Signed-off-by: zhenwei pi Co-developed-by: Hui Zhu Signed-off-by: Hui Zhu Co-developed-by: Gang Yan Signed-off-by: Gang Yan Signed-off-by: Geliang Tang --- drivers/nvme/target/configfs.c | 1 + drivers/nvme/target/tcp.c | 29 +++++++++++++++++++++++++++++ include/linux/nvme.h | 1 + 3 files changed, 31 insertions(+) diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index b88f897f06e2..51fc0f4d0c32 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -37,6 +37,7 @@ static struct nvmet_type_name_map nvmet_transport[] =3D { { NVMF_TRTYPE_RDMA, "rdma" }, { NVMF_TRTYPE_FC, "fc" }, { NVMF_TRTYPE_TCP, "tcp" }, + { NVMF_TRTYPE_MPTCP, "mptcp" }, { NVMF_TRTYPE_PCI, "pci" }, { NVMF_TRTYPE_LOOP, "loop" }, }; diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 4ba6bc9480af..745c939498fa 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -2290,6 +2290,23 @@ static const struct nvmet_fabrics_ops nvmet_tcp_ops = =3D { .host_traddr =3D nvmet_tcp_host_port_addr, }; =20 +#ifdef CONFIG_MPTCP +static bool nvmet_mptcp_registered; + +static const struct nvmet_fabrics_ops nvmet_mptcp_ops =3D { + .owner =3D THIS_MODULE, + .type =3D NVMF_TRTYPE_MPTCP, + .msdbd =3D 1, + .add_port =3D nvmet_tcp_add_port, + .remove_port =3D nvmet_tcp_remove_port, + .queue_response =3D nvmet_tcp_queue_response, + .delete_ctrl =3D nvmet_tcp_delete_ctrl, + .install_queue =3D nvmet_tcp_install_queue, + .disc_traddr =3D nvmet_tcp_disc_port_addr, + .host_traddr =3D nvmet_tcp_host_port_addr, +}; +#endif + static int __init nvmet_tcp_init(void) { int ret; @@ -2303,6 +2320,11 @@ static int __init nvmet_tcp_init(void) if (ret) goto err; =20 +#ifdef CONFIG_MPTCP + if (!nvmet_register_transport(&nvmet_mptcp_ops)) + nvmet_mptcp_registered =3D true; +#endif + return 0; err: destroy_workqueue(nvmet_tcp_wq); @@ -2313,6 +2335,10 @@ static void __exit nvmet_tcp_exit(void) { struct nvmet_tcp_queue 
*queue; =20 +#ifdef CONFIG_MPTCP + if (nvmet_mptcp_registered) + nvmet_unregister_transport(&nvmet_mptcp_ops); +#endif nvmet_unregister_transport(&nvmet_tcp_ops); =20 flush_workqueue(nvmet_wq); @@ -2332,3 +2358,6 @@ module_exit(nvmet_tcp_exit); MODULE_DESCRIPTION("NVMe target TCP transport driver"); MODULE_LICENSE("GPL v2"); MODULE_ALIAS("nvmet-transport-3"); /* 3 =3D=3D NVMF_TRTYPE_TCP */ +#ifdef CONFIG_MPTCP +MODULE_ALIAS("nvmet-transport-4"); /* 4 =3D=3D NVMF_TRTYPE_MPTCP */ +#endif diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 041f30931a90..0eada1e0c652 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -68,6 +68,7 @@ enum { NVMF_TRTYPE_RDMA =3D 1, /* RDMA */ NVMF_TRTYPE_FC =3D 2, /* Fibre Channel */ NVMF_TRTYPE_TCP =3D 3, /* TCP/IP */ + NVMF_TRTYPE_MPTCP =3D 4, /* Multipath TCP */ NVMF_TRTYPE_LOOP =3D 254, /* Reserved for host usage */ NVMF_TRTYPE_MAX, }; --=20 2.53.0 From nobody Mon May 25 18:05:07 2026 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id E6E213FBB4E for ; Mon, 18 May 2026 12:08:03 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1779106084; cv=none; b=M/uhtzgyYxhSHAzyl4LfDcKotLvS0V48NuygDyNCMm+4SqgXsiiprfBXU2kttKfIqgj/+o5qmuyoFHSyC8DN4BB9OCKO+AwCxzHJyoUf0Hx2IMtofxbnr1JZlJnn8YoMbmYz3Ly8KCl8Rx56UT8dO8nyLgFfi3hPd/UfdOL4nJI= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1779106084; c=relaxed/simple; bh=9HbGYzw9yj0MAog2AYmMzk08rWJqwlnr2j5Z7TMhZoY=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; 
b=P8+TjiFYN+CswE/oq26BAzbaPLDWltNgvCv7rajmokFfN+QfuzHv0NN2/2YTDqbDF1HYS0/ZXnRjWcCBRAxLoCt07wthSW31zh2jRxkSmDZR9kZ973qkEjj9kmhiDMTaggE5oVBI12Z05/N9eUORypC0r1cRYylLepRALy1gsu4= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=inYTiKK7; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="inYTiKK7" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 87865C2BCC6; Mon, 18 May 2026 12:08:01 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1779106083; bh=9HbGYzw9yj0MAog2AYmMzk08rWJqwlnr2j5Z7TMhZoY=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=inYTiKK7wLBQmDoYTJOgfaPj88VL+Pv2Pl0Ny2yOjg+5+qZjZ2JcdNFIqbbBuOK63 GkiLiOdtuWgfCSgpRY/cANQYueNcluFmjIowrljsTsSJxEXOEbgdApG5CMP64UEWSd V+IN6QWtBmXXIDte6SZtTiBtACoFUKSSslHo4M4iahnQI+WdLjb/KJBVPbqfK4WqYY ERbiXIdSgyZs6Akjq9/bG4ileAc+Jn5ysZuEOut/aVu19WZL2JrsGBe9W9UPKM+TJk 4YO1zn8VJzDcUnGWUl1ZATP7xBPEYihkTl2dwnDWGFMDqCE3cbSwWr7VwXxXSyrm4k ll4T/OzD0MxYQ== From: Geliang Tang To: mptcp@lists.linux.dev Cc: Geliang Tang , zhenwei pi , Hui Zhu , Gang Yan Subject: [RFC mptcp-next v13 04/12] nvmet-tcp: implement target mptcp proto Date: Mon, 18 May 2026 20:07:29 +0800 Message-ID: X-Mailer: git-send-email 2.53.0 In-Reply-To: References: Precedence: bulk X-Mailing-List: mptcp@lists.linux.dev List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" From: Geliang Tang This patch introduces a new NVMe target transport type NVMF_TRTYPE_MPTCP to support MPTCP. An MPTCP-specific version of struct nvmet_tcp_proto is implemented, and it is assigned to port->proto in nvmet_tcp_add_port() when the transport type is MPTCP (based on nport->disc_addr.trtype). Dedicated MPTCP helpers are introduced for setting socket options. 
Most of these helpers (set_nodelay, set_priority, no_linger, set_tos) set the values on all existing subflows using mptcp_for_each_subflow(). The set_reuseaddr helper only applies to the first subflow. The values are then synchronized to other newly created subflows in sync_socket_options(). Co-developed-by: zhenwei pi Signed-off-by: zhenwei pi Co-developed-by: Hui Zhu Signed-off-by: Hui Zhu Co-developed-by: Gang Yan Signed-off-by: Gang Yan Signed-off-by: Geliang Tang --- drivers/nvme/target/tcp.c | 15 ++++ include/net/mptcp.h | 20 +++++ net/mptcp/sockopt.c | 153 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 188 insertions(+) diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 745c939498fa..73afaf3562a6 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -2091,6 +2091,17 @@ static const struct nvmet_tcp_proto nvmet_tcp_proto = =3D { .set_tos =3D tcp_sock_set_tos, }; =20 +#ifdef CONFIG_MPTCP +static const struct nvmet_tcp_proto nvmet_mptcp_proto =3D { + .protocol =3D IPPROTO_MPTCP, + .set_reuseaddr =3D mptcp_sock_set_reuseaddr, + .set_nodelay =3D mptcp_sock_set_nodelay, + .set_priority =3D mptcp_sock_set_priority, + .no_linger =3D mptcp_sock_no_linger, + .set_tos =3D mptcp_sock_set_tos, +}; +#endif + static int nvmet_tcp_add_port(struct nvmet_port *nport) { struct nvmet_tcp_port *port; @@ -2119,6 +2130,10 @@ static int nvmet_tcp_add_port(struct nvmet_port *npo= rt) =20 if (nport->disc_addr.trtype =3D=3D NVMF_TRTYPE_TCP) { port->proto =3D &nvmet_tcp_proto; +#ifdef CONFIG_MPTCP + } else if (nport->disc_addr.trtype =3D=3D NVMF_TRTYPE_MPTCP) { + port->proto =3D &nvmet_mptcp_proto; +#endif } else { ret =3D -EINVAL; goto err_port; diff --git a/include/net/mptcp.h b/include/net/mptcp.h index 4cf59e83c1c5..8eacb9424b37 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -237,6 +237,16 @@ static inline __be32 mptcp_reset_option(const struct s= k_buff *skb) } =20 void mptcp_active_detect_blackhole(struct sock *sk, 
bool expired); + +void mptcp_sock_set_reuseaddr(struct sock *sk); + +void mptcp_sock_set_nodelay(struct sock *sk); + +void mptcp_sock_set_priority(struct sock *sk, u32 priority); + +void mptcp_sock_no_linger(struct sock *sk); + +void mptcp_sock_set_tos(struct sock *sk); #else =20 static inline void mptcp_init(void) @@ -323,6 +333,16 @@ static inline struct request_sock *mptcp_subflow_reqsk= _alloc(const struct reques static inline __be32 mptcp_reset_option(const struct sk_buff *skb) { retu= rn htonl(0u); } =20 static inline void mptcp_active_detect_blackhole(struct sock *sk, bool exp= ired) { } + +static inline void mptcp_sock_set_reuseaddr(struct sock *sk) { } + +static inline void mptcp_sock_set_nodelay(struct sock *sk) { } + +static inline void mptcp_sock_set_priority(struct sock *sk, u32 priority) = { } + +static inline void mptcp_sock_no_linger(struct sock *sk) { } + +static inline void mptcp_sock_set_tos(struct sock *sk) { } #endif /* CONFIG_MPTCP */ =20 #if IS_ENABLED(CONFIG_MPTCP_IPV6) diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index 87b5796d0135..be245babef30 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -1596,6 +1596,8 @@ static void sync_socket_options(struct mptcp_sock *ms= k, struct sock *ssk) inet_assign_bit(FREEBIND, ssk, inet_test_bit(FREEBIND, sk)); inet_assign_bit(BIND_ADDRESS_NO_PORT, ssk, inet_test_bit(BIND_ADDRESS_NO_= PORT, sk)); WRITE_ONCE(inet_sk(ssk)->local_port_range, READ_ONCE(inet_sk(sk)->local_p= ort_range)); + + ssk->sk_reuse =3D sk->sk_reuse; } =20 void mptcp_sockopt_sync_locked(struct mptcp_sock *msk, struct sock *ssk) @@ -1662,3 +1664,154 @@ int mptcp_set_rcvlowat(struct sock *sk, int val) } return 0; } + +void mptcp_sock_set_reuseaddr(struct sock *sk) +{ + struct mptcp_sock *msk; + struct sock *ssk; + + if (sk->sk_protocol !=3D IPPROTO_MPTCP) + return; + + msk =3D mptcp_sk(sk); + + lock_sock(sk); + sockopt_seq_inc(msk); + sk->sk_reuse =3D SK_CAN_REUSE; + ssk =3D __mptcp_nmpc_sk(msk); + if (IS_ERR(ssk)) + 
goto unlock; + lock_sock_nested(ssk, SINGLE_DEPTH_NESTING); + ssk->sk_reuse =3D SK_CAN_REUSE; + release_sock(ssk); +unlock: + release_sock(sk); +} +EXPORT_SYMBOL(mptcp_sock_set_reuseaddr); + +void mptcp_sock_set_nodelay(struct sock *sk) +{ + struct mptcp_subflow_context *subflow; + struct mptcp_sock *msk; + struct sock *ssk; + + if (sk->sk_protocol !=3D IPPROTO_MPTCP) + return; + + msk =3D mptcp_sk(sk); + + lock_sock(sk); + sockopt_seq_inc(msk); + msk->nodelay =3D true; + mptcp_for_each_subflow(msk, subflow) { + ssk =3D mptcp_subflow_tcp_sock(subflow); + if (ssk) { + lock_sock_nested(ssk, SINGLE_DEPTH_NESTING); + __tcp_sock_set_nodelay(ssk, true); + release_sock(ssk); + } + } + release_sock(sk); +} +EXPORT_SYMBOL(mptcp_sock_set_nodelay); + +void mptcp_sock_set_priority(struct sock *sk, u32 priority) +{ + struct mptcp_subflow_context *subflow; + struct mptcp_sock *msk; + struct sock *ssk; + + if (sk->sk_protocol !=3D IPPROTO_MPTCP) + return; + + msk =3D mptcp_sk(sk); + + lock_sock(sk); + sockopt_seq_inc(msk); + sock_set_priority(sk, priority); + mptcp_for_each_subflow(msk, subflow) { + ssk =3D mptcp_subflow_tcp_sock(subflow); + if (ssk) { + lock_sock_nested(ssk, SINGLE_DEPTH_NESTING); + sock_set_priority(ssk, priority); + release_sock(ssk); + } + } + release_sock(sk); +} +EXPORT_SYMBOL(mptcp_sock_set_priority); + +void mptcp_sock_no_linger(struct sock *sk) +{ + struct mptcp_subflow_context *subflow; + struct mptcp_sock *msk; + struct sock *ssk; + + if (sk->sk_protocol !=3D IPPROTO_MPTCP) + return; + + msk =3D mptcp_sk(sk); + + lock_sock(sk); + sockopt_seq_inc(msk); + WRITE_ONCE(sk->sk_lingertime, 0); + sock_set_flag(sk, SOCK_LINGER); + mptcp_for_each_subflow(msk, subflow) { + ssk =3D mptcp_subflow_tcp_sock(subflow); + if (ssk) { + lock_sock_nested(ssk, SINGLE_DEPTH_NESTING); + WRITE_ONCE(ssk->sk_lingertime, 0); + sock_set_flag(ssk, SOCK_LINGER); + release_sock(ssk); + } + } + release_sock(sk); +} +EXPORT_SYMBOL(mptcp_sock_no_linger); + +static void 
__mptcp_sock_set_tos(struct sock *sk, int val) +{ + struct mptcp_subflow_context *subflow; + struct mptcp_sock *msk; + struct sock *ssk; + + if (sk->sk_protocol !=3D IPPROTO_MPTCP) + return; + + msk =3D mptcp_sk(sk); + + lock_sock(sk); + sockopt_seq_inc(msk); + __ip_sock_set_tos(sk, val); + mptcp_for_each_subflow(msk, subflow) { + ssk =3D mptcp_subflow_tcp_sock(subflow); + if (ssk) { + lock_sock_nested(ssk, SINGLE_DEPTH_NESTING); + __ip_sock_set_tos(ssk, val); + release_sock(ssk); + } + } + release_sock(sk); +} + +void mptcp_sock_set_tos(struct sock *sk) +{ + struct mptcp_sock *msk; + struct sock *ssk; + int val =3D 0; + + if (sk->sk_protocol !=3D IPPROTO_MPTCP) + return; + + msk =3D mptcp_sk(sk); + + lock_sock(sk); + ssk =3D msk->first; + if (ssk && ssk->sk_state =3D=3D TCP_ESTABLISHED) + val =3D inet_sk(ssk)->rcv_tos; + release_sock(sk); + + if (val > 0) + __mptcp_sock_set_tos(sk, val); +} +EXPORT_SYMBOL(mptcp_sock_set_tos); --=20 2.53.0 From nobody Mon May 25 18:05:07 2026 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id E170D3FCB10 for ; Mon, 18 May 2026 12:08:05 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1779106086; cv=none; b=h5C7xCDRJ9aA5kRHpv3D7mu5++0ejD7C1VI0t4wxHoYziMy0GSZR+1y9wvvrynTe7zfwgT3uyQO8VN9EoPJJjgt1ORMauKGUegObtxERPkmgl8BgP51ZRpoQjXHsYQ9m9VGYKIjQ5h6kryUHVCUp/cisI7Fg9KOZgw+RZ6GaR5o= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1779106086; c=relaxed/simple; bh=e0zoP1SW64E3C24gimXLk+EgYE7Qda6fe21XWG3ytwY=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; 
b=iJiyC3yeRa935NfWXf7vcC6Y1tUi4VhN2Nb6c3B87F6SM8zS7hTvwaf5dY4svXBrAoaUn4r5+Ukm+/PSt+gHiA9SMG9PoCtx1ypu35Ns7Z4B3q2cCm39SLk8Rry2SsTHAsT9Ls1lpBPTl+SJEbsTRlB1MbtuKdvBUlrnJMLpsj4= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=n8JFwdWa; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="n8JFwdWa" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 1C247C2BCB8; Mon, 18 May 2026 12:08:03 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1779106085; bh=e0zoP1SW64E3C24gimXLk+EgYE7Qda6fe21XWG3ytwY=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=n8JFwdWa9S85VtIT8g3NEUNzs2hCbIlXo12uKIkMUxfr02cgEQGZ5bzhKFOJ1kE3U 9sQX6SVS+Berq//48aL1gJ2JM3WpBE0Gmm6XxcxZ3a0hPiifDBvJa2fDBlj2/9nION 7xYJBTAOFCAuOjXO2IJEfDO8dPszoLZwWuo/k77l9sXu6xSt0/K5b/Cmek1HQR43UU LCrG2fQ+HVbhBiU/T63isbAE1+fF5anl8S1+OSjw3fTfNBppZwSoJxZQ2e7f77YfT/ FoiwvGRmI52apv14UpXdN6zDjGP55umWaasl6Hk9xy6U+gNzCseCJDhyOU8CWUE4op +eOPdWynw5yeg== From: Geliang Tang To: mptcp@lists.linux.dev Cc: Geliang Tang , zhenwei pi , Hui Zhu , Gang Yan Subject: [RFC mptcp-next v13 05/12] nvme-tcp: define host tcp_proto struct Date: Mon, 18 May 2026 20:07:30 +0800 Message-ID: <84c8d4e48c14d7570f79df2b67959d3a4e73900b.1779104752.git.tanggeliang@kylinos.cn> X-Mailer: git-send-email 2.53.0 In-Reply-To: References: Precedence: bulk X-Mailing-List: mptcp@lists.linux.dev List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" From: Geliang Tang To add MPTCP support in "NVMe over TCP", the host side needs to pass IPPROTO_MPTCP to sock_create_kern() instead of IPPROTO_TCP to create an MPTCP socket. 
Similar to the target-side nvmet_tcp_proto, this patch defines the host-side nvme_tcp_proto structure, which contains the protocol of the socket and a set of function pointers for socket operations. The only difference is that it defines .set_syncnt instead of .set_reuseaddr. A TCP-specific version of this structure is defined, and a proto field is added to nvme_tcp_ctrl. When the transport string is "tcp", it is assigned to ctrl->proto. All locations that previously called TCP setsockopt functions are updated to call the corresponding function pointers in the nvme_tcp_proto structure. The controller's proto pointer is set during initialization and remains valid throughout the controller's lifetime. Co-developed-by: zhenwei pi Signed-off-by: zhenwei pi Co-developed-by: Hui Zhu Signed-off-by: Hui Zhu Co-developed-by: Gang Yan Signed-off-by: Gang Yan Signed-off-by: Geliang Tang --- drivers/nvme/host/tcp.c | 44 ++++++++++++++++++++++++++++++++++------- 1 file changed, 37 insertions(+), 7 deletions(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 15d36d6a728e..13a5240623ef 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -182,6 +182,16 @@ struct nvme_tcp_queue { void (*write_space)(struct sock *); }; =20 +struct nvme_tcp_proto { + int protocol; + int (*set_syncnt)(struct sock *sk, int val); + void (*set_nodelay)(struct sock *sk); + void (*no_linger)(struct sock *sk); + void (*set_priority)(struct sock *sk, u32 priority); + void (*set_tos)(struct sock *sk, int val); + const struct nvme_ctrl_ops *ops; +}; + struct nvme_tcp_ctrl { /* read only in the hot path */ struct nvme_tcp_queue *queues; @@ -198,6 +208,8 @@ struct nvme_tcp_ctrl { struct delayed_work connect_work; struct nvme_tcp_request async_req; u32 io_queues[HCTX_MAX_TYPES]; + + const struct nvme_tcp_proto *proto; }; =20 static LIST_HEAD(nvme_tcp_ctrl_list); @@ -1799,7 +1811,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nct= rl, int qid, =20 ret =3D 
sock_create_kern(current->nsproxy->net_ns, ctrl->addr.ss_family, SOCK_STREAM, - IPPROTO_TCP, &queue->sock); + ctrl->proto->protocol, &queue->sock); if (ret) { dev_err(nctrl->device, "failed to create socket: %d\n", ret); @@ -1816,24 +1828,24 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *n= ctrl, int qid, nvme_tcp_reclassify_socket(queue->sock); =20 /* Single syn retry */ - tcp_sock_set_syncnt(queue->sock->sk, 1); + ctrl->proto->set_syncnt(queue->sock->sk, 1); =20 /* Set TCP no delay */ - tcp_sock_set_nodelay(queue->sock->sk); + ctrl->proto->set_nodelay(queue->sock->sk); =20 /* * Cleanup whatever is sitting in the TCP transmit queue on socket * close. This is done to prevent stale data from being sent should * the network connection be restored before TCP times out. */ - sock_no_linger(queue->sock->sk); + ctrl->proto->no_linger(queue->sock->sk); =20 if (so_priority > 0) - sock_set_priority(queue->sock->sk, so_priority); + ctrl->proto->set_priority(queue->sock->sk, so_priority); =20 /* Set socket type of service */ if (nctrl->opts->tos >=3D 0) - ip_sock_set_tos(queue->sock->sk, nctrl->opts->tos); + ctrl->proto->set_tos(queue->sock->sk, nctrl->opts->tos); =20 /* Set 10 seconds timeout for icresp recvmsg */ queue->sock->sk->sk_rcvtimeo =3D 10 * HZ; @@ -2900,6 +2912,17 @@ nvme_tcp_existing_controller(struct nvmf_ctrl_option= s *opts) return found; } =20 +static const struct nvme_tcp_proto nvme_tcp_proto =3D { + .protocol =3D IPPROTO_TCP, + .set_syncnt =3D tcp_sock_set_syncnt, + .set_nodelay =3D tcp_sock_set_nodelay, + .no_linger =3D sock_no_linger, + .set_priority =3D sock_set_priority, + .set_tos =3D ip_sock_set_tos, + .ops =3D &nvme_tcp_ctrl_ops, + +}; + static struct nvme_tcp_ctrl *nvme_tcp_alloc_ctrl(struct device *dev, struct nvmf_ctrl_options *opts) { @@ -2964,13 +2987,20 @@ static struct nvme_tcp_ctrl *nvme_tcp_alloc_ctrl(st= ruct device *dev, goto out_free_ctrl; } =20 + if (!strcmp(ctrl->ctrl.opts->transport, "tcp")) { + ctrl->proto =3D &nvme_tcp_proto; + 
} else { + ret =3D -EINVAL; + goto out_free_ctrl; + } + ctrl->queues =3D kzalloc_objs(*ctrl->queues, ctrl->ctrl.queue_count); if (!ctrl->queues) { ret =3D -ENOMEM; goto out_free_ctrl; } =20 - ret =3D nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_tcp_ctrl_ops, 0); + ret =3D nvme_init_ctrl(&ctrl->ctrl, dev, ctrl->proto->ops, 0); if (ret) goto out_kfree_queues; =20 --=20 2.53.0 From nobody Mon May 25 18:05:07 2026 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 561B63FBEA9 for ; Mon, 18 May 2026 12:08:08 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1779106088; cv=none; b=bpLGdfdcTjQ6ZtuCtpUVNE/yv5k6/ALHPm5zINhMRzlbx16dZkuOWpz2rPHZ6VHBYC+lzwUERe7AkIHlspWfjtYKh5CYBYKjHlWrJtIZTosSG+SkFwj6nlPYMyjID0c4Gu4yA+5ESwqOHm5Dw8u1TzuqIxu/+CB35MMzXp3f3bg= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1779106088; c=relaxed/simple; bh=ogWf/zrsOAL3UDKqkzALYnzC9lKAQzJkS2Xce2RI7N0=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=R1Mg6Ik6E6wIketSQ2t2oIpHf3LQbtTVAc3Rtv3AXdTsiGp56irQK3h+eoJEQ8V4h0zfYaxWEqAt+REfHQVNeXwmIoVbBMnxi0hXgKxpCbNbdxz34oL0Y4WvmlDQTUkdpkCI8K5LbGryIOYyUKYhuu+x2Ex5tkUo+sj62YNFZMQ= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=DfYHO763; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="DfYHO763" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 32552C2BCC6; Mon, 18 May 2026 12:08:06 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1779106087; 
bh=ogWf/zrsOAL3UDKqkzALYnzC9lKAQzJkS2Xce2RI7N0=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=DfYHO76370sLPPr7wURlpXeQHIKSwwOjRmD64/w3mca82haoR0trOiSU2WLmz7BkQ hkTg8hPAFYm/3U07ZCQquomaUwQQXmqct/yJcNfqVZLY9fpImvbIOWxz2EkOYBnEmE nFx1rduBpf5tAUORYee70CuMFbt1BXhmA7UOzkMgD2PRN31sGncFaybwuKtIzXQo4T 3adLUOrTdFXbteyoOY/KmoJ73HpLZs5IcAH/YzPOsm0Qj3KFWlhjF7z1MpMkkw7TOi yQthxOB+uTTb9bt+o/jj6ZjUZaj825E9dWXFXS1dQC2NlPBEYpN170fBna3Ec4YKu6 gEBnBAnAB5ccA== From: Geliang Tang To: mptcp@lists.linux.dev Cc: Geliang Tang , zhenwei pi , Hui Zhu , Gang Yan Subject: [RFC mptcp-next v13 06/12] nvme-tcp: register host mptcp transport Date: Mon, 18 May 2026 20:07:31 +0800 Message-ID: <344d157ea7c2086ba8f6bccd714016918e6180f8.1779104752.git.tanggeliang@kylinos.cn> X-Mailer: git-send-email 2.53.0 In-Reply-To: References: Precedence: bulk X-Mailing-List: mptcp@lists.linux.dev List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" From: Geliang Tang This patch defines a new nvmf_transport_ops named nvme_mptcp_transport, which is almost the same as nvme_tcp_transport except .name and .allowed_opts. MPTCP currently does not support TLS. The four TLS-related options (NVMF_OPT_TLS, NVMF_OPT_KEYRING, NVMF_OPT_TLS_KEY, and NVMF_OPT_CONCAT) have been removed from allowed_opts. They will be added back once MPTCP TLS is supported. It is registered in nvme_tcp_init_module() and unregistered in nvme_tcp_cleanup_module(). A MODULE_ALIAS("nvme-mptcp") declaration is added at the end of the file. v2: - use 'trtype' instead of '--mptcp' (Hannes) v3: - check mptcp protocol from opts->transport instead of passing a parameter (Hannes). v4: - check CONFIG_MPTCP. 
Co-developed-by: zhenwei pi Signed-off-by: zhenwei pi Co-developed-by: Hui Zhu Signed-off-by: Hui Zhu Co-developed-by: Gang Yan Signed-off-by: Gang Yan Signed-off-by: Geliang Tang --- drivers/nvme/host/tcp.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 13a5240623ef..305624d59c50 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -3067,6 +3067,20 @@ static struct nvmf_transport_ops nvme_tcp_transport = =3D { .create_ctrl =3D nvme_tcp_create_ctrl, }; =20 +#ifdef CONFIG_MPTCP +static struct nvmf_transport_ops nvme_mptcp_transport =3D { + .name =3D "mptcp", + .module =3D THIS_MODULE, + .required_opts =3D NVMF_OPT_TRADDR, + .allowed_opts =3D NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY | + NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO | + NVMF_OPT_HDR_DIGEST | NVMF_OPT_DATA_DIGEST | + NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES | + NVMF_OPT_TOS | NVMF_OPT_HOST_IFACE, + .create_ctrl =3D nvme_tcp_create_ctrl, +}; +#endif + static int __init nvme_tcp_init_module(void) { unsigned int wq_flags =3D WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_SYSFS; @@ -3092,6 +3106,9 @@ static int __init nvme_tcp_init_module(void) atomic_set(&nvme_tcp_cpu_queues[cpu], 0); =20 nvmf_register_transport(&nvme_tcp_transport); +#ifdef CONFIG_MPTCP + nvmf_register_transport(&nvme_mptcp_transport); +#endif return 0; } =20 @@ -3099,6 +3116,9 @@ static void __exit nvme_tcp_cleanup_module(void) { struct nvme_tcp_ctrl *ctrl; =20 +#ifdef CONFIG_MPTCP + nvmf_unregister_transport(&nvme_mptcp_transport); +#endif nvmf_unregister_transport(&nvme_tcp_transport); =20 mutex_lock(&nvme_tcp_ctrl_mutex); @@ -3116,3 +3136,6 @@ module_exit(nvme_tcp_cleanup_module); MODULE_DESCRIPTION("NVMe host TCP transport driver"); MODULE_LICENSE("GPL v2"); MODULE_ALIAS("nvme-tcp"); +#ifdef CONFIG_MPTCP +MODULE_ALIAS("nvme-mptcp"); +#endif --=20 2.53.0 From nobody Mon May 25 18:05:07 2026 Received: from smtp.kernel.org 
(aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id A8CCD3FBB5C for ; Mon, 18 May 2026 12:08:10 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1779106090; cv=none; b=DBHV8cShHWFgVLvlZOAcU4bdmVwpZ/JFqVnQRKfSifudwuOiXxLAiFXbF1+r72Jnc85diCaZEus9fYqZSSqIRqLpNH+2qbu8dCyuVNMuimSqScev/qvUNntcAej5rmE7Z1OwL/DcTGiqsjqihFYyWxsNdBMX0qgpZW75WN80ihM= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1779106090; c=relaxed/simple; bh=O3bXL2fBO+OMBP01NvgAQ/8Cx7GuF/QMGzG0z2GAG4A=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=J8/mH0fEggwJSjHTMWyn4T6GEBAaRXlNLJOCKqDS5jn1C25p3RKKEzR917pzF+LjVZL6dZCaAxww2K5dfmU5ocrFZ/RbLetGf5qAvy8tOSmVdf1Zo7okvkOy5X/4R94oKtkjujuuTwN8yJ426Jh/DAgfF3DKumdeeBj7FFf70Wg= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=AOcUcnkY; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="AOcUcnkY" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 4C609C2BCB8; Mon, 18 May 2026 12:08:08 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1779106090; bh=O3bXL2fBO+OMBP01NvgAQ/8Cx7GuF/QMGzG0z2GAG4A=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=AOcUcnkYvDLGPhRHXDASwZtH5frrBEZG+z3CQCLnkDKGHYFhPTBJaIiat5mYCmyyz g9meOfM0P+bFH3SdpMF+QsdwPIOrtrMTxINRFYMsue7Q/7Pnv2eWDexnS4OFDhvBFd hivhuTi4aM9IgBuRl1HHtwmVbXXe7h2DlEdTJ2TNpHmj+5UlFtqvIxjE4AhL0YKHFb WJgrob7hKYH36XqpcaH4WjpHOSfrNoxZZ04WnlxBNIXSXalsXo+OX9MurfLzf1iGRz 
OUkoNIiII075OZd9ZD86mKhclI9lBN9Ccmg+5w/WA77Pe55oduvSILGg0L52w9zn3G i98tgnod+dNVw== From: Geliang Tang To: mptcp@lists.linux.dev Cc: Geliang Tang , zhenwei pi , Hui Zhu , Gang Yan Subject: [RFC mptcp-next v13 07/12] nvme-tcp: implement host mptcp proto Date: Mon, 18 May 2026 20:07:32 +0800 Message-ID: <14959d6716913cca12866057661831550835f25c.1779104752.git.tanggeliang@kylinos.cn> X-Mailer: git-send-email 2.53.0 In-Reply-To: References: Precedence: bulk X-Mailing-List: mptcp@lists.linux.dev List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" From: Geliang Tang An MPTCP-specific version of struct nvme_tcp_proto is implemented, and it is assigned to ctrl->proto when the transport string is "mptcp". The socket option setting logic is similar to the target side, except that mptcp_sock_set_syncnt is newly defined for the host side. These helpers set the values on all existing subflows of an MPTCP connection, except for set_reuseaddr which only applies to the first subflow. The values are then synchronized to other newly created subflows in sync_socket_options(). A separate nvme_mptcp_ctrl_ops structure with .name =3D "mptcp" is defined and used for MPTCP controllers. 
Co-developed-by: zhenwei pi Signed-off-by: zhenwei pi Co-developed-by: Hui Zhu Signed-off-by: Hui Zhu Co-developed-by: Gang Yan Signed-off-by: Gang Yan Signed-off-by: Geliang Tang --- drivers/nvme/host/tcp.c | 34 ++++++++++++++++++++++++++++++++++ include/net/mptcp.h | 11 +++++++++++ net/mptcp/sockopt.c | 35 ++++++++++++++++++++++++++++++++++- 3 files changed, 79 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 305624d59c50..2388a8c443cc 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -2895,6 +2895,24 @@ static const struct nvme_ctrl_ops nvme_tcp_ctrl_ops = =3D { .get_virt_boundary =3D nvmf_get_virt_boundary, }; =20 +#ifdef CONFIG_MPTCP +static const struct nvme_ctrl_ops nvme_mptcp_ctrl_ops =3D { + .name =3D "mptcp", + .module =3D THIS_MODULE, + .flags =3D NVME_F_FABRICS | NVME_F_BLOCKING, + .reg_read32 =3D nvmf_reg_read32, + .reg_read64 =3D nvmf_reg_read64, + .reg_write32 =3D nvmf_reg_write32, + .subsystem_reset =3D nvmf_subsystem_reset, + .free_ctrl =3D nvme_tcp_free_ctrl, + .submit_async_event =3D nvme_tcp_submit_async_event, + .delete_ctrl =3D nvme_tcp_delete_ctrl, + .get_address =3D nvme_tcp_get_address, + .stop_ctrl =3D nvme_tcp_stop_ctrl, + .get_virt_boundary =3D nvmf_get_virt_boundary, +}; +#endif + static bool nvme_tcp_existing_controller(struct nvmf_ctrl_options *opts) { @@ -2923,6 +2941,18 @@ static const struct nvme_tcp_proto nvme_tcp_proto = =3D { =20 }; =20 +#ifdef CONFIG_MPTCP +static const struct nvme_tcp_proto nvme_mptcp_proto =3D { + .protocol =3D IPPROTO_MPTCP, + .set_syncnt =3D mptcp_sock_set_syncnt, + .set_nodelay =3D mptcp_sock_set_nodelay, + .no_linger =3D mptcp_sock_no_linger, + .set_priority =3D mptcp_sock_set_priority, + .set_tos =3D __mptcp_sock_set_tos, + .ops =3D &nvme_mptcp_ctrl_ops, +}; +#endif + static struct nvme_tcp_ctrl *nvme_tcp_alloc_ctrl(struct device *dev, struct nvmf_ctrl_options *opts) { @@ -2989,6 +3019,10 @@ static struct nvme_tcp_ctrl 
*nvme_tcp_alloc_ctrl(str= uct device *dev, =20 if (!strcmp(ctrl->ctrl.opts->transport, "tcp")) { ctrl->proto =3D &nvme_tcp_proto; +#ifdef CONFIG_MPTCP + } else if (!strcmp(ctrl->ctrl.opts->transport, "mptcp")) { + ctrl->proto =3D &nvme_mptcp_proto; +#endif } else { ret =3D -EINVAL; goto out_free_ctrl; diff --git a/include/net/mptcp.h b/include/net/mptcp.h index 8eacb9424b37..9d5f0bf49d31 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -246,7 +246,11 @@ void mptcp_sock_set_priority(struct sock *sk, u32 prio= rity); =20 void mptcp_sock_no_linger(struct sock *sk); =20 +void __mptcp_sock_set_tos(struct sock *sk, int val); + void mptcp_sock_set_tos(struct sock *sk); + +int mptcp_sock_set_syncnt(struct sock *sk, int val); #else =20 static inline void mptcp_init(void) @@ -342,7 +346,14 @@ static inline void mptcp_sock_set_priority(struct sock= *sk, u32 priority) { } =20 static inline void mptcp_sock_no_linger(struct sock *sk) { } =20 +static inline void __mptcp_sock_set_tos(struct sock *sk, int val) { } + static inline void mptcp_sock_set_tos(struct sock *sk) { } + +static inline int mptcp_sock_set_syncnt(struct sock *sk, int val) +{ + return 0; +} #endif /* CONFIG_MPTCP */ =20 #if IS_ENABLED(CONFIG_MPTCP_IPV6) diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index be245babef30..90bfb37930da 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -1598,6 +1598,8 @@ static void sync_socket_options(struct mptcp_sock *ms= k, struct sock *ssk) WRITE_ONCE(inet_sk(ssk)->local_port_range, READ_ONCE(inet_sk(sk)->local_p= ort_range)); =20 ssk->sk_reuse =3D sk->sk_reuse; + if (inet_csk(sk)->icsk_syn_retries > 0) + tcp_sock_set_syncnt(ssk, inet_csk(sk)->icsk_syn_retries); } =20 void mptcp_sockopt_sync_locked(struct mptcp_sock *msk, struct sock *ssk) @@ -1769,7 +1771,7 @@ void mptcp_sock_no_linger(struct sock *sk) } EXPORT_SYMBOL(mptcp_sock_no_linger); =20 -static void __mptcp_sock_set_tos(struct sock *sk, int val) +void __mptcp_sock_set_tos(struct sock *sk, 
int val) { struct mptcp_subflow_context *subflow; struct mptcp_sock *msk; @@ -1793,6 +1795,7 @@ static void __mptcp_sock_set_tos(struct sock *sk, int= val) } release_sock(sk); } +EXPORT_SYMBOL(__mptcp_sock_set_tos); =20 void mptcp_sock_set_tos(struct sock *sk) { @@ -1815,3 +1818,33 @@ void mptcp_sock_set_tos(struct sock *sk) __mptcp_sock_set_tos(sk, val); } EXPORT_SYMBOL(mptcp_sock_set_tos); + +int mptcp_sock_set_syncnt(struct sock *sk, int val) +{ + struct mptcp_subflow_context *subflow; + struct mptcp_sock *msk; + struct sock *ssk; + + if (val < 1 || val > MAX_TCP_SYNCNT) + return -EINVAL; + + if (sk->sk_protocol !=3D IPPROTO_MPTCP) + return -EINVAL; + + msk =3D mptcp_sk(sk); + + lock_sock(sk); + sockopt_seq_inc(msk); + inet_csk(sk)->icsk_syn_retries =3D val; + mptcp_for_each_subflow(msk, subflow) { + ssk =3D mptcp_subflow_tcp_sock(subflow); + if (ssk) { + lock_sock_nested(ssk, SINGLE_DEPTH_NESTING); + tcp_sock_set_syncnt(ssk, val); + release_sock(ssk); + } + } + release_sock(sk); + return 0; +} +EXPORT_SYMBOL(mptcp_sock_set_syncnt); --=20 2.53.0 From nobody Mon May 25 18:05:07 2026 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 9B5163F9295 for ; Mon, 18 May 2026 12:08:13 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1779106093; cv=none; b=T9KTsvP3VibvKmBOisPv1v9h7oOQi8USQEc+auuh4LAFVvy1oKHjSb55PjJQ9GtdcPmH0BOgkGjngWmpvqhnfWDe3vLDTz3dg1ANligy0XVQghYjl39yKyAy4b0x42CB5kJr55j1BFpRkNyQkS6quu6XOVJqGBkN/Nvj441dcao= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1779106093; c=relaxed/simple; bh=kYZc0lE68ooEGG7r9C+CG4A2SxnfPFP+m9QiFxS2Iwc=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: 
MIME-Version; b=uF0+tc6Kw/sQerBycTML4GqC0owRxRmncLCLN7bKoM/Uz6omJJOAagp6EKvIfRLeAI1PEvTVzDCW/jF78FpInoT357bHRruCmhaulEl9W6JB3DuL3+Oqv5cZnQaeuQR8vZStCEbswHK5OuvvkOqXsMA7PYDgsvNG+X5NGSek/c8= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=ZCpdNyRl; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="ZCpdNyRl" Received: by smtp.kernel.org (Postfix) with ESMTPSA id D48B8C2BCB8; Mon, 18 May 2026 12:08:10 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1779106093; bh=kYZc0lE68ooEGG7r9C+CG4A2SxnfPFP+m9QiFxS2Iwc=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=ZCpdNyRlaUt2ROvYzd/CVxYHuLE9eA8mxdDcoVuyfWfZVhxDMXCbrkAHql5yYS/gE pMVlygD5NSN/IEPAXEqiyBeLxberNEQUV5eykqSHRXXquFY5A76wlCBfZMubqRL2fL KRn3Az2lJKGrbxMkAuZLQTi/tQHMduTvI3MQLzuV8r6mH3cdnTHnwTkVz3VlGREPNA ezhIGadGl1PDby55gMDEnKrzTl0JjcCEztoL65vRxH0rKDZExsHQNVZOZxHDid5+xw JjlCrLK6U3W79Z/A/oN0yQMJMx6++Bn+X6xHyY39XE/4u7nRsBnkT2C9z36Fe42gIR 1gzA/vW2aI+wg== From: Geliang Tang To: mptcp@lists.linux.dev Cc: Geliang Tang , zhenwei pi , Hui Zhu , Gang Yan Subject: [RFC mptcp-next v13 08/12] selftests: mptcp: add nvme over mptcp test Date: Mon, 18 May 2026 20:07:33 +0800 Message-ID: <3a83f15d1b24d1db84f08e4969cf081435adb606.1779104752.git.tanggeliang@kylinos.cn> X-Mailer: git-send-email 2.53.0 In-Reply-To: References: Precedence: bulk X-Mailing-List: mptcp@lists.linux.dev List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" From: Geliang Tang A test case for NVMe over MPTCP has been implemented. It verifies the proper functionality of nvme discover and connect commands to establish NVMe over MPTCP connections. 
The test then evaluates read/write performance using fio, and ensures proper cleanup with nvme disconnect. This script accepts two positional parameters: trtype - Transport type (mptcp|tcp). Default: mptcp path - Number of multipath (1-4). Default: 1 This test simulates four NICs on both target and host sides, each limited to 125MB/s. It shows that 'NVMe over MPTCP' delivered bandwidth up to four times that of standard TCP with a single NVMe multipath configuration: # ./mptcp_nvme.sh tcp READ: bw=3D112MiB/s (118MB/s), 112MiB/s-112MiB/s (118MB/s-118MB/s), io=3D1123MiB (1177MB), run=3D10018-10018msec WRITE: bw=3D112MiB/s (117MB/s), 112MiB/s-112MiB/s (117MB/s-117MB/s), io=3D1118MiB (1173MB), run=3D10018-10018msec # ./mptcp_nvme.sh mptcp READ: bw=3D427MiB/s (448MB/s), 427MiB/s-427MiB/s (448MB/s-448MB/s), io=3D4286MiB (4494MB), run=3D10039-10039msec WRITE: bw=3D387MiB/s (406MB/s), 387MiB/s-387MiB/s (406MB/s-406MB/s), io=3D3885MiB (4073MB), run=3D10043-10043msec It reflects that MPTCP has the same multi-interface bandwidth aggregation capability as NVMe multipath. 
Co-developed-by: zhenwei pi Signed-off-by: zhenwei pi Co-developed-by: Hui Zhu Signed-off-by: Hui Zhu Co-developed-by: Gang Yan Signed-off-by: Gang Yan Signed-off-by: Geliang Tang --- tools/testing/selftests/net/mptcp/Makefile | 1 + tools/testing/selftests/net/mptcp/config | 8 + .../testing/selftests/net/mptcp/mptcp_lib.sh | 12 + .../testing/selftests/net/mptcp/mptcp_nvme.sh | 331 ++++++++++++++++++ 4 files changed, 352 insertions(+) create mode 100755 tools/testing/selftests/net/mptcp/mptcp_nvme.sh diff --git a/tools/testing/selftests/net/mptcp/Makefile b/tools/testing/sel= ftests/net/mptcp/Makefile index 22ba0da2adb8..7b308447a58b 100644 --- a/tools/testing/selftests/net/mptcp/Makefile +++ b/tools/testing/selftests/net/mptcp/Makefile @@ -13,6 +13,7 @@ TEST_PROGS :=3D \ mptcp_connect_sendfile.sh \ mptcp_connect_splice.sh \ mptcp_join.sh \ + mptcp_nvme.sh \ mptcp_sockopt.sh \ pm_netlink.sh \ simult_flows.sh \ diff --git a/tools/testing/selftests/net/mptcp/config b/tools/testing/selft= ests/net/mptcp/config index 59051ee2a986..e59cf7398f19 100644 --- a/tools/testing/selftests/net/mptcp/config +++ b/tools/testing/selftests/net/mptcp/config @@ -34,3 +34,11 @@ CONFIG_NFT_SOCKET=3Dm CONFIG_NFT_TPROXY=3Dm CONFIG_SYN_COOKIES=3Dy CONFIG_VETH=3Dy +CONFIG_BLK_DEV_LOOP=3Dy +CONFIG_CONFIGFS_FS=3Dy +CONFIG_NVME_CORE=3Dy +CONFIG_NVME_FABRICS=3Dy +CONFIG_NVME_TCP=3Dy +CONFIG_NVME_TARGET=3Dy +CONFIG_NVME_TARGET_TCP=3Dy +CONFIG_NVME_MULTIPATH=3Dy diff --git a/tools/testing/selftests/net/mptcp/mptcp_lib.sh b/tools/testing= /selftests/net/mptcp/mptcp_lib.sh index 5ef6033775c8..e08854ba42bd 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_lib.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_lib.sh @@ -530,6 +530,18 @@ mptcp_lib_check_tools() { exit ${KSFT_SKIP} fi ;; + "nvme") + if ! nvme --version &> /dev/null; then + mptcp_lib_pr_skip "nvme tool not found" + exit ${KSFT_SKIP} + fi + ;; + "fio") + if ! 
fio -h &> /dev/null; then + mptcp_lib_pr_skip "fio tool not found" + exit ${KSFT_SKIP} + fi + ;; *) mptcp_lib_pr_fail "Internal error: unsupported tool: ${tool}" exit ${KSFT_FAIL} diff --git a/tools/testing/selftests/net/mptcp/mptcp_nvme.sh b/tools/testin= g/selftests/net/mptcp/mptcp_nvme.sh new file mode 100755 index 000000000000..2bb20d30e397 --- /dev/null +++ b/tools/testing/selftests/net/mptcp/mptcp_nvme.sh @@ -0,0 +1,331 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(dirname "$0")/mptcp_lib.sh" + +ret=3D0 +trtype=3D"${1:-mptcp}" +path=3D"${2:-1}" +nqn=3D"nqn.2014-08.org.nvmexpress.${trtype}dev.$$.${RANDOM}" +ns=3D1 +port=3D$((RANDOM % 10000 + 20000)) +trsvcid=3D$((RANDOM % 64512 + 1024)) +ns1=3D"" +ns2=3D"" +temp_file=3D"" +loop_dev=3D"" + +export trtype path nqn ns port trsvcid +export loop_dev temp_file + +usage() +{ + cat << EOF + +Usage: + + $(basename "$0") [trtype] [path] + + trtype Transport type (tcp|mptcp) - default: mptcp + path Number of multipath (1-4) - default: 1 + +EOF +exit ${KSFT_FAIL} +} + +validate_params() +{ + if [[ ! "${trtype}" =3D~ ^(tcp|mptcp)$ ]]; then + echo "Invalid trtype ${trtype}. Must be tcp or mptcp" + usage + fi + + if [[ ! "${path}" =3D~ ^[1-4]$ ]]; then + echo "Invalid path count ${path}. 
Must be between 1 and 4" + usage + fi +} + +# This function is invoked indirectly +#shellcheck disable=3DSC2317,SC2329 +ns1_cleanup() +{ + pushd /sys/kernel/config/nvmet || exit 1 + + for i in $(seq 1 "${path}"); do + local portdir=3D$((port + i)) + + rm -rf "ports/${portdir}/subsystems/${nqn}" + rmdir "ports/${portdir}" + done + + echo 0 > "subsystems/${nqn}/namespaces/${ns}/enable" + echo -n 0 > "subsystems/${nqn}/namespaces/${ns}/device_path" + rmdir "subsystems/${nqn}/namespaces/${ns}" + rmdir "subsystems/${nqn}" + + popd || exit 1 +} + +# This function is invoked indirectly +#shellcheck disable=3DSC2317,SC2329 +ns2_cleanup() +{ + nvme disconnect -n "${nqn}" || true +} + +# This function is used in the cleanup trap +#shellcheck disable=3DSC2317,SC2329 +cleanup() +{ + if ! ip netns exec "$ns2" bash <<- EOF + $(declare -f ns2_cleanup) + ns2_cleanup + EOF + then + echo "ns2_cleanup failed" >&2 + fi + + sleep 1 + + if ! ip netns exec "$ns1" unshare -m bash <<- EOF + mount -t configfs none /sys/kernel/config + $(declare -f ns1_cleanup) + ns1_cleanup + EOF + then + echo "ns1_cleanup failed" >&2 + fi + + if [ -n "${loop_dev}" ] && [ -b "${loop_dev}" ]; then + losetup -d "${loop_dev}" 2>/dev/null || true + fi + rm -rf "${temp_file}" + + mptcp_lib_ns_exit "$ns1" "$ns2" + + unset -v trtype path nqn ns port trsvcid + unset -v loop_dev temp_file +} + +# $tc_args needs word splitting to pass multiple arguments to netem +# shellcheck disable=3DSC2086 +init() +{ + local tc_args=3D"rate 1000mbit" + + mptcp_lib_ns_init ns1 ns2 + + # ns1 ns2 + # 10.1.1.1 10.1.1.2 + # 10.1.2.1 10.1.2.2 + # 10.1.3.1 10.1.3.2 + # 10.1.4.1 10.1.4.2 + for i in {1..4}; do + ip link add ns1eth"$i" netns "$ns1" type veth peer \ + name ns2eth"$i" netns "$ns2" + ip -net "$ns1" addr add 10.1."$i".1/24 dev ns1eth"$i" + ip -net "$ns1" addr add dead:beef:"$i"::1/64 \ + dev ns1eth"$i" nodad + ip -net "$ns1" link set ns1eth"$i" up + ip -net "$ns2" addr add 10.1."$i".2/24 dev ns2eth"$i" + ip -net "$ns2" addr add 
dead:beef:"$i"::2/64 \ + dev ns2eth"$i" nodad + ip -net "$ns2" link set ns2eth"$i" up + ip -net "$ns2" route add default via 10.1."$i".1 \ + dev ns2eth"$i" metric 10"$i" + ip -net "$ns2" route add default via dead:beef:"$i"::1 \ + dev ns2eth"$i" metric 10"$i" + + # Add tc qdisc to both namespaces for bandwidth limiting + tc -n "$ns1" qdisc add dev ns1eth"$i" root netem $tc_args + tc -n "$ns2" qdisc add dev ns2eth"$i" root netem $tc_args + + tc -n "$ns1" qdisc show dev ns1eth"$i" + tc -n "$ns2" qdisc show dev ns2eth"$i" + done + + mptcp_lib_pm_nl_set_limits "${ns1}" 8 8 + + mptcp_lib_pm_nl_add_endpoint "$ns1" 10.1.1.1 flags signal + mptcp_lib_pm_nl_add_endpoint "$ns1" 10.1.2.1 flags signal + mptcp_lib_pm_nl_add_endpoint "$ns1" 10.1.3.1 flags signal + mptcp_lib_pm_nl_add_endpoint "$ns1" 10.1.4.1 flags signal + + mptcp_lib_pm_nl_set_limits "${ns2}" 8 8 + + mptcp_lib_pm_nl_add_endpoint "$ns2" 10.1.1.2 flags subflow + mptcp_lib_pm_nl_add_endpoint "$ns2" 10.1.2.2 flags subflow + mptcp_lib_pm_nl_add_endpoint "$ns2" 10.1.3.2 flags subflow + mptcp_lib_pm_nl_add_endpoint "$ns2" 10.1.4.2 flags subflow +} + +# This function is invoked indirectly +#shellcheck disable=3DSC2317,SC2329 +run_target() +{ + cd /sys/kernel/config/nvmet/subsystems || exit + mkdir -p "${nqn}" + cd "${nqn}" || exit + echo 1 > attr_allow_any_host + mkdir -p namespaces/"${ns}" + echo "${loop_dev}" > namespaces/"${ns}"/device_path + echo 1 > namespaces/"${ns}"/enable + + # Create 4 ports, each on a different IP address + for i in $(seq 1 "${path}"); do + local portdir=3D$((port + i)) + + cd /sys/kernel/config/nvmet/ports || exit + mkdir -p "${portdir}" + cd "${portdir}" || exit 1 + echo "${trtype}" > addr_trtype + echo ipv4 > addr_adrfam + if [ "${path}" -eq 1 ]; then + echo "0.0.0.0" > addr_traddr + else + echo "10.1.${i}.1" > addr_traddr + fi + echo "${trsvcid}" > addr_trsvcid + + mkdir -p subsystems + ln -sf "../../subsystems/${nqn}" "subsystems/${nqn}" + cd - >/dev/null || exit + done +} + +# This 
function is invoked indirectly +#shellcheck disable=3DSC2317,SC2329 +run_host() +{ + local traddr=3D10.1.1.1 + local devname + + echo "nvme discover -a ${traddr}" + if ! nvme discover -t "${trtype}" -a "${traddr}" \ + -s "${trsvcid}"; then + echo "Failed to discover ${traddr}" + return 1 + fi + + for i in $(seq 1 "${path}"); do + traddr=3D10.1.${i}.1 + echo "Connecting to ${traddr}:${trsvcid}" + if ! nvme connect -t "${trtype}" -a "${traddr}" \ + -s "${trsvcid}" -n "${nqn}"; then + echo "Failed to connect to ${traddr}" + return 1 + fi + done + + for i in $(seq 1 10); do + for dev in /dev/nvme*n1 /dev/nvme*cn1; do + if [ -b "$dev" ] 2>/dev/null; then + if nvme id-ctrl "$dev" 2>/dev/null | + grep -q "${nqn}"; then + devname=3D$(basename "$dev") + break 2 + fi + fi + done 2>/dev/null + [ -n "$devname" ] && break + sleep 1 + done + + if [ -z "$devname" ]; then + echo "No block device found for NQN ${nqn}" >&2 + return 1 + fi + + echo "nvme list" + if ! nvme list; then + echo "nvme list failed" >&2 + return 1 + fi + + sleep 1 + + echo "fio randread /dev/${devname}" + if ! fio --name=3Dglobal --direct=3D1 --norandommap --randrepeat=3D0 \ + --ioengine=3Dlibaio --thread=3D1 --blocksize=3D128k --runtime=3D10 \ + --time_based --rw=3Drandread --numjobs=3D4 --iodepth=3D256 \ + --group_reporting --size=3D100% \ + --name=3Dlibaio_4_256_128k_randread \ + --filename=3D"/dev/${devname}"; then + echo "fio randread failed" + return 1 + fi + + sleep 1 + + echo "fio randwrite /dev/${devname}" + if ! fio --name=3Dglobal --direct=3D1 --norandommap --randrepeat=3D0 \ + --ioengine=3Dlibaio --thread=3D1 --blocksize=3D128k --runtime=3D10 \ + --time_based --rw=3Drandwrite --numjobs=3D4 --iodepth=3D256 \ + --group_reporting --size=3D100% \ + --name=3Dlibaio_4_256_128k_randwrite \ + --filename=3D"/dev/${devname}"; then + echo "fio randwrite failed" + return 1 + fi + + nvme flush "/dev/${devname}" +} + +mptcp_lib_check_tools nvme fio +validate_params + +if ! 
temp_file=3D$(mktemp /tmp/nvme_test.XXXXXX.raw); then + echo "Failed to create temp file" + exit 1 +fi + +if ! dd if=3D/dev/zero of=3D"${temp_file}" bs=3D1M count=3D0 seek=3D512; t= hen + echo "Failed to create backing file" >&2 + rm -f "${temp_file}" + exit 1 +fi + +if ! loop_dev=3D$(losetup -f --show "${temp_file}"); then + echo "Failed to create loop device" >&2 + rm -f "${temp_file}" + exit 1 +fi + +trap cleanup EXIT +init + +run_test() +{ + if ! ip netns exec "$ns1" unshare -m bash <<- EOF + mount -t configfs none /sys/kernel/config + $(declare -f run_target) + run_target + exit \$? + EOF + then + ret=3D"${KSFT_FAIL}" + fi + + if ! ip netns exec "$ns2" bash <<- EOF + $(declare -f run_host) + run_host + exit \$? + EOF + then + ret=3D"${KSFT_FAIL}" + fi + + sleep 1 +} + +run_test "$@" + +if [ "${ret}" -eq 0 ]; then + mptcp_lib_result_pass "nvme over ${trtype} test" +else + mptcp_lib_result_fail "nvme over ${trtype} test" +fi + +mptcp_lib_result_print_all_tap +exit "$ret" --=20 2.53.0 From nobody Mon May 25 18:05:07 2026 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 9CD3F3FD12C for ; Mon, 18 May 2026 12:08:15 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1779106095; cv=none; b=OhAaOuAW3BXqpBkBJI0XqZbJaQH0URUGsMHmNsA35oATfRQgumsuoltvNxl75NEqcaKwkZt9P5FUw3e0tJCclrNanwfzX41HknbWJPs7rX/i17GyD7XELCed0zkx4frwbvQxTzcDWUSj3w2exwyvLdg/1rYxw8BVA2fpmFzfMdk= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1779106095; c=relaxed/simple; bh=vf6ULtFHWGoesNS/lFZx8DmSpYxPcl4yCQL6/3i2c58=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; 
b=kRb8GoFT3KwNecypoJWE6WZbPbOHuDJCizg9/2SMrVOXGTqmxBsO59RJWC5cVVCBdL1obvz5PP+5lG7cjaejbfpL3U+Mvoc4ZrNYzDJ6bkCe4EXiZK4ggDbzCsvQz5RAO98RbOn0KT0Duaq0qb4AVD8IFs94pW8UyuqEqUQ0WTo= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=koWotbX6; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="koWotbX6" Received: by smtp.kernel.org (Postfix) with ESMTPSA id B4C28C2BCB7; Mon, 18 May 2026 12:08:13 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1779106095; bh=vf6ULtFHWGoesNS/lFZx8DmSpYxPcl4yCQL6/3i2c58=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=koWotbX693mpP6Uys3rJ4ws97MHQqjobjDmuFu+m5fQUX0If1OQ4xyvT6ED4B134p E0SRVBvddKlu0bFJMAl4XMPTzs9+NkKCBlktxzvZ1EzwVU1G+rfp15vw1YpEmLd8Pz SkBWdZVNZa2n1EkhzPx+vvA5CwPq4lS8AYZ1wNmBNBlMibooLEPJ2j2OkiiArm0qQJ FIi0N25P4qiqXNw4X6YuIUfW9t0GmHsuYpMQibUANJB4fONu9ttJLvAA5mO+46Ug6J 2Y5TreyCGDnel/jg9E3z4Pwhsf66fE7iKL6F4puPHbC1+RB1XNsP3eI/XBYwyQ8Cnv jLrCbJUU7m3pw== From: Geliang Tang To: mptcp@lists.linux.dev Cc: Geliang Tang , zhenwei pi , Hui Zhu , Gang Yan Subject: [RFC mptcp-next v13 09/12] selftests: mptcp: nvme: add iopolicy tests Date: Mon, 18 May 2026 20:07:34 +0800 Message-ID: <8bf6076a51ac30e6f7d247dc91a1df8789cbb519.1779104752.git.tanggeliang@kylinos.cn> X-Mailer: git-send-email 2.53.0 In-Reply-To: References: Precedence: bulk X-Mailing-List: mptcp@lists.linux.dev List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" From: Geliang Tang Add NVMe iopolicy testing to mptcp_nvme.sh, with the default set to "numa". It can be set to "round-robin" or "queue-depth". 
Test results with 4 NVMe multipath paths and round-robin iopolicy show that TCP and MPTCP achieve similar bandwidth: # ./mptcp_nvme.sh tcp 4 round-robin READ: bw=3D455MiB/s (478MB/s), 455MiB/s-455MiB/s (478MB/s-478MB/s), io=3D4665MiB (4891MB), run=3D10242-10242msec WRITE: bw=3D455MiB/s (477MB/s), 455MiB/s-455MiB/s (477MB/s-477MB/s), io=3D4633MiB (4858MB), run=3D10184-10184msec # ./mptcp_nvme.sh mptcp 4 round-robin READ: bw=3D445MiB/s (466MB/s), 445MiB/s-445MiB/s (466MB/s-466MB/s), io=3D4575MiB (4797MB), run=3D10287-10287msec WRITE: bw=3D445MiB/s (467MB/s), 445MiB/s-445MiB/s (467MB/s-467MB/s), io=3D4572MiB (4794MB), run=3D10267-10267msec A "loss" argument is added to simulate network packet loss. When loss=3D1, each veth interface is configured with "delay 5ms loss 0.5%" using tc qdisc. Under this scenario, TCP throughput is roughly three to four times lower than MPTCP throughput: # ./mptcp_nvme.sh tcp 4 round-robin 1 READ: bw=3D144MiB/s (151MB/s), 144MiB/s-144MiB/s (151MB/s-151MB/s), io=3D1909MiB (2001MB), run=3D13231-13231msec WRITE: bw=3D100.0MiB/s (105MB/s), 100.0MiB/s-100.0MiB/s (105MB/s-105MB/s), io=3D1397MiB (1465MB), run=3D13980-13980msec # ./mptcp_nvme.sh mptcp 4 round-robin 1 READ: bw=3D428MiB/s (449MB/s), 428MiB/s-428MiB/s (449MB/s-449MB/s), io=3D4524MiB (4743MB), run=3D10564-10564msec WRITE: bw=3D431MiB/s (452MB/s), 431MiB/s-431MiB/s (452MB/s-452MB/s), io=3D4513MiB (4732MB), run=3D10481-10481msec These results demonstrate that MPTCP is more resilient to packet loss than TCP, as it can leverage multiple subflows to mitigate network degradation.
Co-developed-by: zhenwei pi Signed-off-by: zhenwei pi Co-developed-by: Hui Zhu Signed-off-by: Hui Zhu Co-developed-by: Gang Yan Signed-off-by: Gang Yan Signed-off-by: Geliang Tang --- .../testing/selftests/net/mptcp/mptcp_nvme.sh | 70 ++++++++++++++++++- 1 file changed, 69 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_nvme.sh b/tools/testin= g/selftests/net/mptcp/mptcp_nvme.sh index 2bb20d30e397..b5b4d732140f 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_nvme.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_nvme.sh @@ -6,6 +6,8 @@ ret=3D0 trtype=3D"${1:-mptcp}" path=3D"${2:-1}" +iopolicy=3D${3:-"numa"} # round-robin, queue-depth +loss=3D${4:-0} nqn=3D"nqn.2014-08.org.nvmexpress.${trtype}dev.$$.${RANDOM}" ns=3D1 port=3D$((RANDOM % 10000 + 20000)) @@ -17,6 +19,7 @@ loop_dev=3D"" =20 export trtype path nqn ns port trsvcid export loop_dev temp_file +export iopolicy loss =20 usage() { @@ -24,10 +27,12 @@ usage() =20 Usage: =20 - $(basename "$0") [trtype] [path] + $(basename "$0") [trtype] [path] [iopolicy] [loss] =20 trtype Transport type (tcp|mptcp) - default: mptcp path Number of multipath (1-4) - default: 1 + iopolicy I/O policy (numa|round-robin|queue-depth) - default: numa + loss Enable packet loss (0|1) - default: 0 =20 EOF exit ${KSFT_FAIL} @@ -44,6 +49,16 @@ validate_params() echo "Invalid path count ${path}. Must be between 1 and 4" usage fi + + if [[ ! "${iopolicy}" =3D~ ^(numa|round-robin|queue-depth)$ ]]; then + echo "Invalid iopolicy ${iopolicy}." + usage + fi + + if [[ ! "${loss}" =3D~ ^[01]$ ]]; then + echo "Invalid loss value ${loss}. 
Must be 0 or 1" + usage + fi } =20 # This function is invoked indirectly @@ -106,6 +121,7 @@ cleanup() =20 unset -v trtype path nqn ns port trsvcid unset -v loop_dev temp_file + unset -v iopolicy loss } =20 # $tc_args needs word splitting to pass multiple arguments to netem @@ -114,6 +130,10 @@ init() { local tc_args=3D"rate 1000mbit" =20 + if [ "${loss}" -eq 1 ]; then + tc_args+=3D" delay 5ms loss 0.5%" + fi + mptcp_lib_ns_init ns1 ns2 =20 # ns1 ns2 @@ -194,6 +214,48 @@ run_target() done } =20 +# This function is invoked indirectly +#shellcheck disable=3DSC2317,SC2329 +set_io_policy() +{ + local nqn=3D"$1" + local iopolicy=3D"$2" + local subname + local policy + local current + + subname=3D$(nvme list-subsys 2>/dev/null | grep "${nqn}" | + grep -o 'nvme-subsys[0-9]*' | head -1) + if [ -z "$subname" ]; then + return 1 + fi + + policy=3D"/sys/class/nvme-subsystem/${subname}/iopolicy" + if [ ! -e "$policy" ]; then + # NVMe multipath not supported, skip iopolicy setting + return 0 + fi + + if [ ! -w "$policy" ]; then + return 1 + fi + + if ! echo "${iopolicy}" > "$policy" 2>/dev/null; then + return 1 + fi + + current=3D$(cat "$policy" 2>/dev/null) + if [ -z "$current" ]; then + return 1 + fi + + if [[ "$current" !=3D *"${iopolicy}"* ]]; then + return 1 + fi + + return 0 +} + # This function is invoked indirectly #shellcheck disable=3DSC2317,SC2329 run_host() @@ -243,6 +305,11 @@ run_host() return 1 fi =20 + if ! set_io_policy "${nqn}" "${iopolicy}"; then + echo "Failed to set I/O policy to ${iopolicy}" + return 1 + fi + sleep 1 =20 echo "fio randread /dev/${devname}" @@ -308,6 +375,7 @@ run_test() fi =20 if ! ip netns exec "$ns2" bash <<- EOF + $(declare -f set_io_policy) $(declare -f run_host) run_host exit \$? 
--=20 2.53.0 From nobody Mon May 25 18:05:07 2026 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id F17183FBB5C for ; Mon, 18 May 2026 12:08:19 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1779106100; cv=none; b=eTxz3XyxVQHaNDM0GewhdonogY6ByEOhQrbS3eonDyjsdOWF1bL5/TBHIX5XMnyNNx2Csl4ZpDuDiuO9m0ZjshUEHHX711YbNUpn7VjlFYJF8ae7eaOdlTA2tlB6T/7TOsey3J6PT2OFmg/mkVeI2BVt/mQK09t0pfDOX57L/hk= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1779106100; c=relaxed/simple; bh=l020gXrviNHIa4yYJ94R6HMYpNkDvMQMaq4DE7I5voc=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=chYHtYZgVmbYMQ7+6uNlSQ0fOnT960olnxqmh6bhynkOErEE8Pxnj1WGV8FhgUNJVm9gPlmGUdX43G9UjvfB0W8KSf0cW7NcX/Zeeje5TBWTQPFF273eWWch8K5IyGpISFscQspQdEgyY7NuOd6Jbg7ag499A3VSuy7mCOxmSKc= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=pqqFBKB/; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="pqqFBKB/" Received: by smtp.kernel.org (Postfix) with ESMTPSA id F1C46C2BCC6; Mon, 18 May 2026 12:08:15 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1779106098; bh=l020gXrviNHIa4yYJ94R6HMYpNkDvMQMaq4DE7I5voc=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=pqqFBKB/wBoZXugFu1MJF9iCAZTHmEVAW/SgvaLl4GdTsk/mDbIU8Is3R2ww3ySxL dnzYsl8LRBW66qLMCb4iyzuiwkjiwfnGpzr1/IG+aqUNodX8js11pfli+wSiVDx7pU K+DCx7fzMCYb9AwMBsPSpXlBQGggYTVvipUosXr4HGE0AfC6JNHK03JGBkH3B3Iybq 
80bxk6W6zVLjb4M27y//mUdCCcxj3Etysh26TYMSrMnsaPbYMGNRjq6JDGf6ZBoX+N VWi/4QBoiMsULnYMJsrtYDy9oXTFLYm6GzbdaM0FS+aGxgFAUVzPYULMvLdVKaUE8L GI2fwjbAuih2g== From: Geliang Tang To: mptcp@lists.linux.dev Cc: Geliang Tang , zhenwei pi , Hui Zhu , Gang Yan Subject: [RFC mptcp-next v13 10/12] nvmet-tcp: check return value of set_queue_sock Date: Mon, 18 May 2026 20:07:35 +0800 Message-ID: <4033b3407043cae7ba46d2ecf346329dce88b7f3.1779104752.git.tanggeliang@kylinos.cn> X-Mailer: git-send-email 2.53.0 In-Reply-To: References: Precedence: bulk X-Mailing-List: mptcp@lists.linux.dev List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" From: Geliang Tang The return value of nvmet_tcp_set_queue_sock() is currently ignored in nvmet_tcp_tls_handshake_done(). If it fails (e.g., due to concurrent port removal), the socket callbacks will not be properly set, leading to queue and socket leakage. Fix this by capturing the return value and calling nvmet_tcp_schedule_release_queue() on failure to ensure proper cleanup. 
Cc: zhenwei pi Cc: Hui Zhu Cc: Gang Yan Fixes: 675b453e0241 ("nvmet-tcp: enable TLS handshake upcall") Signed-off-by: Geliang Tang --- drivers/nvme/target/tcp.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 73afaf3562a6..ed1a7f1d5958 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -1868,10 +1868,11 @@ static void nvmet_tcp_tls_handshake_done(void *data= , int status, if (!status) status =3D nvmet_tcp_tls_key_lookup(queue, peerid); =20 + if (!status) + status =3D nvmet_tcp_set_queue_sock(queue); + if (status) nvmet_tcp_schedule_release_queue(queue); - else - nvmet_tcp_set_queue_sock(queue); kref_put(&queue->kref, nvmet_tcp_release_queue); } =20 --=20 2.53.0 From nobody Mon May 25 18:05:07 2026 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 083913F44FD for ; Mon, 18 May 2026 12:08:20 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1779106101; cv=none; b=WDx/snzNS/pyrC0yRj39m1LgTWlBcJfDMRHy91+Qeo6RzBBKAx6vf+B2EYNa/QyB1YNSiboBDBlDFhWX3WrDz6sjnHnliIMJtQ6APhK0txu0QbJcrO8aPOcR2Wx9MtU7ZswccPW6OM41fhssyKLYCTXlm0Sk8IwsPvlv9kaporA= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1779106101; c=relaxed/simple; bh=AccoqBj22VePlwp41PqIhXy5UW8PEYoC+3bNp2ltdew=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=u16CycF1VzgHGBsH9N7xKjBtP59LBpwMGl4Mo1iuEASrvt3EkP00nn7KJ1yuyGKpIyp9la1bcTRV0rnFydxbhlmhv8JYW4Dw2Jm+J/dFYJMlsClfHkUeGJ5sAwxWXkKFjnXrkD9CQIwKZBhCKzoAsSUSPSUmumnOGHkRWDOr9+8= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org 
header.i=@kernel.org header.b=CxN9DOJi; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="CxN9DOJi" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 9F2B3C2BCB7; Mon, 18 May 2026 12:08:18 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1779106100; bh=AccoqBj22VePlwp41PqIhXy5UW8PEYoC+3bNp2ltdew=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=CxN9DOJiW17aJArv6VpMFGw1dumvc2Z/BVOoE790iitceNOFRboh3Ib/c2junNg0Z W+s2e7+qIyVjrwf6be2V8irfOjsAwhYzcNpNz4yDuJkQhmoWjIvFhUmyOU8gzn5w5U xdZ5L5SYUDqKjuUEPWpayIUqv17GsacdyeawA6SpTieK6tc8VMORtYSU1gqAIrg53s nkp46eQTTPgcHovdEzbpdKFFDXT3fOXfN7UZ2IuHy8urYUNC8HNx3uwf1f83A0/TB8 bntij0ePWbL8EWaLkixDk5svn6/g9WUo55jn1t28IKPcc4OiOTM+7O2fV5RjdATCiQ UpEtyTa4jjaxQ== From: Geliang Tang To: mptcp@lists.linux.dev Cc: Geliang Tang , zhenwei pi , Hui Zhu , Gang Yan Subject: [RFC mptcp-next v13 11/12] nvme-tcp: add RCU protection for host_iface validation Date: Mon, 18 May 2026 20:07:36 +0800 Message-ID: <3dd32a6ac7d7bb8629408255f28014468cbffdea.1779104752.git.tanggeliang@kylinos.cn> X-Mailer: git-send-email 2.53.0 In-Reply-To: References: Precedence: bulk X-Mailing-List: mptcp@lists.linux.dev List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" From: Geliang Tang When the host_iface option is specified, nvme_tcp_alloc_ctrl() calls __dev_get_by_name() to validate the network interface exists. However, __dev_get_by_name() requires RCU read lock protection as it traverses the RCU-protected device hash table. Without RCU read lock, the thread can be preempted during traversal. If a concurrent device unregistration occurs, the hash node can be freed via kfree_rcu() and the RCU grace period could end before the thread resumes, leading to a use-after-free when the freed memory is dereferenced. 
Fix this by using dev_get_by_name_rcu() instead of __dev_get_by_name() and wrapping the lookup with rcu_read_lock()/rcu_read_unlock(). Cc: zhenwei pi Cc: Hui Zhu Cc: Gang Yan Fixes: 3ede8f72a9a2 ("nvme-tcp: allow selecting the network interface for c= onnections") Signed-off-by: Geliang Tang --- drivers/nvme/host/tcp.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 2388a8c443cc..eafe750a8be8 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -3004,12 +3005,15 @@ static struct nvme_tcp_ctrl *nvme_tcp_alloc_ctrl(st= ruct device *dev, } =20 if (opts->mask & NVMF_OPT_HOST_IFACE) { - if (!__dev_get_by_name(&init_net, opts->host_iface)) { + rcu_read_lock(); + if (!dev_get_by_name_rcu(&init_net, opts->host_iface)) { + rcu_read_unlock(); pr_err("invalid interface passed: %s\n", opts->host_iface); ret =3D -ENODEV; goto out_free_ctrl; } + rcu_read_unlock(); } =20 if (!opts->duplicate_connect && nvme_tcp_existing_controller(opts)) { --=20 2.53.0 From nobody Mon May 25 18:05:07 2026 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 3F2013FD14B for ; Mon, 18 May 2026 12:08:24 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1779106104; cv=none; b=RtFcruvedGJqcEsgzBaeA8HlhfV34GpcTHtGtbit2yopLP5dSbBpQb3YJckQ+XuCbp0o03GAz6G2IdLpQWRHVLU2JpQc0Kr6rz0GQa5NiQUQlwQubrjJqHM18p9klNHpBhz3wupSaEJJyPAmD4D4Fe5tWVO+96q0Qhi3R6CGCys= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1779106104; c=relaxed/simple; bh=MCzYtqCuQY9Gbb37KYrhQLnVl9NjufkMifYjSb5DHIQ=; 
h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=jei2N6i1YtTu0NOTscfzHPdF6JE3OU83rN1s6xWaFrFeWysKquIwLAiRYuCue/cifBqxM1TK38ScBP/JqIXKoLDx6zB2XaRaU4Gf0xSya4smrnBo59ZvgWNPfJFO9ReLtLl9COm1oX9ZL14Cy95GicGaXMb3LGgbAHRzfjqTa6Y= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=DAsHM3vn; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="DAsHM3vn" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 344D4C2BCB7; Mon, 18 May 2026 12:08:20 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1779106103; bh=MCzYtqCuQY9Gbb37KYrhQLnVl9NjufkMifYjSb5DHIQ=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=DAsHM3vnpZ8iE8KCgjFCsIXK4zN0YhvDqSylbCzr+B84Bvd6eALZvi+hT8VIygjnG U4kITJDR49jWeZFi/QQ6AaQ8CM2I35x+ip6hb81di2/8tfuAdhuUDlx4xzCbuttbaM OIPC3s1NYhIwh8POkTauYWfcSUUEMceHAG+y89aCohJBdwpU7br3k9QMp7kYIfsllh xptSY/hy6Ck7aUzd3hHID8PQgwQ1sB7CrK343Mc+7K99FEvHhltk4wdFaYQXazHq3R R5GKF8RlrZDlmihCuHA6uflfdNivVspP1n2bkm1HLxpmzkoKqrb2jRC7GQNU0iddjm nAMJw0hFyCBcg== From: Geliang Tang To: mptcp@lists.linux.dev Cc: Geliang Tang , zhenwei pi , Hui Zhu , Gang Yan Subject: [RFC mptcp-next v13 12/12] nvme-tcp: wait for synchronous controller deletion on module unload Date: Mon, 18 May 2026 20:07:37 +0800 Message-ID: <7ac2ed162f9a87c419af3ab042d6a6e65fee5e0b.1779104752.git.tanggeliang@kylinos.cn> X-Mailer: git-send-email 2.53.0 In-Reply-To: References: Precedence: bulk X-Mailing-List: mptcp@lists.linux.dev List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" From: Geliang Tang Module unload can race with synchronous controller deletion via sysfs, leading to use-after-free when the sysfs thread calls into unloaded module code. 
Fix by using nvme_delete_ctrl_sync() to wait for each controller deletion to complete before unloading the module. Cc: zhenwei pi Cc: Hui Zhu Cc: Gang Yan Signed-off-by: Geliang Tang --- drivers/nvme/host/tcp.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index eafe750a8be8..97678196204a 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -3152,7 +3152,8 @@ static int __init nvme_tcp_init_module(void) =20 static void __exit nvme_tcp_cleanup_module(void) { - struct nvme_tcp_ctrl *ctrl; + struct nvme_tcp_ctrl *ctrl, *tmp; + LIST_HEAD(ctrl_list); =20 #ifdef CONFIG_MPTCP nvmf_unregister_transport(&nvme_mptcp_transport); @@ -3160,10 +3161,16 @@ static void __exit nvme_tcp_cleanup_module(void) nvmf_unregister_transport(&nvme_tcp_transport); =20 mutex_lock(&nvme_tcp_ctrl_mutex); - list_for_each_entry(ctrl, &nvme_tcp_ctrl_list, list) - nvme_delete_ctrl(&ctrl->ctrl); + list_for_each_entry_safe(ctrl, tmp, &nvme_tcp_ctrl_list, list) { + nvme_get_ctrl(&ctrl->ctrl); + list_move(&ctrl->list, &ctrl_list); + } mutex_unlock(&nvme_tcp_ctrl_mutex); - flush_workqueue(nvme_delete_wq); + + list_for_each_entry_safe(ctrl, tmp, &ctrl_list, list) { + nvme_delete_ctrl_sync(&ctrl->ctrl); + nvme_put_ctrl(&ctrl->ctrl); + } =20 destroy_workqueue(nvme_tcp_wq); } --=20 2.53.0