From nobody Mon May 25 05:12:14 2026 Received: from mx0a-00082601.pphosted.com (mx0a-00082601.pphosted.com [67.231.145.42]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id AEED43F9F30; Mon, 18 May 2026 15:36:23 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=67.231.145.42 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1779118586; cv=none; b=VYph8eeQ2JbKjglzlcJ9Vxd+TR8Zpa7mtVVZ4AjSzF9ruP0J6MFX5bLitndAF2yD9DUS3cP/Wz69IFprFFQ370dlc/qWuSGnDW+EKq+QEyxQLtWxgO4nI64/1gjOWnSEMMiXrCLkN0YYmXZPGP4Si0r6RJm56gaF6V9l52BdhjQ= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1779118586; c=relaxed/simple; bh=rx5ZGSwADz9+97vNY1/vB/EeG1yWsBC9hvcrCStUViY=; h=From:To:CC:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version:Content-Type; b=Z2dVAyMGzrUoN1AbvY7eXnTI/zIUlvo2oToznjHOJpbTOxhQ57Bcccx8amu9uP2P0dkbWZp0UWKqafGqHPPHTJ1FUJrBJ201v7/usee8qr2nWJACVvEACMFqkSWvhEzsyOu2TEBwGhMDZk1GE2TlWE0aehfT9t/Ra3LrWBc2094= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=reject dis=none) header.from=meta.com; spf=pass smtp.mailfrom=meta.com; dkim=pass (2048-bit key) header.d=meta.com header.i=@meta.com header.b=rzKyhOwo; arc=none smtp.client-ip=67.231.145.42 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=reject dis=none) header.from=meta.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=meta.com Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=meta.com header.i=@meta.com header.b="rzKyhOwo" Received: from pps.filterd (m0044010.ppops.net [127.0.0.1]) by mx0a-00082601.pphosted.com (8.18.1.11/8.18.1.11) with ESMTP id 64HMOPmM3955565; Mon, 18 May 2026 08:36:16 -0700 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=meta.com; h=cc :content-transfer-encoding:content-type:date:from:in-reply-to 
:message-id:mime-version:references:subject:to; s=s2048-2025-q2; bh=hwqumWOzgPDProtcQ8OjlXWX4FPuHrxZC2c7JY1jCf4=; b=rzKyhOwoxstT E4UDitAETRES32TDAWirHt9EmQkhIs58QxOJ7rjzkDzbTDsIR5GNIYZ3DeQbc07i XTE8QLPVBdvKGHcJY6YtRwQoPYcypibPe+X5fLas0N6Af5yuToJKkzh4vWy9CDzP Dd6P8+cL234bZb7viIyiAXtI5RYjIpwMusAIpmtBrdxn/3jZreNBzHRsrU0HFM8T n98kOCclCNs3RWKzJvrkEmqyZEsykgmkTnELlp1RpA4JtyyPe6k//LQ1FMlX0uef ENtcEAKcMBA2P4MymFhApH66AcwDQIuDxAEVd1neCCfZFYeC3FxP1xW6ZthOANyh XKyVng/8lw== Received: from maileast.thefacebook.com ([163.114.135.16]) by mx0a-00082601.pphosted.com (PPS) with ESMTPS id 4e6kw122hp-2 (version=TLSv1.2 cipher=ECDHE-RSA-AES128-GCM-SHA256 bits=128 verify=NOT); Mon, 18 May 2026 08:36:15 -0700 (PDT) Received: from localhost (2620:10d:c0a8:1b::2d) by mail.thefacebook.com (2620:10d:c0a9:6f::8fd4) with Microsoft SMTP Server (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256) id 15.2.2562.37; Mon, 18 May 2026 15:36:13 +0000 From: =?UTF-8?q?Cl=C3=A9ment=20L=C3=A9ger?= To: , Pavel Begunkov , "Jens Axboe" CC: , , , , "David S. 
Miller" , Eric Dumazet , "Jakub Kicinski" , Paolo Abeni , Simon Horman , Jonathan Corbet , Shuah Khan , Vishwanath Seshagiri , "Vishwanath Seshagiri" Subject: [PATCH v2 1/6] io_uring/zcrx: add ctx pointer to zcrx Date: Mon, 18 May 2026 08:35:24 -0700 Message-ID: <20260518153532.2835502-2-cleger@meta.com> X-Mailer: git-send-email 2.52.0 In-Reply-To: <20260518153532.2835502-1-cleger@meta.com> References: <20260518153532.2835502-1-cleger@meta.com> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Proofpoint-ORIG-GUID: HhJnrQDkYkAQuQY_ijdKSS3pd1qQMUIB X-Authority-Analysis: v=2.4 cv=P/MKQCAu c=1 sm=1 tr=0 ts=6a0b31ef cx=c_pps a=MfjaFnPeirRr97d5FC5oHw==:117 a=MfjaFnPeirRr97d5FC5oHw==:17 a=NGcC8JguVDcA:10 a=M51BFTxLslgA:10 a=VkNPw1HP01LnGYTKEx00:22 a=7x6HtfJdh03M6CCDgxCd:22 a=8elwO82fXORLTBIkMd32:22 a=pGLkceISAAAA:8 a=VabnemYjAAAA:8 a=Df7pRbWtbhPDAoH00hYA:9 a=gKebqoRLp9LExxC7YDUY:22 X-Proofpoint-Spam-Details-Enc: AW1haW4tMjYwNTE4MDE1MyBTYWx0ZWRfX+IuMgyKPH/Ge 7JsSAPMW1ojtKw/twnxLXkjcyRPeJ5sq+fPXDisRd89POq3mEZXqmietxbCaaBpNReXFk7zs8af ww3rBRwt8nBYMJawth4zn+ZLhmWRSpD6S88wErRlqsvhQOjeUyR0Yj8mvrvXLpHr12XoHQxYBNY A0nOrGzaS+7SSQGLtOkdlMOZFpj8zt0VDFlORkRVl72BIbeYnCsHV3y2Dgicp2WvHfw34ULrxBT JnYD0Z/zsaaw3MD0D39+HOnRKBe8Er+FxH+RgqNVuuvQqvUaBvoZED1+R0UocmT8sKGlRZNd64A tEnzZHbC3v7L5x66RlSGIZe+bRthluTpjMfR0xZBMGe9xxWJTRqrPjgZx5FaWAldh23blnFbudJ al5L6D6nmpN4mulaLNvl7E6BUzt9pl6Y3Xz8LwfKiyLSNJ+Zv1nlHpGcaWgbyfyj2Bd44QQfd6s O4YjK10SlhzyzZVtr1w== X-Proofpoint-GUID: HhJnrQDkYkAQuQY_ijdKSS3pd1qQMUIB X-Proofpoint-Virus-Version: vendor=baseguard engine=ICAP:2.0.293,Aquarius:18.0.1143,Hydra:6.1.51,FMLib:17.12.100.49 definitions=2026-05-18_03,2026-05-18_01,2025-10-01_01 Content-Type: text/plain; charset="utf-8" From: Pavel Begunkov zcrx will need to have a pointer to an owning ctx to communicate different events. 
Reference the ctx while it's attached to zcrx, and rely on zcrx termination to drop the ctx to avoid circular ref deps. Co-developed-by: Vishwanath Seshagiri Signed-off-by: Pavel Begunkov --- io_uring/zcrx.c | 39 +++++++++++++++++++++++++++++++-------- io_uring/zcrx.h | 3 +++ 2 files changed, 34 insertions(+), 8 deletions(-) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 3f9632e7790a..34faf90423f4 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -44,6 +44,17 @@ static inline struct io_zcrx_area *io_zcrx_iov_to_area(c= onst struct net_iov *nio return container_of(owner, struct io_zcrx_area, nia); } =20 +static bool zcrx_set_ring_ctx(struct io_zcrx_ifq *zcrx, + struct io_ring_ctx *ctx) +{ + guard(spinlock_bh)(&zcrx->ctx_lock); + if (zcrx->master_ctx) + return false; + percpu_ref_get(&ctx->refs); + zcrx->master_ctx =3D ctx; + return true; +} + static inline struct page *io_zcrx_iov_page(const struct net_iov *niov) { struct io_zcrx_area *area =3D io_zcrx_iov_to_area(niov); @@ -531,6 +542,7 @@ static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct io_= ring_ctx *ctx) return NULL; =20 ifq->if_rxq =3D -1; + spin_lock_init(&ifq->ctx_lock); spin_lock_init(&ifq->rq.lock); mutex_init(&ifq->pp_lock); refcount_set(&ifq->refs, 1); @@ -580,6 +592,8 @@ static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq) return; if (WARN_ON_ONCE(ifq->netdev !=3D NULL)) return; + if (WARN_ON_ONCE(ifq->master_ctx)) + return; =20 if (ifq->area) io_zcrx_free_area(ifq, ifq->area); @@ -656,17 +670,24 @@ static void io_zcrx_scrub(struct io_zcrx_ifq *ifq) } } =20 -static void zcrx_unregister_user(struct io_zcrx_ifq *ifq) +static void zcrx_unregister_user(struct io_zcrx_ifq *ifq, struct io_ring_c= tx *ctx) { + scoped_guard(spinlock_bh, &ifq->ctx_lock) { + if (ctx && ifq->master_ctx =3D=3D ctx) { + ifq->master_ctx =3D NULL; + percpu_ref_put(&ctx->refs); + } + } + if (refcount_dec_and_test(&ifq->user_refs)) { io_close_queue(ifq); io_zcrx_scrub(ifq); } } =20 -static void zcrx_unregister(struct 
io_zcrx_ifq *ifq) +static void zcrx_unregister(struct io_zcrx_ifq *ifq, struct io_ring_ctx *c= tx) { - zcrx_unregister_user(ifq); + zcrx_unregister_user(ifq, ctx); io_put_zcrx_ifq(ifq); } =20 @@ -686,7 +707,7 @@ static int zcrx_box_release(struct inode *inode, struct= file *file) =20 if (WARN_ON_ONCE(!ifq)) return -EFAULT; - zcrx_unregister(ifq); + zcrx_unregister(ifq, NULL); return 0; } =20 @@ -711,7 +732,7 @@ static int zcrx_export(struct io_ring_ctx *ctx, struct = io_zcrx_ifq *ifq, file =3D anon_inode_create_getfile("[zcrx]", &zcrx_box_fops, ifq, O_CLOEXEC, NULL); if (IS_ERR(file)) { - zcrx_unregister(ifq); + zcrx_unregister(ifq, NULL); return PTR_ERR(file); } =20 @@ -787,7 +808,7 @@ static int import_zcrx(struct io_ring_ctx *ctx, scoped_guard(mutex, &ctx->mmap_lock) xa_erase(&ctx->zcrx_ctxs, id); err: - zcrx_unregister(ifq); + zcrx_unregister(ifq, ctx); return ret; } =20 @@ -932,12 +953,14 @@ int io_register_zcrx(struct io_ring_ctx *ctx, ret =3D -EFAULT; goto err; } + + zcrx_set_ring_ctx(ifq, ctx); return 0; err: scoped_guard(mutex, &ctx->mmap_lock) xa_erase(&ctx->zcrx_ctxs, id); ifq_free: - zcrx_unregister(ifq); + zcrx_unregister(ifq, ctx); return ret; } =20 @@ -967,7 +990,7 @@ void io_terminate_zcrx(struct io_ring_ctx *ctx) break; set_zcrx_entry_mark(ctx, id); id++; - zcrx_unregister_user(ifq); + zcrx_unregister_user(ifq, ctx); } } =20 diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h index 9e1a6a1b11e8..6b565d0bf6da 100644 --- a/io_uring/zcrx.h +++ b/io_uring/zcrx.h @@ -73,6 +73,9 @@ struct io_zcrx_ifq { */ struct mutex pp_lock; struct io_mapped_region rq_region; + + spinlock_t ctx_lock; + struct io_ring_ctx *master_ctx; }; =20 #if defined(CONFIG_IO_URING_ZCRX) --=20 2.53.0-Meta From nobody Mon May 25 05:12:14 2026 Received: from mx0b-00082601.pphosted.com (mx0b-00082601.pphosted.com [67.231.153.30]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 
86E3D3FBB68; Mon, 18 May 2026 15:36:34 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=67.231.153.30 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1779118596; cv=none; b=G8fODwxoxTFU5Hw0pnLWKHbaMTObwzfRiywW88C0tLvg0p31otjI5M8WoEUHE4ZDGsF6cI60JIWnfxLT1nGFDEXSKbXNPk8moTk7K6Cthp7TZ6T35kbuVcpFs2EiTIpGU31Rov6YvydI6d7r6VQ0XnRqaUTLd/TDgoHz+5nf7fo= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1779118596; c=relaxed/simple; bh=tyukqsHNmM5zZKmC/VN1FsZSXO83SY6EK7UWphe2G2A=; h=From:To:CC:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version:Content-Type; b=IwyCgqfGOKzlqfiJwMiBN6D1Qq3Wf37a0zZCXSm1fz1FAlo6TugqWxGLhlBp4HbzKKNTKAcTLw5PzV7cAwtk6LTNsFahTD6S8EcTYIa5JnJUjjuV8Q61y1y7SQNpNLNkZCyL4rvpaNlfJNeBJh+BiwWBsoMonaHl3eEVg5QsnEA= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=reject dis=none) header.from=meta.com; spf=pass smtp.mailfrom=meta.com; dkim=pass (2048-bit key) header.d=meta.com header.i=@meta.com header.b=AymUx7Ic; arc=none smtp.client-ip=67.231.153.30 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=reject dis=none) header.from=meta.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=meta.com Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=meta.com header.i=@meta.com header.b="AymUx7Ic" Received: from pps.filterd (m0528005.ppops.net [127.0.0.1]) by mx0a-00082601.pphosted.com (8.18.1.11/8.18.1.11) with ESMTP id 64I5OeKR2338178; Mon, 18 May 2026 08:36:23 -0700 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=meta.com; h=cc :content-transfer-encoding:content-type:date:from:in-reply-to :message-id:mime-version:references:subject:to; s=s2048-2025-q2; bh=pQPxJk4W/DOHrm9nMz0gvk3eSsq4LRgzVMYDbz1BfS0=; b=AymUx7IcViHX apHGpEib2gek/XEXRUwmgCWnkHKiiMojssoY8Ui80PvLkUeCJMpTZaiOUj3UPtuQ e0cFdrQoxOLPaeMf5LDp46sIzgPqUDq2wYu4/ffdKcXlGb4lSJgUWzADXX5i9hOP 
No7tlbp9tQ5IwjYbOklagXAyrjIcT8h274tb4Dr3INk0hjXDGvUV6o8689nV8UKh KqZUIhZ1aunKAmTVbut48yiHbfd8VjgRLlFfP1Taaa74aSxZc8nNZj69nlzrPZPY Gm12W77Av8vj7gRVPcmrXmFby/6p5/G8J8faoHj1sUNzJpiGjJckVULLNOQLGAL/ Mb5vcZsHKQ== Received: from maileast.thefacebook.com ([163.114.135.16]) by mx0a-00082601.pphosted.com (PPS) with ESMTPS id 4e7a5hp7pw-3 (version=TLSv1.2 cipher=ECDHE-RSA-AES128-GCM-SHA256 bits=128 verify=NOT); Mon, 18 May 2026 08:36:23 -0700 (PDT) Received: from localhost (2620:10d:c0a8:1b::2d) by mail.thefacebook.com (2620:10d:c0a9:6f::237c) with Microsoft SMTP Server (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256) id 15.2.2562.37; Mon, 18 May 2026 15:36:16 +0000 From: =?UTF-8?q?Cl=C3=A9ment=20L=C3=A9ger?= To: , Pavel Begunkov , "Jens Axboe" CC: , , , , "David S. Miller" , Eric Dumazet , "Jakub Kicinski" , Paolo Abeni , Simon Horman , Jonathan Corbet , Shuah Khan , Vishwanath Seshagiri , "Vishwanath Seshagiri" Subject: [PATCH v2 2/6] io_uring/zcrx: notify user when out of buffers Date: Mon, 18 May 2026 08:35:25 -0700 Message-ID: <20260518153532.2835502-3-cleger@meta.com> X-Mailer: git-send-email 2.52.0 In-Reply-To: <20260518153532.2835502-1-cleger@meta.com> References: <20260518153532.2835502-1-cleger@meta.com> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Authority-Analysis: v=2.4 cv=NqXhtcdJ c=1 sm=1 tr=0 ts=6a0b31f7 cx=c_pps a=MfjaFnPeirRr97d5FC5oHw==:117 a=MfjaFnPeirRr97d5FC5oHw==:17 a=NGcC8JguVDcA:10 a=M51BFTxLslgA:10 a=VkNPw1HP01LnGYTKEx00:22 a=7x6HtfJdh03M6CCDgxCd:22 a=jCddH8ec0KUNCymVuxII:22 a=pGLkceISAAAA:8 a=VabnemYjAAAA:8 a=XVcddZJZvRb1FB5kz7kA:9 a=gKebqoRLp9LExxC7YDUY:22 X-Proofpoint-GUID: RA94Vg0WCFFZBfMEq1n-ssV49B3uTeCx X-Proofpoint-ORIG-GUID: RA94Vg0WCFFZBfMEq1n-ssV49B3uTeCx X-Proofpoint-Spam-Details-Enc: AW1haW4tMjYwNTE4MDE1MyBTYWx0ZWRfX1rbe3PSPZYH6 
G15rTN6f+Q2PJUpzalz4Dzev6yByu2B1EeoyESyYUwZG1lZPCVomrLbbwZfO910DktpvR+SuSeA +q7PzWYZcTKNCp/JHSgzbzJsx4zVJjpfBXAJ4cfkauVTbVDyR8l9fJauMAOTjhPgLMIYhq79dk4 00hZtbAgujzPNUTgQdOm3zhUGIYuNchEyUdEKDUgQpNCEqQj4MWv2gzwFXJjPWt9sitLBxJxy5L M6vJMdv65a8j+jThsr1jNQJRFv6Y2EJMu+wU1mjW9D5u8ZhnA9OSlTmSCSajR4ksPPttVh8IZy7 HAc8HprOQf9sRZeehonp/jAvdZhV79Y8Op6qoqJKRJ4TKofonXdDb+Jf01rc3gAneqKT0RdsWfT +qTI9GH2KtRMNvLurwTWs8ADTodYUydpQjqYI5FW1aBjTQXO3Y15to4QsCCNr/IWTInbgIbLUXf 7g4zlIieK/x2DOFvWlA== X-Proofpoint-Virus-Version: vendor=baseguard engine=ICAP:2.0.293,Aquarius:18.0.1143,Hydra:6.1.51,FMLib:17.12.100.49 definitions=2026-05-18_03,2026-05-18_01,2025-10-01_01 Content-Type: text/plain; charset="utf-8" From: Pavel Begunkov There are currently no easy ways for the user to know if zcrx is out of buffers and page pool fails to allocate. Add uapi for zcrx to communicate it back. It's implemented as a separate CQE, which for now is posted to the creator ctx. To use it, on registration the user space needs to pass an instance of struct zcrx_notification_desc, which tells the kernel the user_data for resulting CQEs and which event types are expected / allowed. When an allowed event happens, zcrx will post a CQE containing the specified user_data, and lower bits of cqe->res will be set to the event mask. Before the kernel could post another notification of the given type, the user needs to acknowledge that it processed the previous one by issuing IORING_REGISTER_ZCRX_CTRL with ZCRX_CTRL_ARM_NOTIFICATION. The only notification type the patch implements is ZCRX_NOTIF_NO_BUFFERS, but we'll need more of them in the future. 
Co-developed-by: Vishwanath Seshagiri Signed-off-by: Pavel Begunkov --- include/uapi/linux/io_uring/zcrx.h | 24 ++++++++- io_uring/io_uring.c | 2 +- io_uring/io_uring.h | 1 + io_uring/zcrx.c | 86 +++++++++++++++++++++++++++++- io_uring/zcrx.h | 7 ++- 5 files changed, 115 insertions(+), 5 deletions(-) diff --git a/include/uapi/linux/io_uring/zcrx.h b/include/uapi/linux/io_uri= ng/zcrx.h index 5ce02c7a6096..67185566ad3c 100644 --- a/include/uapi/linux/io_uring/zcrx.h +++ b/include/uapi/linux/io_uring/zcrx.h @@ -65,6 +65,20 @@ enum zcrx_features { * value in struct io_uring_zcrx_ifq_reg::rx_buf_len. */ ZCRX_FEATURE_RX_PAGE_SIZE =3D 1 << 0, + ZCRX_FEATURE_NOTIFICATION =3D 1 << 1, +}; + +enum zcrx_notification_type { + ZCRX_NOTIF_NO_BUFFERS, + + __ZCRX_NOTIF_TYPE_LAST, +}; + +struct zcrx_notification_desc { + __u64 user_data; + __u32 type_mask; + __u32 __resv1; + __u64 __resv2[10]; }; =20 /* @@ -82,12 +96,14 @@ struct io_uring_zcrx_ifq_reg { struct io_uring_zcrx_offsets offsets; __u32 zcrx_id; __u32 rx_buf_len; - __u64 __resv[3]; + __u64 notif_desc; /* see struct zcrx_notification_desc */ + __u64 __resv[2]; }; =20 enum zcrx_ctrl_op { ZCRX_CTRL_FLUSH_RQ, ZCRX_CTRL_EXPORT, + ZCRX_CTRL_ARM_NOTIFICATION, =20 __ZCRX_CTRL_LAST, }; @@ -101,6 +117,11 @@ struct zcrx_ctrl_export { __u32 __resv1[11]; }; =20 +struct zcrx_ctrl_arm_notif { + __u32 notif_type; + __u32 __resv[11]; +}; + struct zcrx_ctrl { __u32 zcrx_id; __u32 op; /* see enum zcrx_ctrl_op */ @@ -109,6 +130,7 @@ struct zcrx_ctrl { union { struct zcrx_ctrl_export zc_export; struct zcrx_ctrl_flush_rq zc_flush; + struct zcrx_ctrl_arm_notif zc_arm_notif; }; }; =20 diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 2ebb0ba37c4f..c5972274cce1 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -160,7 +160,7 @@ static void io_poison_cached_req(struct io_kiocb *req) req->apoll =3D IO_URING_PTR_POISON; } =20 -static void io_poison_req(struct io_kiocb *req) +void io_poison_req(struct io_kiocb *req) { 
io_poison_cached_req(req); req->async_data =3D IO_URING_PTR_POISON; diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index e612a66ee80e..de0a3bed58d1 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -213,6 +213,7 @@ bool __io_alloc_req_refill(struct io_ring_ctx *ctx); =20 void io_activate_pollwq(struct io_ring_ctx *ctx); void io_restriction_clone(struct io_restriction *dst, struct io_restrictio= n *src); +void io_poison_req(struct io_kiocb *req); =20 static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx) { diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 34faf90423f4..463fbaead35b 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -768,6 +768,8 @@ static int import_zcrx(struct io_ring_ctx *ctx, return -EINVAL; if (reg->if_rxq || reg->rq_entries || reg->area_ptr || reg->region_ptr) return -EINVAL; + if (reg->notif_desc) + return -EINVAL; if (reg->flags & ~ZCRX_REG_IMPORT) return -EINVAL; =20 @@ -856,6 +858,7 @@ static int zcrx_register_netdev(struct io_zcrx_ifq *ifq, int io_register_zcrx(struct io_ring_ctx *ctx, struct io_uring_zcrx_ifq_reg __user *arg) { + struct zcrx_notification_desc notif; struct io_uring_zcrx_area_reg area; struct io_uring_zcrx_ifq_reg reg; struct io_uring_region_desc rd; @@ -899,10 +902,22 @@ int io_register_zcrx(struct io_ring_ctx *ctx, if (copy_from_user(&area, u64_to_user_ptr(reg.area_ptr), sizeof(area))) return -EFAULT; =20 + memset(¬if, 0, sizeof(notif)); + if (reg.notif_desc && copy_from_user(¬if, u64_to_user_ptr(reg.notif_de= sc), + sizeof(notif))) + return -EFAULT; + if (notif.type_mask & ~ZCRX_NOTIF_TYPE_MASK) + return -EINVAL; + if (notif.__resv1 || !mem_is_zero(¬if.__resv2, sizeof(notif.__resv2))) + return -EINVAL; + ifq =3D io_zcrx_ifq_alloc(ctx); if (!ifq) return -ENOMEM; =20 + ifq->notif_data =3D notif.user_data; + ifq->allowed_notif_mask =3D notif.type_mask; + if (ctx->user) { get_uid(ctx->user); ifq->user =3D ctx->user; @@ -954,7 +969,8 @@ int io_register_zcrx(struct io_ring_ctx *ctx, 
goto err; } =20 - zcrx_set_ring_ctx(ifq, ctx); + if (notif.type_mask) + zcrx_set_ring_ctx(ifq, ctx); return 0; err: scoped_guard(mutex, &ctx->mmap_lock) @@ -1127,6 +1143,48 @@ static unsigned io_zcrx_refill_slow(struct page_pool= *pp, struct io_zcrx_ifq *if return allocated; } =20 +static void zcrx_notif_tw(struct io_tw_req tw_req, io_tw_token_t tw) +{ + struct io_kiocb *req =3D tw_req.req; + struct io_ring_ctx *ctx =3D req->ctx; + + io_post_aux_cqe(ctx, req->cqe.user_data, req->cqe.res, 0); + percpu_ref_put(&ctx->refs); + io_poison_req(req); + kmem_cache_free(req_cachep, req); +} + +static void zcrx_send_notif(struct io_zcrx_ifq *ifq, unsigned type) +{ + gfp_t gfp =3D GFP_ATOMIC | __GFP_NOWARN | __GFP_ZERO; + u32 type_mask =3D 1 << type; + struct io_kiocb *req; + + if (!(type_mask & ifq->allowed_notif_mask)) + return; + + guard(spinlock_bh)(&ifq->ctx_lock); + if (!ifq->master_ctx) + return; + if (type_mask & ifq->fired_notifs) + return; + + req =3D kmem_cache_alloc(req_cachep, gfp); + if (unlikely(!req)) + return; + + ifq->fired_notifs |=3D type_mask; + + req->opcode =3D IORING_OP_NOP; + req->cqe.user_data =3D ifq->notif_data; + req->cqe.res =3D type; + req->ctx =3D ifq->master_ctx; + percpu_ref_get(&req->ctx->refs); + req->tctx =3D NULL; + req->io_task_work.func =3D zcrx_notif_tw; + io_req_task_work_add(req); +} + static netmem_ref io_pp_zc_alloc_netmems(struct page_pool *pp, gfp_t gfp) { struct io_zcrx_ifq *ifq =3D io_pp_to_ifq(pp); @@ -1143,8 +1201,10 @@ static netmem_ref io_pp_zc_alloc_netmems(struct page= _pool *pp, gfp_t gfp) goto out_return; =20 allocated =3D io_zcrx_refill_slow(pp, ifq, netmems, to_alloc); - if (!allocated) + if (!allocated) { + zcrx_send_notif(ifq, ZCRX_NOTIF_NO_BUFFERS); return 0; + } out_return: zcrx_sync_for_device(pp, ifq, netmems, allocated); allocated--; @@ -1293,12 +1353,32 @@ static int zcrx_flush_rq(struct io_ring_ctx *ctx, s= truct io_zcrx_ifq *zcrx, return 0; } =20 +static int zcrx_arm_notif(struct io_ring_ctx *ctx, struct 
io_zcrx_ifq *zcr= x, + struct zcrx_ctrl *ctrl) +{ + const struct zcrx_ctrl_arm_notif *an =3D &ctrl->zc_arm_notif; + unsigned type_mask; + + if (an->notif_type >=3D __ZCRX_NOTIF_TYPE_LAST) + return -EINVAL; + if (!mem_is_zero(&an->__resv, sizeof(an->__resv))) + return -EINVAL; + + guard(spinlock_bh)(&zcrx->ctx_lock); + type_mask =3D 1U << an->notif_type; + if (type_mask & ~zcrx->fired_notifs) + return -EINVAL; + zcrx->fired_notifs &=3D ~type_mask; + return 0; +} + int io_zcrx_ctrl(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_ar= gs) { struct zcrx_ctrl ctrl; struct io_zcrx_ifq *zcrx; =20 BUILD_BUG_ON(sizeof(ctrl.zc_export) !=3D sizeof(ctrl.zc_flush)); + BUILD_BUG_ON(sizeof(ctrl.zc_export) !=3D sizeof(ctrl.zc_arm_notif)); =20 if (nr_args) return -EINVAL; @@ -1316,6 +1396,8 @@ int io_zcrx_ctrl(struct io_ring_ctx *ctx, void __user= *arg, unsigned nr_args) return zcrx_flush_rq(ctx, zcrx, &ctrl); case ZCRX_CTRL_EXPORT: return zcrx_export(ctx, zcrx, &ctrl, arg); + case ZCRX_CTRL_ARM_NOTIFICATION: + return zcrx_arm_notif(ctx, zcrx, &ctrl); } =20 return -EOPNOTSUPP; diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h index 6b565d0bf6da..cca10d0d02ac 100644 --- a/io_uring/zcrx.h +++ b/io_uring/zcrx.h @@ -9,7 +9,9 @@ #include =20 #define ZCRX_SUPPORTED_REG_FLAGS (ZCRX_REG_IMPORT | ZCRX_REG_NODEV) -#define ZCRX_FEATURES (ZCRX_FEATURE_RX_PAGE_SIZE) +#define ZCRX_FEATURES (ZCRX_FEATURE_RX_PAGE_SIZE |\ + ZCRX_FEATURE_NOTIFICATION) +#define ZCRX_NOTIF_TYPE_MASK (1U << ZCRX_NOTIF_NO_BUFFERS) =20 struct io_zcrx_mem { unsigned long size; @@ -76,6 +78,9 @@ struct io_zcrx_ifq { =20 spinlock_t ctx_lock; struct io_ring_ctx *master_ctx; + u32 allowed_notif_mask; + u32 fired_notifs; + u64 notif_data; }; =20 #if defined(CONFIG_IO_URING_ZCRX) --=20 2.53.0-Meta From nobody Mon May 25 05:12:14 2026 Received: from mx0a-00082601.pphosted.com (mx0a-00082601.pphosted.com [67.231.145.42]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by 
smtp.subspace.kernel.org (Postfix) with ESMTPS id 1038C36CDE3; Mon, 18 May 2026 15:36:29 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=67.231.145.42 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1779118591; cv=none; b=b4PxrNSqiNem8jUnGW+9pmDioAbUwzRAMw0bMtTb4Lv49V0xoHNsgyCO54FV0iu5zVuJbVaNfxfaiimO00EgjSESZzCIpdolmhqFxrbM1glM9nci7QSJpSySCe3uDnf53pFbxRt6DYWDRwU3pQBE32LEWE2IFcQa0GidTApnAfE= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1779118591; c=relaxed/simple; bh=7u+hKOoWW66Y8kcYWNogf07t4JVPr6VZPCTE+rHo3EM=; h=From:To:CC:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version:Content-Type; b=kSB4RgssKxNPgsbjmbavsOGv819PcwHtylJ7lPnpJOTkGkSQ75uoI6ztNNSqhlL1Jlj/iCielyS/bZJdBJmhgw1gcnYlX6cywmXsXOXRtfVef2ewxBlCpY3ARInIiBVsFJPgil7HZ8/vd5WfYV88qezFQaOyQNlzalOuVkM/sbU= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=reject dis=none) header.from=meta.com; spf=pass smtp.mailfrom=meta.com; dkim=pass (2048-bit key) header.d=meta.com header.i=@meta.com header.b=pEOj1Kxm; arc=none smtp.client-ip=67.231.145.42 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=reject dis=none) header.from=meta.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=meta.com Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=meta.com header.i=@meta.com header.b="pEOj1Kxm" Received: from pps.filterd (m0528009.ppops.net [127.0.0.1]) by mx0a-00082601.pphosted.com (8.18.1.11/8.18.1.11) with ESMTP id 64I31jt01641348; Mon, 18 May 2026 08:36:22 -0700 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=meta.com; h=cc :content-transfer-encoding:content-type:date:from:in-reply-to :message-id:mime-version:references:subject:to; s=s2048-2025-q2; bh=hC/Q+HF3sh1W434WgFYdHA9Vrg7QRFbGt3rlf+dK6So=; b=pEOj1Kxmpstz 8X/t8Cj0lnJb0G14rS2epxxOkQ78MNZUGFxQ6ngP/mnbneB+KxRo09GPEZylFSf6 
uz1bkN4eBdBKIBThP5gRvncqkTSOtA+qfl3GTbPArjXl//qJfa1qTbR6mtTRH2ZI gepCf/M/4SXOeWlvGh3iyA3EWTRAs0hDVCFKhee5dlVzvbnHiRUui4Xdl9AmEEaG J/VUcXvaLu5ZwE522B/D7iTIe5Nj9gRNPnRhgvMqidqUjB+wmaFXcrqLF6LQA+Ok mieYyv1S3zuW7FHbs7mfgu+24WUmodyBd5wIweJSMGJCYYk3DSrx6YhPmUP2fuT4 KQEoS4L09A== Received: from mail.thefacebook.com ([163.114.134.16]) by mx0a-00082601.pphosted.com (PPS) with ESMTPS id 4e7ab565a0-5 (version=TLSv1.2 cipher=ECDHE-RSA-AES128-GCM-SHA256 bits=128 verify=NOT); Mon, 18 May 2026 08:36:21 -0700 (PDT) Received: from localhost (2620:10d:c085:108::150d) by mail.thefacebook.com (2620:10d:c08b:78::2ac9) with Microsoft SMTP Server (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256) id 15.2.2562.37; Mon, 18 May 2026 15:36:21 +0000 From: =?UTF-8?q?Cl=C3=A9ment=20L=C3=A9ger?= To: , Pavel Begunkov , "Jens Axboe" CC: =?UTF-8?q?Cl=C3=A9ment=20L=C3=A9ger?= , , , , , "David S. Miller" , Eric Dumazet , "Jakub Kicinski" , Paolo Abeni , Simon Horman , Jonathan Corbet , Shuah Khan , Vishwanath Seshagiri Subject: [PATCH v2 3/6] io_uring/zcrx: notify user on frag copy fallback Date: Mon, 18 May 2026 08:35:26 -0700 Message-ID: <20260518153532.2835502-4-cleger@meta.com> X-Mailer: git-send-email 2.52.0 In-Reply-To: <20260518153532.2835502-1-cleger@meta.com> References: <20260518153532.2835502-1-cleger@meta.com> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: quoted-printable X-Authority-Analysis: v=2.4 cv=XNQAjwhE c=1 sm=1 tr=0 ts=6a0b31f5 cx=c_pps a=CB4LiSf2rd0gKozIdrpkBw==:117 a=CB4LiSf2rd0gKozIdrpkBw==:17 a=IkcTkHD0fZMA:10 a=NGcC8JguVDcA:10 a=M51BFTxLslgA:10 a=VkNPw1HP01LnGYTKEx00:22 a=7x6HtfJdh03M6CCDgxCd:22 a=U_y8lYiYyhHBU5rMqhb2:22 a=VabnemYjAAAA:8 a=wIfzXL8Z3gsHLL3D1c4A:9 a=3ZKOabzyN94A:10 a=QEXdDO2ut3YA:10 a=gKebqoRLp9LExxC7YDUY:22 X-Proofpoint-Spam-Details-Enc: AW1haW4tMjYwNTE4MDE1MyBTYWx0ZWRfX0EDeXOgP5v7K 
4Fl5RY9RkJYG9nDEcdau3aw6fUXawfnXEFaepnGFB/WaWpq1fnBZ8LxGxBoc5+o/xiL+7F6b7Z0 MpgUinlQB/n/CDFKdhAgcIRWtFanQPxRo5oQiIW64w+P6PmGZk4zdyYesC3vwI3lXg8rDivHr6G f6ODdhZyRKyFU/faY6byRQHWg4PJ/VVvmq47VGZaDD4hxyFVbN/NY1Q2ypJxepFxpm1PP3IAo3u gWOJWTTmCBC+HyL4A3Xt6FOzwdWXcOkGr6rOe8qZPf/LJehUL5Ko5gQKezB6bngt7lKvGpNd4B3 bk6/l8G1dEti1POD5q1JmUlbWPf7I3ML5vDj7UGrgdrFL/CMWOxeOf/TxWzCo8REpKKdsD0qGgU omx8UTXZ3+v4wD4BOc70tdmfHr5m6WZD5XR8ylN70tJJJvBeIuV0M3/+53AesrtaS6FZOWprOv6 h9Y8bdr2Xk4xwg1Xq2Q== X-Proofpoint-ORIG-GUID: 9maH-ML0zefpJDOm0izlgR6sZCstVxHE X-Proofpoint-GUID: 9maH-ML0zefpJDOm0izlgR6sZCstVxHE X-Proofpoint-Virus-Version: vendor=baseguard engine=ICAP:2.0.293,Aquarius:18.0.1143,Hydra:6.1.51,FMLib:17.12.100.49 definitions=2026-05-18_03,2026-05-18_01,2025-10-01_01 Add a ZCRX_NOTIF_COPY notification type to signal userspace when a received fragment could not be delivered using zero-copy and was instead copied into a buffer. Signed-off-by: Cl=C3=A9ment L=C3=A9ger --- include/uapi/linux/io_uring/zcrx.h | 1 + io_uring/zcrx.c | 7 ++++++- io_uring/zcrx.h | 2 +- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/io_uring/zcrx.h b/include/uapi/linux/io_uri= ng/zcrx.h index 67185566ad3c..3f7b72b09878 100644 --- a/include/uapi/linux/io_uring/zcrx.h +++ b/include/uapi/linux/io_uring/zcrx.h @@ -70,6 +70,7 @@ enum zcrx_features { =20 enum zcrx_notification_type { ZCRX_NOTIF_NO_BUFFERS, + ZCRX_NOTIF_COPY, =20 __ZCRX_NOTIF_TYPE_LAST, }; diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 463fbaead35b..f31f2ca0f7ec 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -1534,8 +1534,13 @@ static int io_zcrx_copy_frag(struct io_kiocb *req, s= truct io_zcrx_ifq *ifq, const skb_frag_t *frag, int off, int len) { struct page *page =3D skb_frag_page(frag); + int ret; + + ret =3D io_zcrx_copy_chunk(req, ifq, page, off + skb_frag_off(frag), len); + if (ret > 0) + zcrx_send_notif(ifq, ZCRX_NOTIF_COPY); =20 - return io_zcrx_copy_chunk(req, ifq, page, off + skb_frag_off(frag), 
len); + return ret; } =20 static int io_zcrx_recv_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq, diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h index cca10d0d02ac..203b3049e14b 100644 --- a/io_uring/zcrx.h +++ b/io_uring/zcrx.h @@ -11,7 +11,7 @@ #define ZCRX_SUPPORTED_REG_FLAGS (ZCRX_REG_IMPORT | ZCRX_REG_NODEV) #define ZCRX_FEATURES (ZCRX_FEATURE_RX_PAGE_SIZE |\ ZCRX_FEATURE_NOTIFICATION) -#define ZCRX_NOTIF_TYPE_MASK (1U << ZCRX_NOTIF_NO_BUFFERS) +#define ZCRX_NOTIF_TYPE_MASK ((1U << ZCRX_NOTIF_NO_BUFFERS) | (1U << ZCRX= _NOTIF_COPY)) =20 struct io_zcrx_mem { unsigned long size; --=20 2.53.0-Meta From nobody Mon May 25 05:12:14 2026 Received: from mx0a-00082601.pphosted.com (mx0a-00082601.pphosted.com [67.231.145.42]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 0CC5B4657F5; Mon, 18 May 2026 15:36:35 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=67.231.145.42 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1779118597; cv=none; b=k20yGjqyXmSY+I71CzUNiriZKkyL778QdVgx90/hBmh15RtimqCgIq6ImC20JBF+P3WBW/epGe8nWIEL0hnkLXQ4XXd0CMl1fFThuQSUjJblBnBMwYMrCGWNWKg1N0nBhlk9QI2HBheai3eJoLhEEHcurcX/3dyHQKKADFib3YY= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1779118597; c=relaxed/simple; bh=jvp+xIe56wOZuwxAYmxCWXJS3N4zRsG8U/ZIZVspQfw=; h=From:To:CC:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version:Content-Type; b=pNC5qTirjPhHbW8/phhCHHY+buo0/32dBGrCV90weV9jZNVEReNEW0sD221jGoSzSOUaqLklhjjCzTUIm4TbZLZFB3Evx13BIvFxhHoaYsEUxlBPZNb9xsy7sbo0Pm+ygDOK3G4d8VMqCSPRbR7TI24OtLwk3s0CzxZf4yrhgyk= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=reject dis=none) header.from=meta.com; spf=pass smtp.mailfrom=meta.com; dkim=pass (2048-bit key) header.d=meta.com header.i=@meta.com header.b=t//fGip9; arc=none smtp.client-ip=67.231.145.42 
Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=reject dis=none) header.from=meta.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=meta.com Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=meta.com header.i=@meta.com header.b="t//fGip9" Received: from pps.filterd (m0148461.ppops.net [127.0.0.1]) by mx0a-00082601.pphosted.com (8.18.1.11/8.18.1.11) with ESMTP id 64I0721A971899; Mon, 18 May 2026 08:36:27 -0700 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=meta.com; h=cc :content-transfer-encoding:content-type:date:from:in-reply-to :message-id:mime-version:references:subject:to; s=s2048-2025-q2; bh=N9cwM3uJpW9AqVLCsjhE16RDghhwc3hL+4K3YJVoltU=; b=t//fGip9UXVO 0OjmHSE7qme86tB0L1Lp2zEqbn/oAajYvvQhBKDJRgLwCjeS2JyIacAGNgXq2jch 9E37a7soTaq0t7jf4KDSPdQscWaCNbnD3EvEQ5kQKdI58VE80t/zYooMKRgGT9t3 JOZS+Fsy5A88AEO8yeteL6n9NfH9KkKfaNPSKwHB3kDwIEoFgVxhrc1f4ldTXuo7 8ABwBornAuUVjGo6uMaWqgGk4JQI12BJQCPBkEjySmBBlsFC/CMux0CiI1PxH7ny sB0H1iZqp8O2RmFPpm+cb5FYnTlv4CHDKQP1E2QBZUAx+knVM7xjCPYLWDsUpkoP /1tq1YECag== Received: from mail.thefacebook.com ([163.114.134.16]) by mx0a-00082601.pphosted.com (PPS) with ESMTPS id 4e6np69rcp-2 (version=TLSv1.2 cipher=ECDHE-RSA-AES128-GCM-SHA256 bits=128 verify=NOT); Mon, 18 May 2026 08:36:27 -0700 (PDT) Received: from localhost (2620:10d:c085:108::150d) by mail.thefacebook.com (2620:10d:c08b:78::2ac9) with Microsoft SMTP Server (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256) id 15.2.2562.37; Mon, 18 May 2026 15:36:25 +0000 From: =?UTF-8?q?Cl=C3=A9ment=20L=C3=A9ger?= To: , Pavel Begunkov , "Jens Axboe" CC: =?UTF-8?q?Cl=C3=A9ment=20L=C3=A9ger?= , , , , , "David S. 
Miller" , Eric Dumazet , "Jakub Kicinski" , Paolo Abeni , Simon Horman , Jonathan Corbet , Shuah Khan , Vishwanath Seshagiri Subject: [PATCH v2 4/6] io_uring/zcrx: add shared-memory notification statistics Date: Mon, 18 May 2026 08:35:27 -0700 Message-ID: <20260518153532.2835502-5-cleger@meta.com> X-Mailer: git-send-email 2.52.0 In-Reply-To: <20260518153532.2835502-1-cleger@meta.com> References: <20260518153532.2835502-1-cleger@meta.com> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: quoted-printable X-Proofpoint-Spam-Details-Enc: AW1haW4tMjYwNTE4MDE1MyBTYWx0ZWRfX3LiKzmQg7Cdb Ao6/ngCD9DhIyfd7EN7/VynYidOUyLhUMKvCn+EK4rm7rn/jEYcjWCiV3J6bPG1aW3rOzQmrlD7 VsuVhxEDQGpSwaqvkBqcLD+wsiKvtvY3tW/KEVhPSQ16BQclM8iuJuRamIInTKRqKBy86msiLsL wnywtQZrujqAdwBaouz05ECcF5sPVnHab3tEltT+tGCj608Hlif+OMBwOdYJYHQ4A0VOtqzt2k3 Yl6FDFgyNjfvJFLxQEpPGWJ0XaIyZJ1W7MgWdGjVmnQGQ2HULNvLi05o6Mx37Mfg3pMbMbLruF2 8rSh2PXZfdlsb2VAmo6QOEbxCChW8Pzqm7abWU7eblaSFGEmg8BO3/unkfu+L/aegB31M7Wjcxd /9bC/uoG+8FL83+YIYVmXmEZdwQmMxnMPbsRnJsYDS/Z+zwu8oU4yujGnZOxvpAeaXM13o6HjW7 Cduuhnh9WXjjRDYidpg== X-Authority-Analysis: v=2.4 cv=Pr2jqQM3 c=1 sm=1 tr=0 ts=6a0b31fb cx=c_pps a=CB4LiSf2rd0gKozIdrpkBw==:117 a=CB4LiSf2rd0gKozIdrpkBw==:17 a=IkcTkHD0fZMA:10 a=NGcC8JguVDcA:10 a=M51BFTxLslgA:10 a=VkNPw1HP01LnGYTKEx00:22 a=7x6HtfJdh03M6CCDgxCd:22 a=03ozwUkBphtHgyqjj1sw:22 a=VabnemYjAAAA:8 a=tUABAHM7GXuONXiLVFsA:9 a=3ZKOabzyN94A:10 a=QEXdDO2ut3YA:10 a=gKebqoRLp9LExxC7YDUY:22 X-Proofpoint-GUID: shiX0prg-5IktwqHiSbTFgmKXKAjOqer X-Proofpoint-ORIG-GUID: shiX0prg-5IktwqHiSbTFgmKXKAjOqer X-Proofpoint-Virus-Version: vendor=baseguard engine=ICAP:2.0.293,Aquarius:18.0.1143,Hydra:6.1.51,FMLib:17.12.100.49 definitions=2026-05-18_03,2026-05-18_01,2025-10-01_01 Add support for an optional stats struct embedded in the refill queue region, allowing userspace to monitor copy-fallback in real-time. 
Userspace queries the stats struct size and alignment via IO_URING_QUERY_ZCRX_NOTIF (notif_stats_size / notif_stats_off_alignment), then provides a stats_offset in zcrx_notification_desc pointing to a location within the refill queue region. The kernel updates the stats counters in-place on every copy-fallback event. Signed-off-by: Cl=C3=A9ment L=C3=A9ger --- include/uapi/linux/io_uring/query.h | 12 +++++++ include/uapi/linux/io_uring/zcrx.h | 15 ++++++-- io_uring/query.c | 16 +++++++++ io_uring/zcrx.c | 54 +++++++++++++++++++++++++++-- io_uring/zcrx.h | 1 + 5 files changed, 94 insertions(+), 4 deletions(-) diff --git a/include/uapi/linux/io_uring/query.h b/include/uapi/linux/io_ur= ing/query.h index 95500759cc13..1a68eca7c6b4 100644 --- a/include/uapi/linux/io_uring/query.h +++ b/include/uapi/linux/io_uring/query.h @@ -23,6 +23,7 @@ enum { IO_URING_QUERY_OPCODES =3D 0, IO_URING_QUERY_ZCRX =3D 1, IO_URING_QUERY_SCQ =3D 2, + IO_URING_QUERY_ZCRX_NOTIF =3D 3, =20 __IO_URING_QUERY_MAX, }; @@ -62,6 +63,17 @@ struct io_uring_query_zcrx { __u64 __resv2; }; =20 +struct io_uring_query_zcrx_notif { + /* Bitmask of supported ZCRX_NOTIF_* flags */ + __u32 notif_flags; + /* Size of io_uring_zcrx_notif_stats */ + __u32 notif_stats_size; + /* Required alignment for the stats struct within the region (ie stats_of= fset) */ + __u32 notif_stats_off_alignment; + __u32 __resv1; + __u64 __resv2[4]; +}; + struct io_uring_query_scq { /* The SQ/CQ rings header size */ __u64 hdr_size; diff --git a/include/uapi/linux/io_uring/zcrx.h b/include/uapi/linux/io_uri= ng/zcrx.h index 3f7b72b09878..384e185a180c 100644 --- a/include/uapi/linux/io_uring/zcrx.h +++ b/include/uapi/linux/io_uring/zcrx.h @@ -75,11 +75,22 @@ enum zcrx_notification_type { __ZCRX_NOTIF_TYPE_LAST, }; =20 +enum zcrx_notification_desc_flags { + /* If set, stats_offset holds a valid offset to a notif_stats struct */ + ZCRX_NOTIF_DESC_FLAG_STATS =3D 1 << 0, +}; + +struct io_uring_zcrx_notif_stats { + __u64 copy_count; /* cumulative 
copy-fallback CQEs */ + __u64 copy_bytes; /* cumulative bytes copied */ +}; + struct zcrx_notification_desc { __u64 user_data; __u32 type_mask; - __u32 __resv1; - __u64 __resv2[10]; + __u32 flags; /* see enum zcrx_notification_desc_flags */ + __u64 stats_offset; /* offset from the beginning of refill ring region fo= r stats */ + __u64 __resv2[9]; }; =20 /* diff --git a/io_uring/query.c b/io_uring/query.c index c1704d088374..d17a83645bcd 100644 --- a/io_uring/query.c +++ b/io_uring/query.c @@ -9,6 +9,7 @@ union io_query_data { struct io_uring_query_opcode opcodes; struct io_uring_query_zcrx zcrx; + struct io_uring_query_zcrx_notif zcrx_notif; struct io_uring_query_scq scq; }; =20 @@ -44,6 +45,18 @@ static ssize_t io_query_zcrx(union io_query_data *data) return sizeof(*e); } =20 +static ssize_t io_query_zcrx_notif(union io_query_data *data) +{ + struct io_uring_query_zcrx_notif *e =3D &data->zcrx_notif; + + e->notif_flags =3D ZCRX_NOTIF_TYPE_MASK; + e->notif_stats_size =3D sizeof(struct io_uring_zcrx_notif_stats); + e->notif_stats_off_alignment =3D __alignof__(struct io_uring_zcrx_notif_s= tats); + e->__resv1 =3D 0; + memset(&e->__resv2, 0, sizeof(e->__resv2)); + return sizeof(*e); +} + static ssize_t io_query_scq(union io_query_data *data) { struct io_uring_query_scq *e =3D &data->scq; @@ -83,6 +96,9 @@ static int io_handle_query_entry(union io_query_data *dat= a, void __user *uhdr, case IO_URING_QUERY_ZCRX: ret =3D io_query_zcrx(data); break; + case IO_URING_QUERY_ZCRX_NOTIF: + ret =3D io_query_zcrx_notif(data); + break; case IO_URING_QUERY_SCQ: ret =3D io_query_scq(data); break; diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index f31f2ca0f7ec..2881ad76bacc 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -415,6 +415,7 @@ static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq) io_free_region(ifq->user, &ifq->rq_region); ifq->rq.ring =3D IO_URING_PTR_POISON; ifq->rq.rqes =3D IO_URING_PTR_POISON; + ifq->notif_stats =3D IO_URING_PTR_POISON; } =20 static void 
io_zcrx_free_area(struct io_zcrx_ifq *ifq, @@ -855,6 +856,33 @@ static int zcrx_register_netdev(struct io_zcrx_ifq *if= q, return ret; } =20 +static int zcrx_validate_notif_stats(struct io_zcrx_ifq *ifq, + const struct io_uring_zcrx_ifq_reg *reg, + const struct zcrx_notification_desc *notif) +{ + size_t stats_off =3D notif->stats_offset; + size_t used, end; + + used =3D reg->offsets.rqes + + sizeof(struct io_uring_zcrx_rqe) * reg->rq_entries; + + if (!IS_ALIGNED(stats_off, __alignof__(struct io_uring_zcrx_notif_stats))) + return -EINVAL; + if (stats_off < used) + return -ERANGE; + if (check_add_overflow(stats_off, + sizeof(struct io_uring_zcrx_notif_stats), + &end)) + return -ERANGE; + if (end > io_region_size(&ifq->rq_region)) + return -ERANGE; + + ifq->notif_stats =3D io_region_get_ptr(&ifq->rq_region) + stats_off; + memset(ifq->notif_stats, 0, sizeof(*ifq->notif_stats)); + + return 0; +} + int io_register_zcrx(struct io_ring_ctx *ctx, struct io_uring_zcrx_ifq_reg __user *arg) { @@ -908,7 +936,13 @@ int io_register_zcrx(struct io_ring_ctx *ctx, return -EFAULT; if (notif.type_mask & ~ZCRX_NOTIF_TYPE_MASK) return -EINVAL; - if (notif.__resv1 || !mem_is_zero(&notif.__resv2, sizeof(notif.__resv2))) + if (notif.flags & ~ZCRX_NOTIF_DESC_FLAG_STATS) + return -EINVAL; + if (!(notif.flags & ZCRX_NOTIF_DESC_FLAG_STATS)) { + if (notif.stats_offset) + return -EINVAL; + } + if (!mem_is_zero(&notif.__resv2, sizeof(notif.__resv2))) return -EINVAL; =20 ifq =3D io_zcrx_ifq_alloc(ctx); @@ -939,6 +973,12 @@ int io_register_zcrx(struct io_ring_ctx *ctx, if (ret) goto err; =20 + if (notif.flags & ZCRX_NOTIF_DESC_FLAG_STATS) { + ret =3D zcrx_validate_notif_stats(ifq, &reg, &notif); + if (ret) + goto err; + } + ifq->kern_readable =3D !(area.flags & IORING_ZCRX_AREA_DMABUF); =20 if (!(reg.flags & ZCRX_REG_NODEV)) { @@ -1154,6 +1194,11 @@ static void zcrx_notif_tw(struct io_tw_req tw_req, i= o_tw_token_t tw) kmem_cache_free(req_cachep, req); } =20 +static void zcrx_stat_add(__u64 *p, s64 v) +{ + 
WRITE_ONCE(*p, READ_ONCE(*p) + v); +} + static void zcrx_send_notif(struct io_zcrx_ifq *ifq, unsigned type) { gfp_t gfp =3D GFP_ATOMIC | __GFP_NOWARN | __GFP_ZERO; @@ -1537,8 +1582,13 @@ static int io_zcrx_copy_frag(struct io_kiocb *req, s= truct io_zcrx_ifq *ifq, int ret; =20 ret =3D io_zcrx_copy_chunk(req, ifq, page, off + skb_frag_off(frag), len); - if (ret > 0) + if (ret > 0) { + if (ifq->notif_stats) { + zcrx_stat_add(&ifq->notif_stats->copy_count, 1); + zcrx_stat_add(&ifq->notif_stats->copy_bytes, ret); + } zcrx_send_notif(ifq, ZCRX_NOTIF_COPY); + } =20 return ret; } diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h index 203b3049e14b..e1aab76c310d 100644 --- a/io_uring/zcrx.h +++ b/io_uring/zcrx.h @@ -81,6 +81,7 @@ struct io_zcrx_ifq { u32 allowed_notif_mask; u32 fired_notifs; u64 notif_data; + struct io_uring_zcrx_notif_stats *notif_stats; }; =20 #if defined(CONFIG_IO_URING_ZCRX) --=20 2.53.0-Meta From nobody Mon May 25 05:12:14 2026 Received: from mx0a-00082601.pphosted.com (mx0a-00082601.pphosted.com [67.231.145.42]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 1C2CF481231; Mon, 18 May 2026 15:36:38 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=67.231.145.42 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1779118601; cv=none; b=EOl2NizUbfYApC2/MWv4R6mQC9OUDxtl8dl+bJubH8atTb9dpVPaQwBXt2DShMkVsqppwglmyFLMRqGY+hR5zHPeJHGws2F177dMd3Q+8WIJbOrOm3NuDVwmWSqb/pRt/BVcx5g7mHcwHAqCcVbuu/LAzlbLrNepRP7yXKSkQbw= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1779118601; c=relaxed/simple; bh=l707VBbrCSJyGC+nnuNzjTQMnzp+k7ZUIis8WkSHD8g=; h=From:To:CC:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version:Content-Type; 
b=LT7eoWDkfWLN5ZAKMmgJnM1RTh/SYJ7HF0hgfR7SOfbpheI5KmEF4W0MSFWT+bcHD01au8j8Ljq1d67H+psQGW/gyLutcsGGf8d9eBR9M3FWyK2AylG7Ky84RHJ2G/SaSoI5WY1yr9Hcw3mFup8cUqkwwTWDfvTN6CsKUYlV5TM= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=reject dis=none) header.from=meta.com; spf=pass smtp.mailfrom=meta.com; dkim=pass (2048-bit key) header.d=meta.com header.i=@meta.com header.b=XqvpnDc5; arc=none smtp.client-ip=67.231.145.42 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=reject dis=none) header.from=meta.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=meta.com Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=meta.com header.i=@meta.com header.b="XqvpnDc5" Received: from pps.filterd (m0044010.ppops.net [127.0.0.1]) by mx0a-00082601.pphosted.com (8.18.1.11/8.18.1.11) with ESMTP id 64I3stsp388165; Mon, 18 May 2026 08:36:31 -0700 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=meta.com; h=cc :content-transfer-encoding:content-type:date:from:in-reply-to :message-id:mime-version:references:subject:to; s=s2048-2025-q2; bh=UQDbSj0a7z4WLbieDvJ38kqU5oMrF5F2mxE5NnAVQwI=; b=XqvpnDc5NpSF 9AxkQtI6412YZnzWB70P/WAuV5uJCg1+KJ6OpNBsB36K6SUy8K6FMgisJG/+H5dU 75z/CggPNy2VtGSdDElAnOxXZ3NzvocrhRcfU8xsZceU9+p8xR24duzKyCYE48+a p/cDaGX9LNM+4Z54+GuevWkRUSVNVE7E7mrf8sRVzprg9m0yyCK1ObtsSaD2zWtj IrT3/lthLG1FxuNrZ0Y/OIdbKWiTselmQwgVmLD8Hzx4G45Kc3GYv61FoWYfUeZp EmRRKKUKmlcRhfMPY70Fwr0t2fYK4YJ3ba62H5+OIQ4bD4i0Z2PYksHZ4BG4JloN uJXmE2nQ4Q== Received: from maileast.thefacebook.com ([163.114.135.16]) by mx0a-00082601.pphosted.com (PPS) with ESMTPS id 4e6kw122km-2 (version=TLSv1.2 cipher=ECDHE-RSA-AES128-GCM-SHA256 bits=128 verify=NOT); Mon, 18 May 2026 08:36:31 -0700 (PDT) Received: from localhost (2620:10d:c0a8:1c::11) by mail.thefacebook.com (2620:10d:c0a9:6f::8fd4) with Microsoft SMTP Server (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256) id 15.2.2562.37; Mon, 18 May 2026 15:36:29 +0000 
From: =?UTF-8?q?Cl=C3=A9ment=20L=C3=A9ger?= To: , Pavel Begunkov , "Jens Axboe" CC: =?UTF-8?q?Cl=C3=A9ment=20L=C3=A9ger?= , , , , , "David S. Miller" , Eric Dumazet , "Jakub Kicinski" , Paolo Abeni , Simon Horman , Jonathan Corbet , Shuah Khan , Vishwanath Seshagiri Subject: [PATCH v2 5/6] Documentation: networking: document zcrx notifications and statistics Date: Mon, 18 May 2026 08:35:28 -0700 Message-ID: <20260518153532.2835502-6-cleger@meta.com> X-Mailer: git-send-email 2.52.0 In-Reply-To: <20260518153532.2835502-1-cleger@meta.com> References: <20260518153532.2835502-1-cleger@meta.com> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: quoted-printable X-Proofpoint-ORIG-GUID: m9ASsOJN8Cqh3sLRQ9-XZTrCN21y2fVv X-Authority-Analysis: v=2.4 cv=P/MKQCAu c=1 sm=1 tr=0 ts=6a0b31ff cx=c_pps a=MfjaFnPeirRr97d5FC5oHw==:117 a=MfjaFnPeirRr97d5FC5oHw==:17 a=IkcTkHD0fZMA:10 a=NGcC8JguVDcA:10 a=M51BFTxLslgA:10 a=VkNPw1HP01LnGYTKEx00:22 a=7x6HtfJdh03M6CCDgxCd:22 a=8elwO82fXORLTBIkMd32:22 a=VabnemYjAAAA:8 a=8Z81dfnnkPiAomAV-AkA:9 a=3ZKOabzyN94A:10 a=QEXdDO2ut3YA:10 a=gKebqoRLp9LExxC7YDUY:22 X-Proofpoint-Spam-Details-Enc: AW1haW4tMjYwNTE4MDE1MyBTYWx0ZWRfXzV2nDNznSNGu rbwILdZhyGwuimyfBtnlzraTn2D38QsgP+lLXH+6GqGz1Vmp2U+YvGNCBa3nlwLv89suoXbqcjE OM4wgAcab3gUh0pg6SMIxGJV6bF5oAwdh3BokpIdRAXNrW06Zu9tlskM7pYomFGqSz5FtyJrAPm oVkhurKlVK1Cg6rMS2Ra4okQwPzBmsl+QtVrZkovjHVCMT14IJJdTqnSEcH0mWqyCJfDEtXHQWl +HtKJS6BCmlvK1V1YaaYbgZDna+77OeC+Mg0JcgOYvae7RuuWCsYA3VVqIiW17VqOYRj6lwSher m1GFXcscX6mnuoAJ/VIxiZhe6zO+DtYCaSUXiTx+PjOkjjFZy7BflEHRPEL4x4wWchBBVVwIvDM hkF4ZDM8kSzWXjrWPkb2/6SxOXwOhTCjGS5WS9IB/GMT1q/Vd1CEndj7b1kV67Tx43ndZCUUi/Q GcfszY8dEMtE6WwMrOA== X-Proofpoint-GUID: m9ASsOJN8Cqh3sLRQ9-XZTrCN21y2fVv X-Proofpoint-Virus-Version: vendor=baseguard engine=ICAP:2.0.293,Aquarius:18.0.1143,Hydra:6.1.51,FMLib:17.12.100.49 
definitions=2026-05-18_03,2026-05-18_01,2025-10-01_01 Document the zcrx notification system and shared-memory statistics that were introduced to let userspace monitor zero-copy receive health. The notification section covers the two notification types (ZCRX_NOTIF_NO_BUFFERS, ZCRX_NOTIF_COPY), registration via zcrx_notification_desc, and the fire-once / re-arm mechanism via ZCRX_CTRL_ARM_NOTIFICATION. The statistics section covers the optional shared-memory io_uring_zcrx_notif_stats structure placed in the refill ring region, including how to query its layout via IO_URING_QUERY_ZCRX_NOTIF. Signed-off-by: Cl=C3=A9ment L=C3=A9ger --- Documentation/networking/iou-zcrx.rst | 121 ++++++++++++++++++++++++++ 1 file changed, 121 insertions(+) diff --git a/Documentation/networking/iou-zcrx.rst b/Documentation/networki= ng/iou-zcrx.rst index 7f3f4b2e6cf2..442760a1ca03 100644 --- a/Documentation/networking/iou-zcrx.rst +++ b/Documentation/networking/iou-zcrx.rst @@ -196,6 +196,127 @@ Return buffers back to the kernel to be used again:: rqe->len =3D cqe->res; IO_URING_WRITE_ONCE(*refill_ring.ktail, ++refill_ring.rq_tail); =20 +Notifications +------------- + +When zero-copy receive encounters conditions that impact performance or +functionality, the kernel can notify userspace via dedicated CQE notificat= ions. +The application must register a notification descriptor during +``IORING_REGISTER_ZCRX_IFQ`` to receive them. Notifications are sent +individually and are not batched with other CQEs. Each notification CQE re= ports +a single notification in ``cqe->res``. + +Supported features can be detected by checking for ``ZCRX_FEATURE_NOTIFICA= TION`` +in the features bitmask returned by ``IO_URING_QUERY_ZCRX``. + +**Notification types** + +``ZCRX_NOTIF_NO_BUFFERS`` + Fired when the page pool fails to allocate because the zcrx buffer area = is + exhausted. + +``ZCRX_NOTIF_COPY`` + Fired when a received fragment could not be delivered zero-copy and was + instead copied into a buffer. 
+ +**Registering notifications** + +Allocate and fill a ``struct zcrx_notification_desc``:: + + struct zcrx_notification_desc notif =3D { + .user_data =3D MY_NOTIF_USER_DATA, + .type_mask =3D (1 << ZCRX_NOTIF_NO_BUFFERS) | (1 << ZCRX_NOTIF_COPY), + }; + + reg.notif_desc =3D (__u64)(unsigned long)&notif; + +``user_data`` is the value that will appear in the notification CQE's +``user_data`` field. ``type_mask`` selects which notification types the +application wants to receive. + +When a registered event occurs, the kernel posts a CQE with the specified +``user_data`` and ``cqe->res`` set to a bitmask of the triggered notificat= ion +types. + +**Rate limiting** + +Each notification type fires once until the application explicitly re-arms= it. +To re-arm, issue ``IORING_REGISTER_ZCRX_CTRL`` with +``ZCRX_CTRL_ARM_NOTIFICATION``:: + + struct zcrx_ctrl ctrl =3D { + .zcrx_id =3D zcrx_id, + .op =3D ZCRX_CTRL_ARM_NOTIFICATION, + .zc_arm_notif =3D { + .notif_type =3D ZCRX_NOTIF_NO_BUFFERS, + }, + }; + + io_uring_register(ring_fd, IORING_REGISTER_ZCRX_CTRL, &ctrl, 0); + +Only notification types that have previously fired can be re-armed. + +Notification statistics +----------------------- + +In addition to CQE-based notifications, the kernel can maintain a shared-m= emory +statistics structure that is updated on every relevant event. All stats are +updated regardless of which notification flags were registered. + +The statistics structure layout and alignment requirements can be queried = via +``IO_URING_QUERY_ZCRX_NOTIF``. 
The application must query the structure si= ze +and alignment requirements so that it allocates enough memory for the regi= on +to fit both the refill ring and the stats structure:: + + struct io_uring_query_zcrx_notif notif_query =3D {}; + struct io_uring_query_hdr hdr =3D { + .query_op =3D IO_URING_QUERY_ZCRX_NOTIF, + .size =3D sizeof(notif_query), + .query_data =3D (__u64)(unsigned long)&notif_query, + }; + + io_uring_register(ring_fd, IORING_REGISTER_QUERY, &hdr, 1); + + __u32 notif_stats_size =3D notif_query.notif_stats_size; + __u32 notif_stats_off_alignment =3D notif_query.notif_stats_off_alignmen= t; + +To enable statistics, place the stats structure after the refill ring entr= ies +within the same mapped region, and set the ``ZCRX_NOTIF_DESC_FLAG_STATS`` = flag +in the notification descriptor:: + + /* Compute offset for the stats struct (after refill ring entries) */ + size_t stats_offset =3D ALIGN_UP(ring_size, notif_stats_off_alignment); + ring_size =3D stats_offset + notif_stats_size; + ring_size =3D ALIGN_UP(ring_size, PAGE_SIZE); + + /* Map the region with the extra space */ + ring_ptr =3D mmap(NULL, ring_size, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, 0, 0); + + struct zcrx_notification_desc notif =3D { + .user_data =3D MY_NOTIF_USER_DATA, + .type_mask =3D 1 << ZCRX_NOTIF_COPY, + .flags =3D ZCRX_NOTIF_DESC_FLAG_STATS, + .stats_offset =3D stats_offset, + }; + +The ``stats_offset`` must satisfy the alignment reported by +``notif_stats_off_alignment`` and must point to a location within the mapp= ed +region that does not overlap with the refill ring header or entries. 
+ +Applications can read the stat counters at any time:: + + volatile struct io_uring_zcrx_notif_stats *stats =3D + (void *)((char *)ring_ptr + stats_offset); + + printf("copy fallbacks: %llu (%llu bytes)\n", + IO_URING_READ_ONCE(stats->copy_count), + IO_URING_READ_ONCE(stats->copy_bytes)); + +``copy_count`` is incremented each time a fragment is copied instead of be= ing +delivered via zero-copy. ``copy_bytes`` accumulates the total number of by= tes +copied. + Area chunking ------------- =20 --=20 2.53.0-Meta From nobody Mon May 25 05:12:14 2026 Received: from mx0a-00082601.pphosted.com (mx0a-00082601.pphosted.com [67.231.145.42]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 364913FE348; Mon, 18 May 2026 15:36:45 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=67.231.145.42 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1779118607; cv=none; b=ivEJwlmfwMNL0bg2dji9H3ppnHq46RkkmKwzYRMq2gnZupCZJpIIkt6VQ/ynlkup6YNQFNJIu0LGYAd1tlzU2hHCcc6fI1rT2KhFtEKfGfEjLZdC6gd7C4nJidre9SEMe8yYo8M+QFbaEsAJkLbMennrcz6S4s+MqVjJ3E9iVFY= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1779118607; c=relaxed/simple; bh=5cj/nKuEQelBP0dSJBoOWrIRfhWzeqqdXejYwbGQ5VA=; h=From:To:CC:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version:Content-Type; b=kyzE9PHbsHcxbIhKoAdSVM6obaJqNV90+ChXgMEMpRyNrsz5aEsEfL6658uRz9OruvoN2pIWTUl2U8GgO2Vm01MR1xJE8apDbgLj43ni3RETll48iMJywoNVH+Uf3wKJwGrYKN7dp1XCgBmbN4nAvoO0TJ4o/Og9VsgEC+sdlvw= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=reject dis=none) header.from=meta.com; spf=pass smtp.mailfrom=meta.com; dkim=pass (2048-bit key) header.d=meta.com header.i=@meta.com header.b=IXCis9eZ; arc=none smtp.client-ip=67.231.145.42 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=reject dis=none) header.from=meta.com 
Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=meta.com Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=meta.com header.i=@meta.com header.b="IXCis9eZ" Received: from pps.filterd (m0109334.ppops.net [127.0.0.1]) by mx0a-00082601.pphosted.com (8.18.1.11/8.18.1.11) with ESMTP id 64I0kgwq791911; Mon, 18 May 2026 08:36:36 -0700 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=meta.com; h=cc :content-transfer-encoding:content-type:date:from:in-reply-to :message-id:mime-version:references:subject:to; s=s2048-2025-q2; bh=6n9ydSJJfJR/OQGl/6tWThkMyR4vOGg98Mrp63aOd+8=; b=IXCis9eZoiYg bhDS/ZkY0NdeyAoCimnOVWRrbjrWS9l+dMx+Cy0QtsJakWR4FoZwDhIDyUtQlY12 ukr0KpEFMv0BYCzyEQoVeag2jxTuXBTkZWn8LoOg6N/GlxzAt/xOVyAL9kReGhVF by+UK02RSNeBjM9gRdbqLpwSinIps5wwHNNhcdGdYyZU7zrb3lnclSmGd2Ttpbs2 vNgupbJMt64yrLnjwJ+B8NmISx3wblqs61Ev7NnW3V09N/qz0jjKAj+hWDHIJGol +LxeEFRIY3zZg92lwyP/dS4Pd636/xLpjzLo5PPBDcwnu2RAeZZHGR5z63gWt4Hp lqKUACAAyA== Received: from maileast.thefacebook.com ([163.114.135.16]) by mx0a-00082601.pphosted.com (PPS) with ESMTPS id 4e6qvg9ayq-2 (version=TLSv1.2 cipher=ECDHE-RSA-AES128-GCM-SHA256 bits=128 verify=NOT); Mon, 18 May 2026 08:36:35 -0700 (PDT) Received: from localhost (2620:10d:c0a8:1b::8e35) by mail.thefacebook.com (2620:10d:c0a9:6f::8fd4) with Microsoft SMTP Server (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256) id 15.2.2562.37; Mon, 18 May 2026 15:36:33 +0000 From: =?UTF-8?q?Cl=C3=A9ment=20L=C3=A9ger?= To: , Pavel Begunkov , "Jens Axboe" CC: =?UTF-8?q?Cl=C3=A9ment=20L=C3=A9ger?= , , , , , "David S. 
Miller" , Eric Dumazet , "Jakub Kicinski" , Paolo Abeni , Simon Horman , Jonathan Corbet , Shuah Khan , Vishwanath Seshagiri Subject: [PATCH v2 6/6] selftests: iou-zcrx: add notification and stats test for zcrx Date: Mon, 18 May 2026 08:35:29 -0700 Message-ID: <20260518153532.2835502-7-cleger@meta.com> X-Mailer: git-send-email 2.52.0 In-Reply-To: <20260518153532.2835502-1-cleger@meta.com> References: <20260518153532.2835502-1-cleger@meta.com> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: quoted-printable X-Proofpoint-Spam-Details-Enc: AW1haW4tMjYwNTE4MDE1MyBTYWx0ZWRfX8CvtpYdhUQVJ C5SgL/5Azn2posfhXkO1HuE07ZZjrAAGc6dECLSz23gXMBl+LvGnmHO/UCJ8fS1vwX7jeZ8YCiI hnhTbSF6t7tuSzN5y2QiXIOO23o3zdbXI/tTVIr4gsJmHBRrC48XBQlbGzUzENwXCA+TO+RqFIk cYvYFBCh3X2ZppxhvSy4if+7EPdz/NDwyn4Mdk8pP880LV3dDGlqWTma7KyS+xlVhvHPMJjIEdf cyVmSpUXlZXlT7g7w1d/28fy39seA2ZZPRIZnmUUMVnjnTfHK0wNuypDUD/qtTCE6FnV2YegYxA 9TzxoAuqxhw8K65lGRn3BaFliyY9TKGF3PP5nmrfH2zoRdukREcO9njG8M6xBDfYAwu3z+flQEp IankTWidyfvUjssqkHHiXhmaiLv9/b5dVfTSk52OUaz1wUu1LkUokGBZDhlXbAbcDwE9iTtWgSA Gy/JppISEbQCBvrS92g== X-Authority-Analysis: v=2.4 cv=LpqiDHdc c=1 sm=1 tr=0 ts=6a0b3203 cx=c_pps a=MfjaFnPeirRr97d5FC5oHw==:117 a=MfjaFnPeirRr97d5FC5oHw==:17 a=IkcTkHD0fZMA:10 a=NGcC8JguVDcA:10 a=M51BFTxLslgA:10 a=VkNPw1HP01LnGYTKEx00:22 a=7x6HtfJdh03M6CCDgxCd:22 a=crHB47gyY4rKiduisYu9:22 a=VabnemYjAAAA:8 a=04ZbsbsXR34X7E8LsGsA:9 a=3ZKOabzyN94A:10 a=QEXdDO2ut3YA:10 a=gKebqoRLp9LExxC7YDUY:22 X-Proofpoint-ORIG-GUID: 2Y9rrR_-riIGNffU-d7yP2m2fcrUodF3 X-Proofpoint-GUID: 2Y9rrR_-riIGNffU-d7yP2m2fcrUodF3 X-Proofpoint-Virus-Version: vendor=baseguard engine=ICAP:2.0.293,Aquarius:18.0.1143,Hydra:6.1.51,FMLib:17.12.100.49 definitions=2026-05-18_03,2026-05-18_01,2025-10-01_01 Add a selftest to verify that ZCRX notification are properly delivered to userspace and that the shared-memory notification stats 
(copy_count, copy_bytes) are correctly incremented when zero-copy RX falls back to copying or when it runs out of buffers. The test registers a notification descriptor during IORING_REGISTER_ZCRX_IFQ with a stats region placed after the refill queue entries. A new -n flag verifies that the copy fallback is triggered and the -b/-a flags allow checking for the out-of-buffer notification. To reliably trigger copy fallback, the Python test uses a new single_no_flow() setup variant that configures tcp-data-split and RSS but without an ethtool flow rule. Without flow steering, traffic arrives on non-zcrx queues as regular pages, forcing the kernel copy-fallback path in io_zcrx_copy_frag(). Out-of-buffer notification is verified by using a smaller receive area and by avoiding recycling the buffers so that the kernel runs out of buffers quickly. Signed-off-by: Cl=C3=A9ment L=C3=A9ger --- .../selftests/drivers/net/hw/iou-zcrx.c | 114 ++++++++++++++++-- .../selftests/drivers/net/hw/iou-zcrx.py | 49 +++++++- 2 files changed, 151 insertions(+), 12 deletions(-) diff --git a/tools/testing/selftests/drivers/net/hw/iou-zcrx.c b/tools/test= ing/selftests/drivers/net/hw/iou-zcrx.c index 240d13dbc54e..78a43ede77ed 100644 --- a/tools/testing/selftests/drivers/net/hw/iou-zcrx.c +++ b/tools/testing/selftests/drivers/net/hw/iou-zcrx.c @@ -52,7 +52,27 @@ struct t_io_uring_zcrx_ifq_reg { struct io_uring_zcrx_offsets offsets; __u32 zcrx_id; __u32 rx_buf_len; - __u64 __resv[3]; + __u64 notif_desc; + __u64 __resv[2]; +}; + +#define ZCRX_NOTIF_NO_BUFFERS 0 +#define ZCRX_NOTIF_COPY 1 +#define ZCRX_NOTIF_DESC_FLAG_STATS (1 << 0) + +#define NOTIF_USER_DATA 3 + +struct t_zcrx_notification_desc { + __u64 user_data; + __u32 type_mask; + __u32 flags; + __u64 stats_offset; + __u64 __resv2[9]; +}; + +struct t_io_uring_zcrx_notif_stats { + __u64 copy_count; + __u64 copy_bytes; }; =20 static long page_size; @@ -84,7 +104,10 @@ static int cfg_oneshot_recvs; static int cfg_send_size =3D SEND_SIZE; static struct 
sockaddr_in6 cfg_addr; static unsigned int cfg_rx_buf_len; +static size_t cfg_area_size; static bool cfg_dry_run; +static bool cfg_copy_fallback; +static bool cfg_no_buffers; =20 static char *payload; static void *area_ptr; @@ -95,6 +118,9 @@ static unsigned long area_token; static int connfd; static bool stop; static size_t received; +static unsigned int received_notif_type; +static bool received_notif; +static size_t notif_stats_offset; =20 static unsigned long gettimeofday_ms(void) { @@ -142,6 +168,7 @@ static void setup_zcrx(struct io_uring *ring) { unsigned int ifindex; unsigned int rq_entries =3D 4096; + size_t area_size =3D cfg_area_size ? cfg_area_size : AREA_SIZE; int ret; =20 ifindex =3D if_nametoindex(cfg_ifname); @@ -150,7 +177,7 @@ static void setup_zcrx(struct io_uring *ring) =20 if (cfg_rx_buf_len && cfg_rx_buf_len !=3D page_size) { area_ptr =3D mmap(NULL, - AREA_SIZE, + area_size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE | MAP_HUGETLB | MAP_HUGE_2MB, @@ -162,7 +189,7 @@ static void setup_zcrx(struct io_uring *ring) } } else { area_ptr =3D mmap(NULL, - AREA_SIZE, + area_size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, 0, @@ -172,6 +199,12 @@ static void setup_zcrx(struct io_uring *ring) } =20 ring_size =3D get_refill_ring_size(rq_entries); + + if (cfg_copy_fallback) { + notif_stats_offset =3D ring_size; + ring_size +=3D ALIGN_UP(sizeof(struct t_io_uring_zcrx_notif_stats), page= _size); + } + ring_ptr =3D mmap(NULL, ring_size, PROT_READ | PROT_WRITE, @@ -187,10 +220,11 @@ static void setup_zcrx(struct io_uring *ring) =20 struct io_uring_zcrx_area_reg area_reg =3D { .addr =3D (__u64)(unsigned long)area_ptr, - .len =3D AREA_SIZE, + .len =3D area_size, .flags =3D 0, }; =20 + struct t_zcrx_notification_desc notif_desc; struct t_io_uring_zcrx_ifq_reg reg =3D { .if_idx =3D ifindex, .if_rxq =3D cfg_queue_id, @@ -200,11 +234,32 @@ static void setup_zcrx(struct io_uring *ring) .rx_buf_len =3D cfg_rx_buf_len, }; =20 + if (cfg_copy_fallback 
|| cfg_no_buffers) { + __u32 type_mask =3D 0; + + if (cfg_copy_fallback) + type_mask =3D 1 << ZCRX_NOTIF_COPY; + if (cfg_no_buffers) + type_mask =3D 1 << ZCRX_NOTIF_NO_BUFFERS; + + memset(&notif_desc, 0, sizeof(notif_desc)); + notif_desc.user_data =3D NOTIF_USER_DATA; + notif_desc.type_mask =3D type_mask; + if (cfg_copy_fallback) { + notif_desc.flags =3D ZCRX_NOTIF_DESC_FLAG_STATS; + notif_desc.stats_offset =3D notif_stats_offset; + } + reg.notif_desc =3D (__u64)(unsigned long)&notif_desc; + } + ret =3D io_uring_register_ifq(ring, (void *)&reg); if (cfg_rx_buf_len && (ret =3D=3D -EINVAL || ret =3D=3D -EOPNOTSUPP || ret =3D=3D -ERANGE)) { printf("Large chunks are not supported %i\n", ret); exit(SKIP_CODE); + } else if ((cfg_copy_fallback || cfg_no_buffers) && ret =3D=3D -EINVAL) { + printf("Notifications not supported %i\n", ret); + exit(SKIP_CODE); } else if (ret) { error(1, 0, "io_uring_register_ifq(): %d", ret); } @@ -304,10 +359,13 @@ static void process_recvzc(struct io_uring *ring, str= uct io_uring_cqe *cqe) } received +=3D n; =20 - rqe =3D &rq_ring.rqes[(rq_ring.rq_tail & rq_mask)]; - rqe->off =3D (rcqe->off & ~IORING_ZCRX_AREA_MASK) | area_token; - rqe->len =3D cqe->res; - io_uring_smp_store_release(rq_ring.ktail, ++rq_ring.rq_tail); + /* Skip ring refill so that we run out of buffers quickly */ + if (!cfg_no_buffers) { + rqe =3D &rq_ring.rqes[(rq_ring.rq_tail & rq_mask)]; + rqe->off =3D (rcqe->off & ~IORING_ZCRX_AREA_MASK) | area_token; + rqe->len =3D cqe->res; + io_uring_smp_store_release(rq_ring.ktail, ++rq_ring.rq_tail); + } } =20 static void server_loop(struct io_uring *ring) @@ -324,8 +382,16 @@ static void server_loop(struct io_uring *ring) process_accept(ring, cqe); else if (cqe->user_data =3D=3D 2) process_recvzc(ring, cqe); - else + else if ((cfg_copy_fallback || cfg_no_buffers) && + cqe->user_data =3D=3D NOTIF_USER_DATA) { + received_notif_type |=3D cqe->res; + received_notif =3D true; + if (cfg_no_buffers && + (cqe->res =3D=3D ZCRX_NOTIF_NO_BUFFERS)) + 
stop =3D true; + } else { error(1, 0, "unknown cqe"); + } count++; } io_uring_cq_advance(ring, count); @@ -374,6 +440,23 @@ static void run_server(void) =20 if (!stop) error(1, 0, "test failed\n"); + + if (cfg_copy_fallback) { + struct t_io_uring_zcrx_notif_stats *stats =3D + (void *)((char *)ring_ptr + notif_stats_offset); + + if (!received_notif || received_notif_type !=3D ZCRX_NOTIF_COPY) + error(1, 0, "expected copy fallback notification"); + if (!IO_URING_READ_ONCE(stats->copy_count)) + error(1, 0, "expected copy_count > 0"); + if (!IO_URING_READ_ONCE(stats->copy_bytes)) + error(1, 0, "expected copy_bytes > 0"); + } + + if (cfg_no_buffers) { + if (!received_notif || received_notif_type !=3D ZCRX_NOTIF_NO_BUFFERS) + error(1, 0, "expected no-buffers notification"); + } } =20 static void run_client(void) @@ -425,7 +508,7 @@ static void parse_opts(int argc, char **argv) usage(argv[0]); cfg_payload_len =3D max_payload_len; =20 - while ((c =3D getopt(argc, argv, "sch:p:l:i:q:o:z:x:d")) !=3D -1) { + while ((c =3D getopt(argc, argv, "sch:p:l:i:q:o:z:x:a:dnb")) !=3D -1) { switch (c) { case 's': if (cfg_client) @@ -466,8 +549,19 @@ static void parse_opts(int argc, char **argv) case 'd': cfg_dry_run =3D true; break; + case 'n': + cfg_copy_fallback =3D true; + break; + case 'b': + cfg_no_buffers =3D true; + break; + case 'a': + cfg_area_size =3D strtoul(optarg, NULL, 0) * page_size; + break; } } + if (cfg_copy_fallback && cfg_no_buffers) + error(1, 0, "Pass one of -n or -b"); =20 if (cfg_server && addr) error(1, 0, "Receiver cannot have -h specified"); diff --git a/tools/testing/selftests/drivers/net/hw/iou-zcrx.py b/tools/tes= ting/selftests/drivers/net/hw/iou-zcrx.py index e81724cb5542..82b4f4777182 100755 --- a/tools/testing/selftests/drivers/net/hw/iou-zcrx.py +++ b/tools/testing/selftests/drivers/net/hw/iou-zcrx.py @@ -41,7 +41,9 @@ def set_flow_rule_rss(cfg, rss_ctx_id): return int(values) =20 =20 -def single(cfg): +def single_no_flow(cfg): + """Like single() but 
without a flow rule.""" + channels =3D cfg.ethnl.channels_get({'header': {'dev-index': cfg.ifind= ex}}) channels =3D channels['combined-count'] if channels < 2: @@ -65,6 +67,9 @@ def single(cfg): ethtool(f"-X {cfg.ifname} equal {cfg.target}") defer(ethtool, f"-X {cfg.ifname} default") =20 +def single(cfg): + single_no_flow(cfg) + flow_rule_id =3D set_flow_rule(cfg) defer(ethtool, f"-N {cfg.ifname} delete {flow_rule_id}") =20 @@ -130,6 +135,26 @@ def test_zcrx_oneshot(cfg, setup) -> None: cmd(tx_cmd, host=3Dcfg.remote) =20 =20 +@ksft_variants([ + KsftNamedVariant("single", single_no_flow), +]) +def test_zcrx_notif_copy_fallback(cfg, setup) -> None: + """Test zcrx copy fallback notification. + + Omits the flow rule so traffic arrives on non-zcrx queues as regular + pages, forcing the kernel copy-fallback path. Asserts that the + ZCRX_NOTIF_COPY notification CQE is delivered.""" + + cfg.require_ipver('6') + + setup(cfg) + rx_cmd =3D f"{cfg.bin_local} -s -p {cfg.port} -i {cfg.ifname} -q {cfg.= target} -n" + tx_cmd =3D f"{cfg.bin_remote} -c -h {cfg.addr_v['6']} -p {cfg.port} -l= 12840" + with bkg(rx_cmd, exit_wait=3DTrue): + wait_port_listen(cfg.port, proto=3D"tcp") + cmd(tx_cmd, host=3Dcfg.remote) + + def test_zcrx_large_chunks(cfg) -> None: """Test zcrx with large buffer chunks.""" =20 @@ -157,6 +182,25 @@ def test_zcrx_large_chunks(cfg) -> None: cmd(tx_cmd, host=3Dcfg.remote) =20 =20 +@ksft_variants([ + KsftNamedVariant("single", single), +]) +def test_zcrx_notif_no_buffers(cfg, setup) -> None: + """Test zcrx out-of-buffer notification. 
+ + Skips buffer refill so the pool is quickly exhausted, triggering + a ZCRX_NOTIF_NO_BUFFERS notification CQE.""" + + cfg.require_ipver('6') + + setup(cfg) + rx_cmd =3D f"{cfg.bin_local} -s -p {cfg.port} -i {cfg.ifname} -q {cfg.= target} -b -a 64" + tx_cmd =3D f"{cfg.bin_remote} -c -h {cfg.addr_v['6']} -p {cfg.port} -l= 12840" + with bkg(rx_cmd, exit_wait=3DTrue): + wait_port_listen(cfg.port, proto=3D"tcp") + cmd(tx_cmd, host=3Dcfg.remote, fail=3DFalse) + + def main() -> None: with NetDrvEpEnv(__file__) as cfg: cfg.bin_local =3D path.abspath(path.dirname(__file__) + "/../../..= /drivers/net/hw/iou-zcrx") @@ -166,7 +210,8 @@ def main() -> None: cfg.netnl =3D NetdevFamily() cfg.port =3D rand_port() ksft_run(globs=3Dglobals(), cases=3D[test_zcrx, test_zcrx_oneshot, - test_zcrx_large_chunks], args=3D(c= fg, )) + test_zcrx_large_chunks, test_zcrx_= notif_copy_fallback, + test_zcrx_notif_no_buffers], args= =3D(cfg, )) ksft_exit() =20 =20 --=20 2.53.0-Meta