include/linux/sunrpc/svcsock.h | 3 +++ net/sunrpc/svcsock.c | 55 ++++++++++++++++++++++++++++++++++++------ 2 files changed, 51 insertions(+), 7 deletions(-)
svc_tcp_sendmsg() calls xdr_buf_to_bvec() with the second slot of
rq_bvec as the start, but doesn't reduce the array length by one, which
could lead to an array overrun. Also, rq_bvec is always rq_maxpages in
length, which can be too short in some cases, since the TCP record
marker consumes a slot.
Fix both problems by adding a separate bvec array to the svc_sock that
is specifically for sending. For TCP, make this array one slot longer
than rq_maxpages, to account for the record marker. For UDP, only
allocate as large an array as we need since it's limited to 64k of
payload.
Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
Another update based on Neil's feedback. This version allocates the
array when the socket is allocated, and fixes the UDP length
calculation.
---
Changes in v7:
- use RPCSVC_MAXPAYLOAD_UDP instead of assuming max of 64kb
- Link to v6: https://lore.kernel.org/r/20251013-rq_bvec-v6-1-17982fc64ad2@kernel.org
Changes in v6:
- allocate sk_bvec in ->xpo_create
- fix the array-length calculation for UDP
- Link to v5: https://lore.kernel.org/r/20251010-rq_bvec-v5-1-44976250199d@kernel.org
Changes in v5:
- reduce the size of sk_bvec on UDP sockets
- Link to v4: https://lore.kernel.org/r/20251010-rq_bvec-v4-1-627567f1ce91@kernel.org
Changes in v4:
- switch to allocating a separate bvec for sends in the svc_sock
- Link to v3: https://lore.kernel.org/r/20251009-rq_bvec-v3-0-57181360b9cb@kernel.org
Changes in v3:
- Add rq_bvec_len field and use it in appropriate places
- Link to v2: https://lore.kernel.org/r/20251008-rq_bvec-v2-0-823c0a85a27c@kernel.org
Changes in v2:
- Better changelog message for patch #2
- Link to v1: https://lore.kernel.org/r/20251008-rq_bvec-v1-0-7f23d32d75e5@kernel.org
---
include/linux/sunrpc/svcsock.h | 3 +++
net/sunrpc/svcsock.c | 55 ++++++++++++++++++++++++++++++++++++------
2 files changed, 51 insertions(+), 7 deletions(-)
diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/svcsock.h
index 963bbe251e52109a902f6b9097b6e9c3c23b1fd8..de37069aba90899be19b1090e6e90e509a3cf530 100644
--- a/include/linux/sunrpc/svcsock.h
+++ b/include/linux/sunrpc/svcsock.h
@@ -26,6 +26,9 @@ struct svc_sock {
void (*sk_odata)(struct sock *);
void (*sk_owspace)(struct sock *);
+ /* For sends (protected by xpt_mutex) */
+ struct bio_vec *sk_bvec;
+
/* private TCP part */
/* On-the-wire fragment header: */
__be32 sk_marker;
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 0cb9c4d457453b26db29f08985b056c3f8d59447..93de79020a2d859a9c0b9464d794c167531d701d 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -68,6 +68,17 @@
#define RPCDBG_FACILITY RPCDBG_SVCXPRT
+/*
+ * For UDP:
+ * 1 for header page
+ * enough pages for RPCSVC_MAXPAYLOAD_UDP
+ * 1 in case payload is not aligned
+ * 1 for tail page
+ */
+enum {
+ SUNRPC_MAX_UDP_SENDPAGES = 1 + RPCSVC_MAXPAYLOAD_UDP / PAGE_SIZE + 1 + 1
+};
+
/* To-do: to avoid tying up an nfsd thread while waiting for a
* handshake request, the request could instead be deferred.
*/
@@ -740,14 +751,14 @@ static int svc_udp_sendto(struct svc_rqst *rqstp)
if (svc_xprt_is_dead(xprt))
goto out_notconn;
- count = xdr_buf_to_bvec(rqstp->rq_bvec, rqstp->rq_maxpages, xdr);
+ count = xdr_buf_to_bvec(svsk->sk_bvec, SUNRPC_MAX_UDP_SENDPAGES, xdr);
- iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, rqstp->rq_bvec,
+ iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, svsk->sk_bvec,
count, rqstp->rq_res.len);
err = sock_sendmsg(svsk->sk_sock, &msg);
if (err == -ECONNREFUSED) {
/* ICMP error on earlier request. */
- iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, rqstp->rq_bvec,
+ iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, svsk->sk_bvec,
count, rqstp->rq_res.len);
err = sock_sendmsg(svsk->sk_sock, &msg);
}
@@ -1236,19 +1247,19 @@ static int svc_tcp_sendmsg(struct svc_sock *svsk, struct svc_rqst *rqstp,
int ret;
/* The stream record marker is copied into a temporary page
- * fragment buffer so that it can be included in rq_bvec.
+ * fragment buffer so that it can be included in sk_bvec.
*/
buf = page_frag_alloc(&svsk->sk_frag_cache, sizeof(marker),
GFP_KERNEL);
if (!buf)
return -ENOMEM;
memcpy(buf, &marker, sizeof(marker));
- bvec_set_virt(rqstp->rq_bvec, buf, sizeof(marker));
+ bvec_set_virt(svsk->sk_bvec, buf, sizeof(marker));
- count = xdr_buf_to_bvec(rqstp->rq_bvec + 1, rqstp->rq_maxpages,
+ count = xdr_buf_to_bvec(svsk->sk_bvec + 1, rqstp->rq_maxpages,
&rqstp->rq_res);
- iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, rqstp->rq_bvec,
+ iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, svsk->sk_bvec,
1 + count, sizeof(marker) + rqstp->rq_res.len);
ret = sock_sendmsg(svsk->sk_sock, &msg);
page_frag_free(buf);
@@ -1393,6 +1404,20 @@ void svc_sock_update_bufs(struct svc_serv *serv)
spin_unlock_bh(&serv->sv_lock);
}
+static int svc_sock_sendpages(struct svc_serv *serv, struct socket *sock, int flags)
+{
+ switch (sock->type) {
+ case SOCK_STREAM:
+ /* +1 for TCP record marker */
+ if (flags & SVC_SOCK_TEMPORARY)
+ return svc_serv_maxpages(serv) + 1;
+ return 0;
+ case SOCK_DGRAM:
+ return SUNRPC_MAX_UDP_SENDPAGES;
+ }
+ return -EINVAL;
+}
+
/*
* Initialize socket for RPC use and create svc_sock struct
*/
@@ -1403,12 +1428,26 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
struct svc_sock *svsk;
struct sock *inet;
int pmap_register = !(flags & SVC_SOCK_ANONYMOUS);
+ int sendpages;
unsigned long pages;
+ sendpages = svc_sock_sendpages(serv, sock, flags);
+ if (sendpages < 0)
+ return ERR_PTR(sendpages);
+
pages = svc_serv_maxpages(serv);
svsk = kzalloc(struct_size(svsk, sk_pages, pages), GFP_KERNEL);
if (!svsk)
return ERR_PTR(-ENOMEM);
+
+ if (sendpages) {
+ svsk->sk_bvec = kcalloc(sendpages, sizeof(*svsk->sk_bvec), GFP_KERNEL);
+ if (!svsk->sk_bvec) {
+ kfree(svsk);
+ return ERR_PTR(-ENOMEM);
+ }
+ }
+
svsk->sk_maxpages = pages;
inet = sock->sk;
@@ -1420,6 +1459,7 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
inet->sk_protocol,
ntohs(inet_sk(inet)->inet_sport));
if (err < 0) {
+ kfree(svsk->sk_bvec);
kfree(svsk);
return ERR_PTR(err);
}
@@ -1637,5 +1677,6 @@ static void svc_sock_free(struct svc_xprt *xprt)
sock_release(sock);
page_frag_cache_drain(&svsk->sk_frag_cache);
+ kfree(svsk->sk_bvec);
kfree(svsk);
}
---
base-commit: 05d2192090744b16ce05bd221c459a9357c17525
change-id: 20251008-rq_bvec-b66afd0fdbbb
Best regards,
--
Jeff Layton <jlayton@kernel.org>
On Tue, 14 Oct 2025, Jeff Layton wrote:
> svc_tcp_sendmsg() calls xdr_buf_to_bvec() with the second slot of
> rq_bvec as the start, but doesn't reduce the array length by one, which
> could lead to an array overrun. Also, rq_bvec is always rq_maxpages in
> length, which can be too short in some cases, since the TCP record
> marker consumes a slot.
>
> Fix both problems by adding a separate bvec array to the svc_sock that
> is specifically for sending. For TCP, make this array one slot longer
> than rq_maxpages, to account for the record marker. For UDP, only
> allocate as large an array as we need since it's limited to 64k of
> payload.
>
> Signed-off-by: Jeff Layton <jlayton@kernel.org>
> ---
> Another update based on Neil's feedback. This version allocates the
> array when the socket is allocated, and fixes the UDP length
> calculation.
Looks good - thanks.
Reviewed-by: NeilBrown <neil@brown.name>
NeilBrown
> ---
> Changes in v7:
> - use RPCSVC_MAXPAYLOAD_UDP instead of assuming max of 64kb
> - Link to v6: https://lore.kernel.org/r/20251013-rq_bvec-v6-1-17982fc64ad2@kernel.org
>
> Changes in v6:
> - allocate sk_bvec in ->xpo_create
> - fix the array-length calculation for UDP
> - Link to v5: https://lore.kernel.org/r/20251010-rq_bvec-v5-1-44976250199d@kernel.org
>
> Changes in v5:
> - reduce the size of sk_bvec on UDP sockets
> - Link to v4: https://lore.kernel.org/r/20251010-rq_bvec-v4-1-627567f1ce91@kernel.org
>
> Changes in v4:
> - switch to allocating a separate bvec for sends in the svc_sock
> - Link to v3: https://lore.kernel.org/r/20251009-rq_bvec-v3-0-57181360b9cb@kernel.org
>
> Changes in v3:
> - Add rq_bvec_len field and use it in appropriate places
> - Link to v2: https://lore.kernel.org/r/20251008-rq_bvec-v2-0-823c0a85a27c@kernel.org
>
> Changes in v2:
> - Better changelog message for patch #2
> - Link to v1: https://lore.kernel.org/r/20251008-rq_bvec-v1-0-7f23d32d75e5@kernel.org
> ---
> include/linux/sunrpc/svcsock.h | 3 +++
> net/sunrpc/svcsock.c | 55 ++++++++++++++++++++++++++++++++++++------
> 2 files changed, 51 insertions(+), 7 deletions(-)
>
> diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/svcsock.h
> index 963bbe251e52109a902f6b9097b6e9c3c23b1fd8..de37069aba90899be19b1090e6e90e509a3cf530 100644
> --- a/include/linux/sunrpc/svcsock.h
> +++ b/include/linux/sunrpc/svcsock.h
> @@ -26,6 +26,9 @@ struct svc_sock {
> void (*sk_odata)(struct sock *);
> void (*sk_owspace)(struct sock *);
>
> + /* For sends (protected by xpt_mutex) */
> + struct bio_vec *sk_bvec;
> +
> /* private TCP part */
> /* On-the-wire fragment header: */
> __be32 sk_marker;
> diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
> index 0cb9c4d457453b26db29f08985b056c3f8d59447..93de79020a2d859a9c0b9464d794c167531d701d 100644
> --- a/net/sunrpc/svcsock.c
> +++ b/net/sunrpc/svcsock.c
> @@ -68,6 +68,17 @@
>
> #define RPCDBG_FACILITY RPCDBG_SVCXPRT
>
> +/*
> + * For UDP:
> + * 1 for header page
> + * enough pages for RPCSVC_MAXPAYLOAD_UDP
> + * 1 in case payload is not aligned
> + * 1 for tail page
> + */
> +enum {
> + SUNRPC_MAX_UDP_SENDPAGES = 1 + RPCSVC_MAXPAYLOAD_UDP / PAGE_SIZE + 1 + 1
> +};
> +
> /* To-do: to avoid tying up an nfsd thread while waiting for a
> * handshake request, the request could instead be deferred.
> */
> @@ -740,14 +751,14 @@ static int svc_udp_sendto(struct svc_rqst *rqstp)
> if (svc_xprt_is_dead(xprt))
> goto out_notconn;
>
> - count = xdr_buf_to_bvec(rqstp->rq_bvec, rqstp->rq_maxpages, xdr);
> + count = xdr_buf_to_bvec(svsk->sk_bvec, SUNRPC_MAX_UDP_SENDPAGES, xdr);
>
> - iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, rqstp->rq_bvec,
> + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, svsk->sk_bvec,
> count, rqstp->rq_res.len);
> err = sock_sendmsg(svsk->sk_sock, &msg);
> if (err == -ECONNREFUSED) {
> /* ICMP error on earlier request. */
> - iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, rqstp->rq_bvec,
> + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, svsk->sk_bvec,
> count, rqstp->rq_res.len);
> err = sock_sendmsg(svsk->sk_sock, &msg);
> }
> @@ -1236,19 +1247,19 @@ static int svc_tcp_sendmsg(struct svc_sock *svsk, struct svc_rqst *rqstp,
> int ret;
>
> /* The stream record marker is copied into a temporary page
> - * fragment buffer so that it can be included in rq_bvec.
> + * fragment buffer so that it can be included in sk_bvec.
> */
> buf = page_frag_alloc(&svsk->sk_frag_cache, sizeof(marker),
> GFP_KERNEL);
> if (!buf)
> return -ENOMEM;
> memcpy(buf, &marker, sizeof(marker));
> - bvec_set_virt(rqstp->rq_bvec, buf, sizeof(marker));
> + bvec_set_virt(svsk->sk_bvec, buf, sizeof(marker));
>
> - count = xdr_buf_to_bvec(rqstp->rq_bvec + 1, rqstp->rq_maxpages,
> + count = xdr_buf_to_bvec(svsk->sk_bvec + 1, rqstp->rq_maxpages,
> &rqstp->rq_res);
>
> - iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, rqstp->rq_bvec,
> + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, svsk->sk_bvec,
> 1 + count, sizeof(marker) + rqstp->rq_res.len);
> ret = sock_sendmsg(svsk->sk_sock, &msg);
> page_frag_free(buf);
> @@ -1393,6 +1404,20 @@ void svc_sock_update_bufs(struct svc_serv *serv)
> spin_unlock_bh(&serv->sv_lock);
> }
>
> +static int svc_sock_sendpages(struct svc_serv *serv, struct socket *sock, int flags)
> +{
> + switch (sock->type) {
> + case SOCK_STREAM:
> + /* +1 for TCP record marker */
> + if (flags & SVC_SOCK_TEMPORARY)
> + return svc_serv_maxpages(serv) + 1;
> + return 0;
> + case SOCK_DGRAM:
> + return SUNRPC_MAX_UDP_SENDPAGES;
> + }
> + return -EINVAL;
> +}
> +
> /*
> * Initialize socket for RPC use and create svc_sock struct
> */
> @@ -1403,12 +1428,26 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
> struct svc_sock *svsk;
> struct sock *inet;
> int pmap_register = !(flags & SVC_SOCK_ANONYMOUS);
> + int sendpages;
> unsigned long pages;
>
> + sendpages = svc_sock_sendpages(serv, sock, flags);
> + if (sendpages < 0)
> + return ERR_PTR(sendpages);
> +
> pages = svc_serv_maxpages(serv);
> svsk = kzalloc(struct_size(svsk, sk_pages, pages), GFP_KERNEL);
> if (!svsk)
> return ERR_PTR(-ENOMEM);
> +
> + if (sendpages) {
> + svsk->sk_bvec = kcalloc(sendpages, sizeof(*svsk->sk_bvec), GFP_KERNEL);
> + if (!svsk->sk_bvec) {
> + kfree(svsk);
> + return ERR_PTR(-ENOMEM);
> + }
> + }
> +
> svsk->sk_maxpages = pages;
>
> inet = sock->sk;
> @@ -1420,6 +1459,7 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
> inet->sk_protocol,
> ntohs(inet_sk(inet)->inet_sport));
> if (err < 0) {
> + kfree(svsk->sk_bvec);
> kfree(svsk);
> return ERR_PTR(err);
> }
> @@ -1637,5 +1677,6 @@ static void svc_sock_free(struct svc_xprt *xprt)
> sock_release(sock);
>
> page_frag_cache_drain(&svsk->sk_frag_cache);
> + kfree(svsk->sk_bvec);
> kfree(svsk);
> }
>
> ---
> base-commit: 05d2192090744b16ce05bd221c459a9357c17525
> change-id: 20251008-rq_bvec-b66afd0fdbbb
>
> Best regards,
> --
> Jeff Layton <jlayton@kernel.org>
>
>
From: Chuck Lever <chuck.lever@oracle.com>
On Mon, 13 Oct 2025 09:54:53 -0400, Jeff Layton wrote:
> svc_tcp_sendmsg() calls xdr_buf_to_bvec() with the second slot of
> rq_bvec as the start, but doesn't reduce the array length by one, which
> could lead to an array overrun. Also, rq_bvec is always rq_maxpages in
> length, which can be too short in some cases, since the TCP record
> marker consumes a slot.
>
> Fix both problems by adding a separate bvec array to the svc_sock that
> is specifically for sending. For TCP, make this array one slot longer
> than rq_maxpages, to account for the record marker. For UDP, only
> allocate as large an array as we need since it's limited to 64k of
> payload.
>
> [...]
Applied to nfsd-testing, thanks!
[1/1] sunrpc: allocate a separate bvec array for socket sends
commit: fcf522297252d21db162e141d49b2bb4f1c2b0c4
--
Chuck Lever <chuck.lever@oracle.com>
© 2016 - 2025 Red Hat, Inc.