[PATCH net] vhost: rewind next_avail_head while discarding descriptors

Jason Wang posted 1 patch 3 weeks, 3 days ago
drivers/vhost/net.c   | 53 ++++++++++++++++++++++++++-----------------
drivers/vhost/vhost.c | 43 ++++++++++++++++++++++++-----------
drivers/vhost/vhost.h |  9 +++++++-
3 files changed, 70 insertions(+), 35 deletions(-)
[PATCH net] vhost: rewind next_avail_head while discarding descriptors
Posted by Jason Wang 3 weeks, 3 days ago
When discarding descriptors with IN_ORDER, we should rewind
next_avail_head otherwise it would run out of sync with
last_avail_idx. This would cause driver to report
"id X is not a head".

Fixing this by returning the number of descriptors that is used for
each buffer via vhost_get_vq_desc_n() so caller can use the value
while discarding descriptors.

Fixes: 67a873df0c41 ("vhost: basic in order support")
Cc: stable@vger.kernel.org
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/net.c   | 53 ++++++++++++++++++++++++++-----------------
 drivers/vhost/vhost.c | 43 ++++++++++++++++++++++++-----------
 drivers/vhost/vhost.h |  9 +++++++-
 3 files changed, 70 insertions(+), 35 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 35ded4330431..8f7f50acb6d6 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -592,14 +592,15 @@ static void vhost_net_busy_poll(struct vhost_net *net,
 static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
 				    struct vhost_net_virtqueue *tnvq,
 				    unsigned int *out_num, unsigned int *in_num,
-				    struct msghdr *msghdr, bool *busyloop_intr)
+				    struct msghdr *msghdr, bool *busyloop_intr,
+				    unsigned int *ndesc)
 {
 	struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX];
 	struct vhost_virtqueue *rvq = &rnvq->vq;
 	struct vhost_virtqueue *tvq = &tnvq->vq;
 
-	int r = vhost_get_vq_desc(tvq, tvq->iov, ARRAY_SIZE(tvq->iov),
-				  out_num, in_num, NULL, NULL);
+	int r = vhost_get_vq_desc_n(tvq, tvq->iov, ARRAY_SIZE(tvq->iov),
+				    out_num, in_num, NULL, NULL, ndesc);
 
 	if (r == tvq->num && tvq->busyloop_timeout) {
 		/* Flush batched packets first */
@@ -610,8 +611,8 @@ static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
 
 		vhost_net_busy_poll(net, rvq, tvq, busyloop_intr, false);
 
-		r = vhost_get_vq_desc(tvq, tvq->iov, ARRAY_SIZE(tvq->iov),
-				      out_num, in_num, NULL, NULL);
+		r = vhost_get_vq_desc_n(tvq, tvq->iov, ARRAY_SIZE(tvq->iov),
+					out_num, in_num, NULL, NULL, ndesc);
 	}
 
 	return r;
@@ -642,12 +643,14 @@ static int get_tx_bufs(struct vhost_net *net,
 		       struct vhost_net_virtqueue *nvq,
 		       struct msghdr *msg,
 		       unsigned int *out, unsigned int *in,
-		       size_t *len, bool *busyloop_intr)
+		       size_t *len, bool *busyloop_intr,
+		       unsigned int *ndesc)
 {
 	struct vhost_virtqueue *vq = &nvq->vq;
 	int ret;
 
-	ret = vhost_net_tx_get_vq_desc(net, nvq, out, in, msg, busyloop_intr);
+	ret = vhost_net_tx_get_vq_desc(net, nvq, out, in, msg,
+				       busyloop_intr, ndesc);
 
 	if (ret < 0 || ret == vq->num)
 		return ret;
@@ -766,6 +769,7 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
 	int sent_pkts = 0;
 	bool sock_can_batch = (sock->sk->sk_sndbuf == INT_MAX);
 	bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER);
+	unsigned int ndesc = 0;
 
 	do {
 		bool busyloop_intr = false;
@@ -774,7 +778,7 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
 			vhost_tx_batch(net, nvq, sock, &msg);
 
 		head = get_tx_bufs(net, nvq, &msg, &out, &in, &len,
-				   &busyloop_intr);
+				   &busyloop_intr, &ndesc);
 		/* On error, stop handling until the next kick. */
 		if (unlikely(head < 0))
 			break;
@@ -806,7 +810,7 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
 				goto done;
 			} else if (unlikely(err != -ENOSPC)) {
 				vhost_tx_batch(net, nvq, sock, &msg);
-				vhost_discard_vq_desc(vq, 1);
+				vhost_discard_vq_desc(vq, 1, ndesc);
 				vhost_net_enable_vq(net, vq);
 				break;
 			}
@@ -829,7 +833,7 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
 		err = sock->ops->sendmsg(sock, &msg, len);
 		if (unlikely(err < 0)) {
 			if (err == -EAGAIN || err == -ENOMEM || err == -ENOBUFS) {
-				vhost_discard_vq_desc(vq, 1);
+				vhost_discard_vq_desc(vq, 1, ndesc);
 				vhost_net_enable_vq(net, vq);
 				break;
 			}
@@ -868,6 +872,7 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
 	int err;
 	struct vhost_net_ubuf_ref *ubufs;
 	struct ubuf_info_msgzc *ubuf;
+	unsigned int ndesc = 0;
 	bool zcopy_used;
 	int sent_pkts = 0;
 
@@ -879,7 +884,7 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
 
 		busyloop_intr = false;
 		head = get_tx_bufs(net, nvq, &msg, &out, &in, &len,
-				   &busyloop_intr);
+				   &busyloop_intr, &ndesc);
 		/* On error, stop handling until the next kick. */
 		if (unlikely(head < 0))
 			break;
@@ -941,7 +946,7 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
 					vq->heads[ubuf->desc].len = VHOST_DMA_DONE_LEN;
 			}
 			if (retry) {
-				vhost_discard_vq_desc(vq, 1);
+				vhost_discard_vq_desc(vq, 1, ndesc);
 				vhost_net_enable_vq(net, vq);
 				break;
 			}
@@ -1045,11 +1050,12 @@ static int get_rx_bufs(struct vhost_net_virtqueue *nvq,
 		       unsigned *iovcount,
 		       struct vhost_log *log,
 		       unsigned *log_num,
-		       unsigned int quota)
+		       unsigned int quota,
+		       unsigned int *ndesc)
 {
 	struct vhost_virtqueue *vq = &nvq->vq;
 	bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER);
-	unsigned int out, in;
+	unsigned int out, in, desc_num, n = 0;
 	int seg = 0;
 	int headcount = 0;
 	unsigned d;
@@ -1064,9 +1070,9 @@ static int get_rx_bufs(struct vhost_net_virtqueue *nvq,
 			r = -ENOBUFS;
 			goto err;
 		}
-		r = vhost_get_vq_desc(vq, vq->iov + seg,
-				      ARRAY_SIZE(vq->iov) - seg, &out,
-				      &in, log, log_num);
+		r = vhost_get_vq_desc_n(vq, vq->iov + seg,
+					ARRAY_SIZE(vq->iov) - seg, &out,
+					&in, log, log_num, &desc_num);
 		if (unlikely(r < 0))
 			goto err;
 
@@ -1093,6 +1099,7 @@ static int get_rx_bufs(struct vhost_net_virtqueue *nvq,
 		++headcount;
 		datalen -= len;
 		seg += in;
+		n += desc_num;
 	}
 
 	*iovcount = seg;
@@ -1113,9 +1120,11 @@ static int get_rx_bufs(struct vhost_net_virtqueue *nvq,
 		nheads[0] = headcount;
 	}
 
+	*ndesc = n;
+
 	return headcount;
 err:
-	vhost_discard_vq_desc(vq, headcount);
+	vhost_discard_vq_desc(vq, headcount, n);
 	return r;
 }
 
@@ -1151,6 +1160,7 @@ static void handle_rx(struct vhost_net *net)
 	struct iov_iter fixup;
 	__virtio16 num_buffers;
 	int recv_pkts = 0;
+	unsigned int ndesc;
 
 	mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_RX);
 	sock = vhost_vq_get_backend(vq);
@@ -1182,7 +1192,8 @@ static void handle_rx(struct vhost_net *net)
 		headcount = get_rx_bufs(nvq, vq->heads + count,
 					vq->nheads + count,
 					vhost_len, &in, vq_log, &log,
-					likely(mergeable) ? UIO_MAXIOV : 1);
+					likely(mergeable) ? UIO_MAXIOV : 1,
+					&ndesc);
 		/* On error, stop handling until the next kick. */
 		if (unlikely(headcount < 0))
 			goto out;
@@ -1228,7 +1239,7 @@ static void handle_rx(struct vhost_net *net)
 		if (unlikely(err != sock_len)) {
 			pr_debug("Discarded rx packet: "
 				 " len %d, expected %zd\n", err, sock_len);
-			vhost_discard_vq_desc(vq, headcount);
+			vhost_discard_vq_desc(vq, headcount, ndesc);
 			continue;
 		}
 		/* Supply virtio_net_hdr if VHOST_NET_F_VIRTIO_NET_HDR */
@@ -1252,7 +1263,7 @@ static void handle_rx(struct vhost_net *net)
 		    copy_to_iter(&num_buffers, sizeof num_buffers,
 				 &fixup) != sizeof num_buffers) {
 			vq_err(vq, "Failed num_buffers write");
-			vhost_discard_vq_desc(vq, headcount);
+			vhost_discard_vq_desc(vq, headcount, ndesc);
 			goto out;
 		}
 		nvq->done_idx += headcount;
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 8570fdf2e14a..b56568807588 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -2792,18 +2792,11 @@ static int get_indirect(struct vhost_virtqueue *vq,
 	return 0;
 }
 
-/* This looks in the virtqueue and for the first available buffer, and converts
- * it to an iovec for convenient access.  Since descriptors consist of some
- * number of output then some number of input descriptors, it's actually two
- * iovecs, but we pack them into one and note how many of each there were.
- *
- * This function returns the descriptor number found, or vq->num (which is
- * never a valid descriptor number) if none was found.  A negative code is
- * returned on error. */
-int vhost_get_vq_desc(struct vhost_virtqueue *vq,
-		      struct iovec iov[], unsigned int iov_size,
-		      unsigned int *out_num, unsigned int *in_num,
-		      struct vhost_log *log, unsigned int *log_num)
+int vhost_get_vq_desc_n(struct vhost_virtqueue *vq,
+			struct iovec iov[], unsigned int iov_size,
+			unsigned int *out_num, unsigned int *in_num,
+			struct vhost_log *log, unsigned int *log_num,
+			unsigned int *ndesc)
 {
 	bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER);
 	struct vring_desc desc;
@@ -2921,16 +2914,40 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 	vq->last_avail_idx++;
 	vq->next_avail_head += c;
 
+	if (ndesc)
+		*ndesc = c;
+
 	/* Assume notifications from guest are disabled at this point,
 	 * if they aren't we would need to update avail_event index. */
 	BUG_ON(!(vq->used_flags & VRING_USED_F_NO_NOTIFY));
 	return head;
 }
+EXPORT_SYMBOL_GPL(vhost_get_vq_desc_n);
+
+/* This looks in the virtqueue and for the first available buffer, and converts
+ * it to an iovec for convenient access.  Since descriptors consist of some
+ * number of output then some number of input descriptors, it's actually two
+ * iovecs, but we pack them into one and note how many of each there were.
+ *
+ * This function returns the descriptor number found, or vq->num (which is
+ * never a valid descriptor number) if none was found.  A negative code is
+ * returned on error.
+ */
+int vhost_get_vq_desc(struct vhost_virtqueue *vq,
+		      struct iovec iov[], unsigned int iov_size,
+		      unsigned int *out_num, unsigned int *in_num,
+		      struct vhost_log *log, unsigned int *log_num)
+{
+	return vhost_get_vq_desc_n(vq, iov, iov_size, out_num, in_num,
+				   log, log_num, NULL);
+}
 EXPORT_SYMBOL_GPL(vhost_get_vq_desc);
 
 /* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */
-void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int n)
+void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int n,
+			   unsigned int ndesc)
 {
+	vq->next_avail_head -= ndesc;
 	vq->last_avail_idx -= n;
 }
 EXPORT_SYMBOL_GPL(vhost_discard_vq_desc);
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 621a6d9a8791..69a39540df3d 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -230,7 +230,14 @@ int vhost_get_vq_desc(struct vhost_virtqueue *,
 		      struct iovec iov[], unsigned int iov_size,
 		      unsigned int *out_num, unsigned int *in_num,
 		      struct vhost_log *log, unsigned int *log_num);
-void vhost_discard_vq_desc(struct vhost_virtqueue *, int n);
+
+int vhost_get_vq_desc_n(struct vhost_virtqueue *vq,
+			struct iovec iov[], unsigned int iov_size,
+			unsigned int *out_num, unsigned int *in_num,
+			struct vhost_log *log, unsigned int *log_num,
+			unsigned int *ndesc);
+
+void vhost_discard_vq_desc(struct vhost_virtqueue *, int n, unsigned int ndesc);
 
 bool vhost_vq_work_queue(struct vhost_virtqueue *vq, struct vhost_work *work);
 bool vhost_vq_has_work(struct vhost_virtqueue *vq);
-- 
2.31.1
Re: [PATCH net] vhost: rewind next_avail_head while discarding descriptors
Posted by Michael S. Tsirkin 3 weeks, 3 days ago
On Thu, Nov 13, 2025 at 09:54:20AM +0800, Jason Wang wrote:
> When discarding descriptors with IN_ORDER, we should rewind
> next_avail_head otherwise it would run out of sync with
> last_avail_idx. This would cause driver to report
> "id X is not a head".
> 
> Fixing this by returning the number of descriptors that is used for
> each buffer via vhost_get_vq_desc_n() so caller can use the value
> while discarding descriptors.
> 
> Fixes: 67a873df0c41 ("vhost: basic in order support")
> Cc: stable@vger.kernel.org
> Signed-off-by: Jason Wang <jasowang@redhat.com>

Wow that change really caused a lot of fallout.

Thanks for the patch! Yet something to improve:


> ---
>  drivers/vhost/net.c   | 53 ++++++++++++++++++++++++++-----------------
>  drivers/vhost/vhost.c | 43 ++++++++++++++++++++++++-----------
>  drivers/vhost/vhost.h |  9 +++++++-
>  3 files changed, 70 insertions(+), 35 deletions(-)
> 
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index 35ded4330431..8f7f50acb6d6 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -592,14 +592,15 @@ static void vhost_net_busy_poll(struct vhost_net *net,
>  static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
>  				    struct vhost_net_virtqueue *tnvq,
>  				    unsigned int *out_num, unsigned int *in_num,
> -				    struct msghdr *msghdr, bool *busyloop_intr)
> +				    struct msghdr *msghdr, bool *busyloop_intr,
> +				    unsigned int *ndesc)
>  {
>  	struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX];
>  	struct vhost_virtqueue *rvq = &rnvq->vq;
>  	struct vhost_virtqueue *tvq = &tnvq->vq;
>  
> -	int r = vhost_get_vq_desc(tvq, tvq->iov, ARRAY_SIZE(tvq->iov),
> -				  out_num, in_num, NULL, NULL);
> +	int r = vhost_get_vq_desc_n(tvq, tvq->iov, ARRAY_SIZE(tvq->iov),
> +				    out_num, in_num, NULL, NULL, ndesc);
>  
>  	if (r == tvq->num && tvq->busyloop_timeout) {
>  		/* Flush batched packets first */
> @@ -610,8 +611,8 @@ static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
>  
>  		vhost_net_busy_poll(net, rvq, tvq, busyloop_intr, false);
>  
> -		r = vhost_get_vq_desc(tvq, tvq->iov, ARRAY_SIZE(tvq->iov),
> -				      out_num, in_num, NULL, NULL);
> +		r = vhost_get_vq_desc_n(tvq, tvq->iov, ARRAY_SIZE(tvq->iov),
> +					out_num, in_num, NULL, NULL, ndesc);
>  	}
>  
>  	return r;
> @@ -642,12 +643,14 @@ static int get_tx_bufs(struct vhost_net *net,
>  		       struct vhost_net_virtqueue *nvq,
>  		       struct msghdr *msg,
>  		       unsigned int *out, unsigned int *in,
> -		       size_t *len, bool *busyloop_intr)
> +		       size_t *len, bool *busyloop_intr,
> +		       unsigned int *ndesc)
>  {
>  	struct vhost_virtqueue *vq = &nvq->vq;
>  	int ret;
>  
> -	ret = vhost_net_tx_get_vq_desc(net, nvq, out, in, msg, busyloop_intr);
> +	ret = vhost_net_tx_get_vq_desc(net, nvq, out, in, msg,
> +				       busyloop_intr, ndesc);
>  
>  	if (ret < 0 || ret == vq->num)
>  		return ret;
> @@ -766,6 +769,7 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
>  	int sent_pkts = 0;
>  	bool sock_can_batch = (sock->sk->sk_sndbuf == INT_MAX);
>  	bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER);
> +	unsigned int ndesc = 0;
>  
>  	do {
>  		bool busyloop_intr = false;
> @@ -774,7 +778,7 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
>  			vhost_tx_batch(net, nvq, sock, &msg);
>  
>  		head = get_tx_bufs(net, nvq, &msg, &out, &in, &len,
> -				   &busyloop_intr);
> +				   &busyloop_intr, &ndesc);
>  		/* On error, stop handling until the next kick. */
>  		if (unlikely(head < 0))
>  			break;
> @@ -806,7 +810,7 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
>  				goto done;
>  			} else if (unlikely(err != -ENOSPC)) {
>  				vhost_tx_batch(net, nvq, sock, &msg);
> -				vhost_discard_vq_desc(vq, 1);
> +				vhost_discard_vq_desc(vq, 1, ndesc);
>  				vhost_net_enable_vq(net, vq);
>  				break;
>  			}
> @@ -829,7 +833,7 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
>  		err = sock->ops->sendmsg(sock, &msg, len);
>  		if (unlikely(err < 0)) {
>  			if (err == -EAGAIN || err == -ENOMEM || err == -ENOBUFS) {
> -				vhost_discard_vq_desc(vq, 1);
> +				vhost_discard_vq_desc(vq, 1, ndesc);
>  				vhost_net_enable_vq(net, vq);
>  				break;
>  			}
> @@ -868,6 +872,7 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
>  	int err;
>  	struct vhost_net_ubuf_ref *ubufs;
>  	struct ubuf_info_msgzc *ubuf;
> +	unsigned int ndesc = 0;
>  	bool zcopy_used;
>  	int sent_pkts = 0;
>  
> @@ -879,7 +884,7 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
>  
>  		busyloop_intr = false;
>  		head = get_tx_bufs(net, nvq, &msg, &out, &in, &len,
> -				   &busyloop_intr);
> +				   &busyloop_intr, &ndesc);
>  		/* On error, stop handling until the next kick. */
>  		if (unlikely(head < 0))
>  			break;
> @@ -941,7 +946,7 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
>  					vq->heads[ubuf->desc].len = VHOST_DMA_DONE_LEN;
>  			}
>  			if (retry) {
> -				vhost_discard_vq_desc(vq, 1);
> +				vhost_discard_vq_desc(vq, 1, ndesc);
>  				vhost_net_enable_vq(net, vq);
>  				break;
>  			}
> @@ -1045,11 +1050,12 @@ static int get_rx_bufs(struct vhost_net_virtqueue *nvq,
>  		       unsigned *iovcount,
>  		       struct vhost_log *log,
>  		       unsigned *log_num,
> -		       unsigned int quota)
> +		       unsigned int quota,
> +		       unsigned int *ndesc)
>  {
>  	struct vhost_virtqueue *vq = &nvq->vq;
>  	bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER);
> -	unsigned int out, in;
> +	unsigned int out, in, desc_num, n = 0;
>  	int seg = 0;
>  	int headcount = 0;
>  	unsigned d;
> @@ -1064,9 +1070,9 @@ static int get_rx_bufs(struct vhost_net_virtqueue *nvq,
>  			r = -ENOBUFS;
>  			goto err;
>  		}
> -		r = vhost_get_vq_desc(vq, vq->iov + seg,
> -				      ARRAY_SIZE(vq->iov) - seg, &out,
> -				      &in, log, log_num);
> +		r = vhost_get_vq_desc_n(vq, vq->iov + seg,
> +					ARRAY_SIZE(vq->iov) - seg, &out,
> +					&in, log, log_num, &desc_num);
>  		if (unlikely(r < 0))
>  			goto err;
>  
> @@ -1093,6 +1099,7 @@ static int get_rx_bufs(struct vhost_net_virtqueue *nvq,
>  		++headcount;
>  		datalen -= len;
>  		seg += in;
> +		n += desc_num;
>  	}
>  
>  	*iovcount = seg;
> @@ -1113,9 +1120,11 @@ static int get_rx_bufs(struct vhost_net_virtqueue *nvq,
>  		nheads[0] = headcount;
>  	}
>  
> +	*ndesc = n;
> +
>  	return headcount;
>  err:
> -	vhost_discard_vq_desc(vq, headcount);
> +	vhost_discard_vq_desc(vq, headcount, n);

So here ndesc and n are the same, but below in vhost_discard_vq_desc
they are different. Fun.

>  	return r;
>  }
>  
> @@ -1151,6 +1160,7 @@ static void handle_rx(struct vhost_net *net)
>  	struct iov_iter fixup;
>  	__virtio16 num_buffers;
>  	int recv_pkts = 0;
> +	unsigned int ndesc;
>  
>  	mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_RX);
>  	sock = vhost_vq_get_backend(vq);
> @@ -1182,7 +1192,8 @@ static void handle_rx(struct vhost_net *net)
>  		headcount = get_rx_bufs(nvq, vq->heads + count,
>  					vq->nheads + count,
>  					vhost_len, &in, vq_log, &log,
> -					likely(mergeable) ? UIO_MAXIOV : 1);
> +					likely(mergeable) ? UIO_MAXIOV : 1,
> +					&ndesc);
>  		/* On error, stop handling until the next kick. */
>  		if (unlikely(headcount < 0))
>  			goto out;
> @@ -1228,7 +1239,7 @@ static void handle_rx(struct vhost_net *net)
>  		if (unlikely(err != sock_len)) {
>  			pr_debug("Discarded rx packet: "
>  				 " len %d, expected %zd\n", err, sock_len);
> -			vhost_discard_vq_desc(vq, headcount);
> +			vhost_discard_vq_desc(vq, headcount, ndesc);
>  			continue;
>  		}
>  		/* Supply virtio_net_hdr if VHOST_NET_F_VIRTIO_NET_HDR */
> @@ -1252,7 +1263,7 @@ static void handle_rx(struct vhost_net *net)
>  		    copy_to_iter(&num_buffers, sizeof num_buffers,
>  				 &fixup) != sizeof num_buffers) {
>  			vq_err(vq, "Failed num_buffers write");
> -			vhost_discard_vq_desc(vq, headcount);
> +			vhost_discard_vq_desc(vq, headcount, ndesc);
>  			goto out;
>  		}
>  		nvq->done_idx += headcount;
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index 8570fdf2e14a..b56568807588 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -2792,18 +2792,11 @@ static int get_indirect(struct vhost_virtqueue *vq,
>  	return 0;
>  }
>  
> -/* This looks in the virtqueue and for the first available buffer, and converts
> - * it to an iovec for convenient access.  Since descriptors consist of some
> - * number of output then some number of input descriptors, it's actually two
> - * iovecs, but we pack them into one and note how many of each there were.
> - *
> - * This function returns the descriptor number found, or vq->num (which is
> - * never a valid descriptor number) if none was found.  A negative code is
> - * returned on error. */

A new module API with no docs at all is not good.
Please add documentation to this one. vhost_get_vq_desc
is a subset and could refer to it.

> -int vhost_get_vq_desc(struct vhost_virtqueue *vq,
> -		      struct iovec iov[], unsigned int iov_size,
> -		      unsigned int *out_num, unsigned int *in_num,
> -		      struct vhost_log *log, unsigned int *log_num)
> +int vhost_get_vq_desc_n(struct vhost_virtqueue *vq,
> +			struct iovec iov[], unsigned int iov_size,
> +			unsigned int *out_num, unsigned int *in_num,
> +			struct vhost_log *log, unsigned int *log_num,
> +			unsigned int *ndesc)

>  {
>  	bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER);
>  	struct vring_desc desc;
> @@ -2921,16 +2914,40 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
>  	vq->last_avail_idx++;
>  	vq->next_avail_head += c;
>  
> +	if (ndesc)
> +		*ndesc = c;
> +
>  	/* Assume notifications from guest are disabled at this point,
>  	 * if they aren't we would need to update avail_event index. */
>  	BUG_ON(!(vq->used_flags & VRING_USED_F_NO_NOTIFY));
>  	return head;
>  }
> +EXPORT_SYMBOL_GPL(vhost_get_vq_desc_n);
> +
> +/* This looks in the virtqueue and for the first available buffer, and converts
> + * it to an iovec for convenient access.  Since descriptors consist of some
> + * number of output then some number of input descriptors, it's actually two
> + * iovecs, but we pack them into one and note how many of each there were.
> + *
> + * This function returns the descriptor number found, or vq->num (which is
> + * never a valid descriptor number) if none was found.  A negative code is
> + * returned on error.
> + */
> +int vhost_get_vq_desc(struct vhost_virtqueue *vq,
> +		      struct iovec iov[], unsigned int iov_size,
> +		      unsigned int *out_num, unsigned int *in_num,
> +		      struct vhost_log *log, unsigned int *log_num)
> +{
> +	return vhost_get_vq_desc_n(vq, iov, iov_size, out_num, in_num,
> +				   log, log_num, NULL);
> +}
>  EXPORT_SYMBOL_GPL(vhost_get_vq_desc);
>  
>  /* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */
> -void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int n)
> +void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int n,
> +			   unsigned int ndesc)

ndesc is number of descriptors? And n is what, in that case?


>  {
> +	vq->next_avail_head -= ndesc;
>  	vq->last_avail_idx -= n;
>  }
>  EXPORT_SYMBOL_GPL(vhost_discard_vq_desc);
> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> index 621a6d9a8791..69a39540df3d 100644
> --- a/drivers/vhost/vhost.h
> +++ b/drivers/vhost/vhost.h
> @@ -230,7 +230,14 @@ int vhost_get_vq_desc(struct vhost_virtqueue *,
>  		      struct iovec iov[], unsigned int iov_size,
>  		      unsigned int *out_num, unsigned int *in_num,
>  		      struct vhost_log *log, unsigned int *log_num);
> -void vhost_discard_vq_desc(struct vhost_virtqueue *, int n);
> +
> +int vhost_get_vq_desc_n(struct vhost_virtqueue *vq,
> +			struct iovec iov[], unsigned int iov_size,
> +			unsigned int *out_num, unsigned int *in_num,
> +			struct vhost_log *log, unsigned int *log_num,
> +			unsigned int *ndesc);
> +
> +void vhost_discard_vq_desc(struct vhost_virtqueue *, int n, unsigned int ndesc);
>  
>  bool vhost_vq_work_queue(struct vhost_virtqueue *vq, struct vhost_work *work);
>  bool vhost_vq_has_work(struct vhost_virtqueue *vq);
> -- 
> 2.31.1
Re: [PATCH net] vhost: rewind next_avail_head while discarding descriptors
Posted by Jason Wang 3 weeks, 2 days ago
On Thu, Nov 13, 2025 at 4:13 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Thu, Nov 13, 2025 at 09:54:20AM +0800, Jason Wang wrote:
> > When discarding descriptors with IN_ORDER, we should rewind
> > next_avail_head otherwise it would run out of sync with
> > last_avail_idx. This would cause driver to report
> > "id X is not a head".
> >
> > Fixing this by returning the number of descriptors that is used for
> > each buffer via vhost_get_vq_desc_n() so caller can use the value
> > while discarding descriptors.
> >
> > Fixes: 67a873df0c41 ("vhost: basic in order support")
> > Cc: stable@vger.kernel.org
> > Signed-off-by: Jason Wang <jasowang@redhat.com>
>
> Wow that change really caused a lot of fallout.
>
> Thanks for the patch! Yet something to improve:
>
>
> > ---
> >  drivers/vhost/net.c   | 53 ++++++++++++++++++++++++++-----------------
> >  drivers/vhost/vhost.c | 43 ++++++++++++++++++++++++-----------
> >  drivers/vhost/vhost.h |  9 +++++++-
> >  3 files changed, 70 insertions(+), 35 deletions(-)
> >
> > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> > index 35ded4330431..8f7f50acb6d6 100644
> > --- a/drivers/vhost/net.c
> > +++ b/drivers/vhost/net.c
> > @@ -592,14 +592,15 @@ static void vhost_net_busy_poll(struct vhost_net *net,
> >  static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
> >                                   struct vhost_net_virtqueue *tnvq,
> >                                   unsigned int *out_num, unsigned int *in_num,
> > -                                 struct msghdr *msghdr, bool *busyloop_intr)
> > +                                 struct msghdr *msghdr, bool *busyloop_intr,
> > +                                 unsigned int *ndesc)
> >  {
> >       struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX];
> >       struct vhost_virtqueue *rvq = &rnvq->vq;
> >       struct vhost_virtqueue *tvq = &tnvq->vq;
> >
> > -     int r = vhost_get_vq_desc(tvq, tvq->iov, ARRAY_SIZE(tvq->iov),
> > -                               out_num, in_num, NULL, NULL);
> > +     int r = vhost_get_vq_desc_n(tvq, tvq->iov, ARRAY_SIZE(tvq->iov),
> > +                                 out_num, in_num, NULL, NULL, ndesc);
> >
> >       if (r == tvq->num && tvq->busyloop_timeout) {
> >               /* Flush batched packets first */
> > @@ -610,8 +611,8 @@ static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
> >
> >               vhost_net_busy_poll(net, rvq, tvq, busyloop_intr, false);
> >
> > -             r = vhost_get_vq_desc(tvq, tvq->iov, ARRAY_SIZE(tvq->iov),
> > -                                   out_num, in_num, NULL, NULL);
> > +             r = vhost_get_vq_desc_n(tvq, tvq->iov, ARRAY_SIZE(tvq->iov),
> > +                                     out_num, in_num, NULL, NULL, ndesc);
> >       }
> >
> >       return r;
> > @@ -642,12 +643,14 @@ static int get_tx_bufs(struct vhost_net *net,
> >                      struct vhost_net_virtqueue *nvq,
> >                      struct msghdr *msg,
> >                      unsigned int *out, unsigned int *in,
> > -                    size_t *len, bool *busyloop_intr)
> > +                    size_t *len, bool *busyloop_intr,
> > +                    unsigned int *ndesc)
> >  {
> >       struct vhost_virtqueue *vq = &nvq->vq;
> >       int ret;
> >
> > -     ret = vhost_net_tx_get_vq_desc(net, nvq, out, in, msg, busyloop_intr);
> > +     ret = vhost_net_tx_get_vq_desc(net, nvq, out, in, msg,
> > +                                    busyloop_intr, ndesc);
> >
> >       if (ret < 0 || ret == vq->num)
> >               return ret;
> > @@ -766,6 +769,7 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
> >       int sent_pkts = 0;
> >       bool sock_can_batch = (sock->sk->sk_sndbuf == INT_MAX);
> >       bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER);
> > +     unsigned int ndesc = 0;
> >
> >       do {
> >               bool busyloop_intr = false;
> > @@ -774,7 +778,7 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
> >                       vhost_tx_batch(net, nvq, sock, &msg);
> >
> >               head = get_tx_bufs(net, nvq, &msg, &out, &in, &len,
> > -                                &busyloop_intr);
> > +                                &busyloop_intr, &ndesc);
> >               /* On error, stop handling until the next kick. */
> >               if (unlikely(head < 0))
> >                       break;
> > @@ -806,7 +810,7 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
> >                               goto done;
> >                       } else if (unlikely(err != -ENOSPC)) {
> >                               vhost_tx_batch(net, nvq, sock, &msg);
> > -                             vhost_discard_vq_desc(vq, 1);
> > +                             vhost_discard_vq_desc(vq, 1, ndesc);
> >                               vhost_net_enable_vq(net, vq);
> >                               break;
> >                       }
> > @@ -829,7 +833,7 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
> >               err = sock->ops->sendmsg(sock, &msg, len);
> >               if (unlikely(err < 0)) {
> >                       if (err == -EAGAIN || err == -ENOMEM || err == -ENOBUFS) {
> > -                             vhost_discard_vq_desc(vq, 1);
> > +                             vhost_discard_vq_desc(vq, 1, ndesc);
> >                               vhost_net_enable_vq(net, vq);
> >                               break;
> >                       }
> > @@ -868,6 +872,7 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
> >       int err;
> >       struct vhost_net_ubuf_ref *ubufs;
> >       struct ubuf_info_msgzc *ubuf;
> > +     unsigned int ndesc = 0;
> >       bool zcopy_used;
> >       int sent_pkts = 0;
> >
> > @@ -879,7 +884,7 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
> >
> >               busyloop_intr = false;
> >               head = get_tx_bufs(net, nvq, &msg, &out, &in, &len,
> > -                                &busyloop_intr);
> > +                                &busyloop_intr, &ndesc);
> >               /* On error, stop handling until the next kick. */
> >               if (unlikely(head < 0))
> >                       break;
> > @@ -941,7 +946,7 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
> >                                       vq->heads[ubuf->desc].len = VHOST_DMA_DONE_LEN;
> >                       }
> >                       if (retry) {
> > -                             vhost_discard_vq_desc(vq, 1);
> > +                             vhost_discard_vq_desc(vq, 1, ndesc);
> >                               vhost_net_enable_vq(net, vq);
> >                               break;
> >                       }
> > @@ -1045,11 +1050,12 @@ static int get_rx_bufs(struct vhost_net_virtqueue *nvq,
> >                      unsigned *iovcount,
> >                      struct vhost_log *log,
> >                      unsigned *log_num,
> > -                    unsigned int quota)
> > +                    unsigned int quota,
> > +                    unsigned int *ndesc)
> >  {
> >       struct vhost_virtqueue *vq = &nvq->vq;
> >       bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER);
> > -     unsigned int out, in;
> > +     unsigned int out, in, desc_num, n = 0;
> >       int seg = 0;
> >       int headcount = 0;
> >       unsigned d;
> > @@ -1064,9 +1070,9 @@ static int get_rx_bufs(struct vhost_net_virtqueue *nvq,
> >                       r = -ENOBUFS;
> >                       goto err;
> >               }
> > -             r = vhost_get_vq_desc(vq, vq->iov + seg,
> > -                                   ARRAY_SIZE(vq->iov) - seg, &out,
> > -                                   &in, log, log_num);
> > +             r = vhost_get_vq_desc_n(vq, vq->iov + seg,
> > +                                     ARRAY_SIZE(vq->iov) - seg, &out,
> > +                                     &in, log, log_num, &desc_num);
> >               if (unlikely(r < 0))
> >                       goto err;
> >
> > @@ -1093,6 +1099,7 @@ static int get_rx_bufs(struct vhost_net_virtqueue *nvq,
> >               ++headcount;
> >               datalen -= len;
> >               seg += in;
> > +             n += desc_num;
> >       }
> >
> >       *iovcount = seg;
> > @@ -1113,9 +1120,11 @@ static int get_rx_bufs(struct vhost_net_virtqueue *nvq,
> >               nheads[0] = headcount;
> >       }
> >
> > +     *ndesc = n;
> > +
> >       return headcount;
> >  err:
> > -     vhost_discard_vq_desc(vq, headcount);
> > +     vhost_discard_vq_desc(vq, headcount, n);
>
> So here ndesc and n are the same, but below in vhost_discard_vq_desc
> they are different. Fun.

Not necessarily the same, a buffer could contain more than 1 descriptor.

>
> >       return r;
> >  }
> >
> > @@ -1151,6 +1160,7 @@ static void handle_rx(struct vhost_net *net)
> >       struct iov_iter fixup;
> >       __virtio16 num_buffers;
> >       int recv_pkts = 0;
> > +     unsigned int ndesc;
> >
> >       mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_RX);
> >       sock = vhost_vq_get_backend(vq);
> > @@ -1182,7 +1192,8 @@ static void handle_rx(struct vhost_net *net)
> >               headcount = get_rx_bufs(nvq, vq->heads + count,
> >                                       vq->nheads + count,
> >                                       vhost_len, &in, vq_log, &log,
> > -                                     likely(mergeable) ? UIO_MAXIOV : 1);
> > +                                     likely(mergeable) ? UIO_MAXIOV : 1,
> > +                                     &ndesc);
> >               /* On error, stop handling until the next kick. */
> >               if (unlikely(headcount < 0))
> >                       goto out;
> > @@ -1228,7 +1239,7 @@ static void handle_rx(struct vhost_net *net)
> >               if (unlikely(err != sock_len)) {
> >                       pr_debug("Discarded rx packet: "
> >                                " len %d, expected %zd\n", err, sock_len);
> > -                     vhost_discard_vq_desc(vq, headcount);
> > +                     vhost_discard_vq_desc(vq, headcount, ndesc);
> >                       continue;
> >               }
> >               /* Supply virtio_net_hdr if VHOST_NET_F_VIRTIO_NET_HDR */
> > @@ -1252,7 +1263,7 @@ static void handle_rx(struct vhost_net *net)
> >                   copy_to_iter(&num_buffers, sizeof num_buffers,
> >                                &fixup) != sizeof num_buffers) {
> >                       vq_err(vq, "Failed num_buffers write");
> > -                     vhost_discard_vq_desc(vq, headcount);
> > +                     vhost_discard_vq_desc(vq, headcount, ndesc);
> >                       goto out;
> >               }
> >               nvq->done_idx += headcount;
> > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> > index 8570fdf2e14a..b56568807588 100644
> > --- a/drivers/vhost/vhost.c
> > +++ b/drivers/vhost/vhost.c
> > @@ -2792,18 +2792,11 @@ static int get_indirect(struct vhost_virtqueue *vq,
> >       return 0;
> >  }
> >
> > -/* This looks in the virtqueue and for the first available buffer, and converts
> > - * it to an iovec for convenient access.  Since descriptors consist of some
> > - * number of output then some number of input descriptors, it's actually two
> > - * iovecs, but we pack them into one and note how many of each there were.
> > - *
> > - * This function returns the descriptor number found, or vq->num (which is
> > - * never a valid descriptor number) if none was found.  A negative code is
> > - * returned on error. */
>
> A new module API with no docs at all is not good.
> Please add documentation to this one. vhost_get_vq_desc
> is a subset and could refer to it.

Fixed.

>
> > -int vhost_get_vq_desc(struct vhost_virtqueue *vq,
> > -                   struct iovec iov[], unsigned int iov_size,
> > -                   unsigned int *out_num, unsigned int *in_num,
> > -                   struct vhost_log *log, unsigned int *log_num)
> > +int vhost_get_vq_desc_n(struct vhost_virtqueue *vq,
> > +                     struct iovec iov[], unsigned int iov_size,
> > +                     unsigned int *out_num, unsigned int *in_num,
> > +                     struct vhost_log *log, unsigned int *log_num,
> > +                     unsigned int *ndesc)
>
> >  {
> >       bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER);
> >       struct vring_desc desc;
> > @@ -2921,16 +2914,40 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
> >       vq->last_avail_idx++;
> >       vq->next_avail_head += c;
> >
> > +     if (ndesc)
> > +             *ndesc = c;
> > +
> >       /* Assume notifications from guest are disabled at this point,
> >        * if they aren't we would need to update avail_event index. */
> >       BUG_ON(!(vq->used_flags & VRING_USED_F_NO_NOTIFY));
> >       return head;
> >  }
> > +EXPORT_SYMBOL_GPL(vhost_get_vq_desc_n);
> > +
> > +/* This looks in the virtqueue and for the first available buffer, and converts
> > + * it to an iovec for convenient access.  Since descriptors consist of some
> > + * number of output then some number of input descriptors, it's actually two
> > + * iovecs, but we pack them into one and note how many of each there were.
> > + *
> > + * This function returns the descriptor number found, or vq->num (which is
> > + * never a valid descriptor number) if none was found.  A negative code is
> > + * returned on error.
> > + */
> > +int vhost_get_vq_desc(struct vhost_virtqueue *vq,
> > +                   struct iovec iov[], unsigned int iov_size,
> > +                   unsigned int *out_num, unsigned int *in_num,
> > +                   struct vhost_log *log, unsigned int *log_num)
> > +{
> > +     return vhost_get_vq_desc_n(vq, iov, iov_size, out_num, in_num,
> > +                                log, log_num, NULL);
> > +}
> >  EXPORT_SYMBOL_GPL(vhost_get_vq_desc);
> >
> >  /* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */
> > -void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int n)
> > +void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int n,
> > +                        unsigned int ndesc)
>
> ndesc is number of descriptors? And n is what, in that case?

The semantic of n is not changed which is the number of buffers, ndesc
is the number of descriptors.

>
>
> >  {
> > +     vq->next_avail_head -= ndesc;
> >       vq->last_avail_idx -= n;
> >  }
> >  EXPORT_SYMBOL_GPL(vhost_discard_vq_desc);
> > diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> > index 621a6d9a8791..69a39540df3d 100644
> > --- a/drivers/vhost/vhost.h
> > +++ b/drivers/vhost/vhost.h
> > @@ -230,7 +230,14 @@ int vhost_get_vq_desc(struct vhost_virtqueue *,
> >                     struct iovec iov[], unsigned int iov_size,
> >                     unsigned int *out_num, unsigned int *in_num,
> >                     struct vhost_log *log, unsigned int *log_num);
> > -void vhost_discard_vq_desc(struct vhost_virtqueue *, int n);
> > +
> > +int vhost_get_vq_desc_n(struct vhost_virtqueue *vq,
> > +                     struct iovec iov[], unsigned int iov_size,
> > +                     unsigned int *out_num, unsigned int *in_num,
> > +                     struct vhost_log *log, unsigned int *log_num,
> > +                     unsigned int *ndesc);
> > +
> > +void vhost_discard_vq_desc(struct vhost_virtqueue *, int n, unsigned int ndesc);
> >
> >  bool vhost_vq_work_queue(struct vhost_virtqueue *vq, struct vhost_work *work);
> >  bool vhost_vq_has_work(struct vhost_virtqueue *vq);
> > --
> > 2.31.1
>

Thanks
Re: [PATCH net] vhost: rewind next_avail_head while discarding descriptors
Posted by Michael S. Tsirkin 3 weeks, 2 days ago
On Fri, Nov 14, 2025 at 09:53:12AM +0800, Jason Wang wrote:
> On Thu, Nov 13, 2025 at 4:13 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> >
> > On Thu, Nov 13, 2025 at 09:54:20AM +0800, Jason Wang wrote:
> > > When discarding descriptors with IN_ORDER, we should rewind
> > > next_avail_head otherwise it would run out of sync with
> > > last_avail_idx. This would cause driver to report
> > > "id X is not a head".
> > >
> > > Fixing this by returning the number of descriptors that is used for
> > > each buffer via vhost_get_vq_desc_n() so caller can use the value
> > > while discarding descriptors.
> > >
> > > Fixes: 67a873df0c41 ("vhost: basic in order support")
> > > Cc: stable@vger.kernel.org
> > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> >
> > Wow that change really caused a lot of fallout.
> >
> > Thanks for the patch! Yet something to improve:
> >
> >
> > > ---
> > >  drivers/vhost/net.c   | 53 ++++++++++++++++++++++++++-----------------
> > >  drivers/vhost/vhost.c | 43 ++++++++++++++++++++++++-----------
> > >  drivers/vhost/vhost.h |  9 +++++++-
> > >  3 files changed, 70 insertions(+), 35 deletions(-)
> > >
> > > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> > > index 35ded4330431..8f7f50acb6d6 100644
> > > --- a/drivers/vhost/net.c
> > > +++ b/drivers/vhost/net.c
> > > @@ -592,14 +592,15 @@ static void vhost_net_busy_poll(struct vhost_net *net,
> > >  static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
> > >                                   struct vhost_net_virtqueue *tnvq,
> > >                                   unsigned int *out_num, unsigned int *in_num,
> > > -                                 struct msghdr *msghdr, bool *busyloop_intr)
> > > +                                 struct msghdr *msghdr, bool *busyloop_intr,
> > > +                                 unsigned int *ndesc)
> > >  {
> > >       struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX];
> > >       struct vhost_virtqueue *rvq = &rnvq->vq;
> > >       struct vhost_virtqueue *tvq = &tnvq->vq;
> > >
> > > -     int r = vhost_get_vq_desc(tvq, tvq->iov, ARRAY_SIZE(tvq->iov),
> > > -                               out_num, in_num, NULL, NULL);
> > > +     int r = vhost_get_vq_desc_n(tvq, tvq->iov, ARRAY_SIZE(tvq->iov),
> > > +                                 out_num, in_num, NULL, NULL, ndesc);
> > >
> > >       if (r == tvq->num && tvq->busyloop_timeout) {
> > >               /* Flush batched packets first */
> > > @@ -610,8 +611,8 @@ static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
> > >
> > >               vhost_net_busy_poll(net, rvq, tvq, busyloop_intr, false);
> > >
> > > -             r = vhost_get_vq_desc(tvq, tvq->iov, ARRAY_SIZE(tvq->iov),
> > > -                                   out_num, in_num, NULL, NULL);
> > > +             r = vhost_get_vq_desc_n(tvq, tvq->iov, ARRAY_SIZE(tvq->iov),
> > > +                                     out_num, in_num, NULL, NULL, ndesc);
> > >       }
> > >
> > >       return r;
> > > @@ -642,12 +643,14 @@ static int get_tx_bufs(struct vhost_net *net,
> > >                      struct vhost_net_virtqueue *nvq,
> > >                      struct msghdr *msg,
> > >                      unsigned int *out, unsigned int *in,
> > > -                    size_t *len, bool *busyloop_intr)
> > > +                    size_t *len, bool *busyloop_intr,
> > > +                    unsigned int *ndesc)
> > >  {
> > >       struct vhost_virtqueue *vq = &nvq->vq;
> > >       int ret;
> > >
> > > -     ret = vhost_net_tx_get_vq_desc(net, nvq, out, in, msg, busyloop_intr);
> > > +     ret = vhost_net_tx_get_vq_desc(net, nvq, out, in, msg,
> > > +                                    busyloop_intr, ndesc);
> > >
> > >       if (ret < 0 || ret == vq->num)
> > >               return ret;
> > > @@ -766,6 +769,7 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
> > >       int sent_pkts = 0;
> > >       bool sock_can_batch = (sock->sk->sk_sndbuf == INT_MAX);
> > >       bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER);
> > > +     unsigned int ndesc = 0;
> > >
> > >       do {
> > >               bool busyloop_intr = false;
> > > @@ -774,7 +778,7 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
> > >                       vhost_tx_batch(net, nvq, sock, &msg);
> > >
> > >               head = get_tx_bufs(net, nvq, &msg, &out, &in, &len,
> > > -                                &busyloop_intr);
> > > +                                &busyloop_intr, &ndesc);
> > >               /* On error, stop handling until the next kick. */
> > >               if (unlikely(head < 0))
> > >                       break;
> > > @@ -806,7 +810,7 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
> > >                               goto done;
> > >                       } else if (unlikely(err != -ENOSPC)) {
> > >                               vhost_tx_batch(net, nvq, sock, &msg);
> > > -                             vhost_discard_vq_desc(vq, 1);
> > > +                             vhost_discard_vq_desc(vq, 1, ndesc);
> > >                               vhost_net_enable_vq(net, vq);
> > >                               break;
> > >                       }
> > > @@ -829,7 +833,7 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
> > >               err = sock->ops->sendmsg(sock, &msg, len);
> > >               if (unlikely(err < 0)) {
> > >                       if (err == -EAGAIN || err == -ENOMEM || err == -ENOBUFS) {
> > > -                             vhost_discard_vq_desc(vq, 1);
> > > +                             vhost_discard_vq_desc(vq, 1, ndesc);
> > >                               vhost_net_enable_vq(net, vq);
> > >                               break;
> > >                       }
> > > @@ -868,6 +872,7 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
> > >       int err;
> > >       struct vhost_net_ubuf_ref *ubufs;
> > >       struct ubuf_info_msgzc *ubuf;
> > > +     unsigned int ndesc = 0;
> > >       bool zcopy_used;
> > >       int sent_pkts = 0;
> > >
> > > @@ -879,7 +884,7 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
> > >
> > >               busyloop_intr = false;
> > >               head = get_tx_bufs(net, nvq, &msg, &out, &in, &len,
> > > -                                &busyloop_intr);
> > > +                                &busyloop_intr, &ndesc);
> > >               /* On error, stop handling until the next kick. */
> > >               if (unlikely(head < 0))
> > >                       break;
> > > @@ -941,7 +946,7 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
> > >                                       vq->heads[ubuf->desc].len = VHOST_DMA_DONE_LEN;
> > >                       }
> > >                       if (retry) {
> > > -                             vhost_discard_vq_desc(vq, 1);
> > > +                             vhost_discard_vq_desc(vq, 1, ndesc);
> > >                               vhost_net_enable_vq(net, vq);
> > >                               break;
> > >                       }
> > > @@ -1045,11 +1050,12 @@ static int get_rx_bufs(struct vhost_net_virtqueue *nvq,
> > >                      unsigned *iovcount,
> > >                      struct vhost_log *log,
> > >                      unsigned *log_num,
> > > -                    unsigned int quota)
> > > +                    unsigned int quota,
> > > +                    unsigned int *ndesc)
> > >  {
> > >       struct vhost_virtqueue *vq = &nvq->vq;
> > >       bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER);
> > > -     unsigned int out, in;
> > > +     unsigned int out, in, desc_num, n = 0;
> > >       int seg = 0;
> > >       int headcount = 0;
> > >       unsigned d;
> > > @@ -1064,9 +1070,9 @@ static int get_rx_bufs(struct vhost_net_virtqueue *nvq,
> > >                       r = -ENOBUFS;
> > >                       goto err;
> > >               }
> > > -             r = vhost_get_vq_desc(vq, vq->iov + seg,
> > > -                                   ARRAY_SIZE(vq->iov) - seg, &out,
> > > -                                   &in, log, log_num);
> > > +             r = vhost_get_vq_desc_n(vq, vq->iov + seg,
> > > +                                     ARRAY_SIZE(vq->iov) - seg, &out,
> > > +                                     &in, log, log_num, &desc_num);
> > >               if (unlikely(r < 0))
> > >                       goto err;
> > >
> > > @@ -1093,6 +1099,7 @@ static int get_rx_bufs(struct vhost_net_virtqueue *nvq,
> > >               ++headcount;
> > >               datalen -= len;
> > >               seg += in;
> > > +             n += desc_num;
> > >       }
> > >
> > >       *iovcount = seg;
> > > @@ -1113,9 +1120,11 @@ static int get_rx_bufs(struct vhost_net_virtqueue *nvq,
> > >               nheads[0] = headcount;
> > >       }
> > >
> > > +     *ndesc = n;
> > > +
> > >       return headcount;
> > >  err:
> > > -     vhost_discard_vq_desc(vq, headcount);
> > > +     vhost_discard_vq_desc(vq, headcount, n);
> >
> > So here ndesc and n are the same, but below in vhost_discard_vq_desc
> > they are different. Fun.
> 
> Not necessarily the same, a buffer could contain more than 1 descriptor.


*ndesc = n kinda guarantees it's the same, no?

> >
> > >       return r;
> > >  }
> > >
> > > @@ -1151,6 +1160,7 @@ static void handle_rx(struct vhost_net *net)
> > >       struct iov_iter fixup;
> > >       __virtio16 num_buffers;
> > >       int recv_pkts = 0;
> > > +     unsigned int ndesc;
> > >
> > >       mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_RX);
> > >       sock = vhost_vq_get_backend(vq);
> > > @@ -1182,7 +1192,8 @@ static void handle_rx(struct vhost_net *net)
> > >               headcount = get_rx_bufs(nvq, vq->heads + count,
> > >                                       vq->nheads + count,
> > >                                       vhost_len, &in, vq_log, &log,
> > > -                                     likely(mergeable) ? UIO_MAXIOV : 1);
> > > +                                     likely(mergeable) ? UIO_MAXIOV : 1,
> > > +                                     &ndesc);
> > >               /* On error, stop handling until the next kick. */
> > >               if (unlikely(headcount < 0))
> > >                       goto out;
> > > @@ -1228,7 +1239,7 @@ static void handle_rx(struct vhost_net *net)
> > >               if (unlikely(err != sock_len)) {
> > >                       pr_debug("Discarded rx packet: "
> > >                                " len %d, expected %zd\n", err, sock_len);
> > > -                     vhost_discard_vq_desc(vq, headcount);
> > > +                     vhost_discard_vq_desc(vq, headcount, ndesc);
> > >                       continue;
> > >               }
> > >               /* Supply virtio_net_hdr if VHOST_NET_F_VIRTIO_NET_HDR */
> > > @@ -1252,7 +1263,7 @@ static void handle_rx(struct vhost_net *net)
> > >                   copy_to_iter(&num_buffers, sizeof num_buffers,
> > >                                &fixup) != sizeof num_buffers) {
> > >                       vq_err(vq, "Failed num_buffers write");
> > > -                     vhost_discard_vq_desc(vq, headcount);
> > > +                     vhost_discard_vq_desc(vq, headcount, ndesc);
> > >                       goto out;
> > >               }
> > >               nvq->done_idx += headcount;
> > > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> > > index 8570fdf2e14a..b56568807588 100644
> > > --- a/drivers/vhost/vhost.c
> > > +++ b/drivers/vhost/vhost.c
> > > @@ -2792,18 +2792,11 @@ static int get_indirect(struct vhost_virtqueue *vq,
> > >       return 0;
> > >  }
> > >
> > > -/* This looks in the virtqueue and for the first available buffer, and converts
> > > - * it to an iovec for convenient access.  Since descriptors consist of some
> > > - * number of output then some number of input descriptors, it's actually two
> > > - * iovecs, but we pack them into one and note how many of each there were.
> > > - *
> > > - * This function returns the descriptor number found, or vq->num (which is
> > > - * never a valid descriptor number) if none was found.  A negative code is
> > > - * returned on error. */
> >
> > A new module API with no docs at all is not good.
> > Please add documentation to this one. vhost_get_vq_desc
> > is a subset and could refer to it.
> 
> Fixed.
> 
> >
> > > -int vhost_get_vq_desc(struct vhost_virtqueue *vq,
> > > -                   struct iovec iov[], unsigned int iov_size,
> > > -                   unsigned int *out_num, unsigned int *in_num,
> > > -                   struct vhost_log *log, unsigned int *log_num)
> > > +int vhost_get_vq_desc_n(struct vhost_virtqueue *vq,
> > > +                     struct iovec iov[], unsigned int iov_size,
> > > +                     unsigned int *out_num, unsigned int *in_num,
> > > +                     struct vhost_log *log, unsigned int *log_num,
> > > +                     unsigned int *ndesc)
> >
> > >  {
> > >       bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER);
> > >       struct vring_desc desc;
> > > @@ -2921,16 +2914,40 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
> > >       vq->last_avail_idx++;
> > >       vq->next_avail_head += c;
> > >
> > > +     if (ndesc)
> > > +             *ndesc = c;
> > > +
> > >       /* Assume notifications from guest are disabled at this point,
> > >        * if they aren't we would need to update avail_event index. */
> > >       BUG_ON(!(vq->used_flags & VRING_USED_F_NO_NOTIFY));
> > >       return head;
> > >  }
> > > +EXPORT_SYMBOL_GPL(vhost_get_vq_desc_n);
> > > +
> > > +/* This looks in the virtqueue and for the first available buffer, and converts
> > > + * it to an iovec for convenient access.  Since descriptors consist of some
> > > + * number of output then some number of input descriptors, it's actually two
> > > + * iovecs, but we pack them into one and note how many of each there were.
> > > + *
> > > + * This function returns the descriptor number found, or vq->num (which is
> > > + * never a valid descriptor number) if none was found.  A negative code is
> > > + * returned on error.
> > > + */
> > > +int vhost_get_vq_desc(struct vhost_virtqueue *vq,
> > > +                   struct iovec iov[], unsigned int iov_size,
> > > +                   unsigned int *out_num, unsigned int *in_num,
> > > +                   struct vhost_log *log, unsigned int *log_num)
> > > +{
> > > +     return vhost_get_vq_desc_n(vq, iov, iov_size, out_num, in_num,
> > > +                                log, log_num, NULL);
> > > +}
> > >  EXPORT_SYMBOL_GPL(vhost_get_vq_desc);
> > >
> > >  /* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */
> > > -void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int n)
> > > +void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int n,
> > > +                        unsigned int ndesc)
> >
> > ndesc is number of descriptors? And n is what, in that case?
> 
> The semantic of n is not changed which is the number of buffers, ndesc
> is the number of descriptors.

History is not that relevant. To make the core readable pls
change the names to readable ones.

Specifically n is really nbufs, maybe?




> >
> >
> > >  {
> > > +     vq->next_avail_head -= ndesc;
> > >       vq->last_avail_idx -= n;
> > >  }
> > >  EXPORT_SYMBOL_GPL(vhost_discard_vq_desc);
> > > diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> > > index 621a6d9a8791..69a39540df3d 100644
> > > --- a/drivers/vhost/vhost.h
> > > +++ b/drivers/vhost/vhost.h
> > > @@ -230,7 +230,14 @@ int vhost_get_vq_desc(struct vhost_virtqueue *,
> > >                     struct iovec iov[], unsigned int iov_size,
> > >                     unsigned int *out_num, unsigned int *in_num,
> > >                     struct vhost_log *log, unsigned int *log_num);
> > > -void vhost_discard_vq_desc(struct vhost_virtqueue *, int n);
> > > +
> > > +int vhost_get_vq_desc_n(struct vhost_virtqueue *vq,
> > > +                     struct iovec iov[], unsigned int iov_size,
> > > +                     unsigned int *out_num, unsigned int *in_num,
> > > +                     struct vhost_log *log, unsigned int *log_num,
> > > +                     unsigned int *ndesc);
> > > +
> > > +void vhost_discard_vq_desc(struct vhost_virtqueue *, int n, unsigned int ndesc);
> > >
> > >  bool vhost_vq_work_queue(struct vhost_virtqueue *vq, struct vhost_work *work);
> > >  bool vhost_vq_has_work(struct vhost_virtqueue *vq);
> > > --
> > > 2.31.1
> >
> 
> Thanks

Re: [PATCH net] vhost: rewind next_avail_head while discarding descriptors
Posted by Jason Wang 2 weeks, 6 days ago
On Fri, Nov 14, 2025 at 2:25 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Fri, Nov 14, 2025 at 09:53:12AM +0800, Jason Wang wrote:
> > On Thu, Nov 13, 2025 at 4:13 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > >
> > > On Thu, Nov 13, 2025 at 09:54:20AM +0800, Jason Wang wrote:
> > > > When discarding descriptors with IN_ORDER, we should rewind
> > > > next_avail_head otherwise it would run out of sync with
> > > > last_avail_idx. This would cause driver to report
> > > > "id X is not a head".
> > > >
> > > > Fixing this by returning the number of descriptors that is used for
> > > > each buffer via vhost_get_vq_desc_n() so caller can use the value
> > > > while discarding descriptors.
> > > >
> > > > Fixes: 67a873df0c41 ("vhost: basic in order support")
> > > > Cc: stable@vger.kernel.org
> > > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > >
> > > Wow that change really caused a lot of fallout.
> > >
> > > Thanks for the patch! Yet something to improve:
> > >
> > >
> > > > ---
> > > >  drivers/vhost/net.c   | 53 ++++++++++++++++++++++++++-----------------
> > > >  drivers/vhost/vhost.c | 43 ++++++++++++++++++++++++-----------
> > > >  drivers/vhost/vhost.h |  9 +++++++-
> > > >  3 files changed, 70 insertions(+), 35 deletions(-)
> > > >
> > > > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> > > > index 35ded4330431..8f7f50acb6d6 100644
> > > > --- a/drivers/vhost/net.c
> > > > +++ b/drivers/vhost/net.c
> > > > @@ -592,14 +592,15 @@ static void vhost_net_busy_poll(struct vhost_net *net,
> > > >  static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
> > > >                                   struct vhost_net_virtqueue *tnvq,
> > > >                                   unsigned int *out_num, unsigned int *in_num,
> > > > -                                 struct msghdr *msghdr, bool *busyloop_intr)
> > > > +                                 struct msghdr *msghdr, bool *busyloop_intr,
> > > > +                                 unsigned int *ndesc)
> > > >  {
> > > >       struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX];
> > > >       struct vhost_virtqueue *rvq = &rnvq->vq;
> > > >       struct vhost_virtqueue *tvq = &tnvq->vq;
> > > >
> > > > -     int r = vhost_get_vq_desc(tvq, tvq->iov, ARRAY_SIZE(tvq->iov),
> > > > -                               out_num, in_num, NULL, NULL);
> > > > +     int r = vhost_get_vq_desc_n(tvq, tvq->iov, ARRAY_SIZE(tvq->iov),
> > > > +                                 out_num, in_num, NULL, NULL, ndesc);
> > > >
> > > >       if (r == tvq->num && tvq->busyloop_timeout) {
> > > >               /* Flush batched packets first */
> > > > @@ -610,8 +611,8 @@ static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
> > > >
> > > >               vhost_net_busy_poll(net, rvq, tvq, busyloop_intr, false);
> > > >
> > > > -             r = vhost_get_vq_desc(tvq, tvq->iov, ARRAY_SIZE(tvq->iov),
> > > > -                                   out_num, in_num, NULL, NULL);
> > > > +             r = vhost_get_vq_desc_n(tvq, tvq->iov, ARRAY_SIZE(tvq->iov),
> > > > +                                     out_num, in_num, NULL, NULL, ndesc);
> > > >       }
> > > >
> > > >       return r;
> > > > @@ -642,12 +643,14 @@ static int get_tx_bufs(struct vhost_net *net,
> > > >                      struct vhost_net_virtqueue *nvq,
> > > >                      struct msghdr *msg,
> > > >                      unsigned int *out, unsigned int *in,
> > > > -                    size_t *len, bool *busyloop_intr)
> > > > +                    size_t *len, bool *busyloop_intr,
> > > > +                    unsigned int *ndesc)
> > > >  {
> > > >       struct vhost_virtqueue *vq = &nvq->vq;
> > > >       int ret;
> > > >
> > > > -     ret = vhost_net_tx_get_vq_desc(net, nvq, out, in, msg, busyloop_intr);
> > > > +     ret = vhost_net_tx_get_vq_desc(net, nvq, out, in, msg,
> > > > +                                    busyloop_intr, ndesc);
> > > >
> > > >       if (ret < 0 || ret == vq->num)
> > > >               return ret;
> > > > @@ -766,6 +769,7 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
> > > >       int sent_pkts = 0;
> > > >       bool sock_can_batch = (sock->sk->sk_sndbuf == INT_MAX);
> > > >       bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER);
> > > > +     unsigned int ndesc = 0;
> > > >
> > > >       do {
> > > >               bool busyloop_intr = false;
> > > > @@ -774,7 +778,7 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
> > > >                       vhost_tx_batch(net, nvq, sock, &msg);
> > > >
> > > >               head = get_tx_bufs(net, nvq, &msg, &out, &in, &len,
> > > > -                                &busyloop_intr);
> > > > +                                &busyloop_intr, &ndesc);
> > > >               /* On error, stop handling until the next kick. */
> > > >               if (unlikely(head < 0))
> > > >                       break;
> > > > @@ -806,7 +810,7 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
> > > >                               goto done;
> > > >                       } else if (unlikely(err != -ENOSPC)) {
> > > >                               vhost_tx_batch(net, nvq, sock, &msg);
> > > > -                             vhost_discard_vq_desc(vq, 1);
> > > > +                             vhost_discard_vq_desc(vq, 1, ndesc);
> > > >                               vhost_net_enable_vq(net, vq);
> > > >                               break;
> > > >                       }
> > > > @@ -829,7 +833,7 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
> > > >               err = sock->ops->sendmsg(sock, &msg, len);
> > > >               if (unlikely(err < 0)) {
> > > >                       if (err == -EAGAIN || err == -ENOMEM || err == -ENOBUFS) {
> > > > -                             vhost_discard_vq_desc(vq, 1);
> > > > +                             vhost_discard_vq_desc(vq, 1, ndesc);
> > > >                               vhost_net_enable_vq(net, vq);
> > > >                               break;
> > > >                       }
> > > > @@ -868,6 +872,7 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
> > > >       int err;
> > > >       struct vhost_net_ubuf_ref *ubufs;
> > > >       struct ubuf_info_msgzc *ubuf;
> > > > +     unsigned int ndesc = 0;
> > > >       bool zcopy_used;
> > > >       int sent_pkts = 0;
> > > >
> > > > @@ -879,7 +884,7 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
> > > >
> > > >               busyloop_intr = false;
> > > >               head = get_tx_bufs(net, nvq, &msg, &out, &in, &len,
> > > > -                                &busyloop_intr);
> > > > +                                &busyloop_intr, &ndesc);
> > > >               /* On error, stop handling until the next kick. */
> > > >               if (unlikely(head < 0))
> > > >                       break;
> > > > @@ -941,7 +946,7 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
> > > >                                       vq->heads[ubuf->desc].len = VHOST_DMA_DONE_LEN;
> > > >                       }
> > > >                       if (retry) {
> > > > -                             vhost_discard_vq_desc(vq, 1);
> > > > +                             vhost_discard_vq_desc(vq, 1, ndesc);
> > > >                               vhost_net_enable_vq(net, vq);
> > > >                               break;
> > > >                       }
> > > > @@ -1045,11 +1050,12 @@ static int get_rx_bufs(struct vhost_net_virtqueue *nvq,
> > > >                      unsigned *iovcount,
> > > >                      struct vhost_log *log,
> > > >                      unsigned *log_num,
> > > > -                    unsigned int quota)
> > > > +                    unsigned int quota,
> > > > +                    unsigned int *ndesc)
> > > >  {
> > > >       struct vhost_virtqueue *vq = &nvq->vq;
> > > >       bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER);
> > > > -     unsigned int out, in;
> > > > +     unsigned int out, in, desc_num, n = 0;
> > > >       int seg = 0;
> > > >       int headcount = 0;
> > > >       unsigned d;
> > > > @@ -1064,9 +1070,9 @@ static int get_rx_bufs(struct vhost_net_virtqueue *nvq,
> > > >                       r = -ENOBUFS;
> > > >                       goto err;
> > > >               }
> > > > -             r = vhost_get_vq_desc(vq, vq->iov + seg,
> > > > -                                   ARRAY_SIZE(vq->iov) - seg, &out,
> > > > -                                   &in, log, log_num);
> > > > +             r = vhost_get_vq_desc_n(vq, vq->iov + seg,
> > > > +                                     ARRAY_SIZE(vq->iov) - seg, &out,
> > > > +                                     &in, log, log_num, &desc_num);
> > > >               if (unlikely(r < 0))
> > > >                       goto err;
> > > >
> > > > @@ -1093,6 +1099,7 @@ static int get_rx_bufs(struct vhost_net_virtqueue *nvq,
> > > >               ++headcount;
> > > >               datalen -= len;
> > > >               seg += in;
> > > > +             n += desc_num;
> > > >       }
> > > >
> > > >       *iovcount = seg;
> > > > @@ -1113,9 +1120,11 @@ static int get_rx_bufs(struct vhost_net_virtqueue *nvq,
> > > >               nheads[0] = headcount;
> > > >       }
> > > >
> > > > +     *ndesc = n;
> > > > +
> > > >       return headcount;
> > > >  err:
> > > > -     vhost_discard_vq_desc(vq, headcount);
> > > > +     vhost_discard_vq_desc(vq, headcount, n);
> > >
> > > So here ndesc and n are the same, but below in vhost_discard_vq_desc
> > > they are different. Fun.
> >
> > Not necessarily the same, a buffer could contain more than 1 descriptor.
>
>
> *ndesc = n kinda guarantees it's the same, no?

I misread your comment, in the error path the ndesc is left unused.
Would this be a problem?

>
> > >
> > > >       return r;
> > > >  }
> > > >
> > > > @@ -1151,6 +1160,7 @@ static void handle_rx(struct vhost_net *net)
> > > >       struct iov_iter fixup;
> > > >       __virtio16 num_buffers;
> > > >       int recv_pkts = 0;
> > > > +     unsigned int ndesc;
> > > >
> > > >       mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_RX);
> > > >       sock = vhost_vq_get_backend(vq);
> > > > @@ -1182,7 +1192,8 @@ static void handle_rx(struct vhost_net *net)
> > > >               headcount = get_rx_bufs(nvq, vq->heads + count,
> > > >                                       vq->nheads + count,
> > > >                                       vhost_len, &in, vq_log, &log,
> > > > -                                     likely(mergeable) ? UIO_MAXIOV : 1);
> > > > +                                     likely(mergeable) ? UIO_MAXIOV : 1,
> > > > +                                     &ndesc);
> > > >               /* On error, stop handling until the next kick. */
> > > >               if (unlikely(headcount < 0))
> > > >                       goto out;
> > > > @@ -1228,7 +1239,7 @@ static void handle_rx(struct vhost_net *net)
> > > >               if (unlikely(err != sock_len)) {
> > > >                       pr_debug("Discarded rx packet: "
> > > >                                " len %d, expected %zd\n", err, sock_len);
> > > > -                     vhost_discard_vq_desc(vq, headcount);
> > > > +                     vhost_discard_vq_desc(vq, headcount, ndesc);
> > > >                       continue;
> > > >               }
> > > >               /* Supply virtio_net_hdr if VHOST_NET_F_VIRTIO_NET_HDR */
> > > > @@ -1252,7 +1263,7 @@ static void handle_rx(struct vhost_net *net)
> > > >                   copy_to_iter(&num_buffers, sizeof num_buffers,
> > > >                                &fixup) != sizeof num_buffers) {
> > > >                       vq_err(vq, "Failed num_buffers write");
> > > > -                     vhost_discard_vq_desc(vq, headcount);
> > > > +                     vhost_discard_vq_desc(vq, headcount, ndesc);
> > > >                       goto out;
> > > >               }
> > > >               nvq->done_idx += headcount;
> > > > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> > > > index 8570fdf2e14a..b56568807588 100644
> > > > --- a/drivers/vhost/vhost.c
> > > > +++ b/drivers/vhost/vhost.c
> > > > @@ -2792,18 +2792,11 @@ static int get_indirect(struct vhost_virtqueue *vq,
> > > >       return 0;
> > > >  }
> > > >
> > > > -/* This looks in the virtqueue and for the first available buffer, and converts
> > > > - * it to an iovec for convenient access.  Since descriptors consist of some
> > > > - * number of output then some number of input descriptors, it's actually two
> > > > - * iovecs, but we pack them into one and note how many of each there were.
> > > > - *
> > > > - * This function returns the descriptor number found, or vq->num (which is
> > > > - * never a valid descriptor number) if none was found.  A negative code is
> > > > - * returned on error. */
> > >
> > > A new module API with no docs at all is not good.
> > > Please add documentation to this one. vhost_get_vq_desc
> > > is a subset and could refer to it.
> >
> > Fixed.
> >
> > >
> > > > -int vhost_get_vq_desc(struct vhost_virtqueue *vq,
> > > > -                   struct iovec iov[], unsigned int iov_size,
> > > > -                   unsigned int *out_num, unsigned int *in_num,
> > > > -                   struct vhost_log *log, unsigned int *log_num)
> > > > +int vhost_get_vq_desc_n(struct vhost_virtqueue *vq,
> > > > +                     struct iovec iov[], unsigned int iov_size,
> > > > +                     unsigned int *out_num, unsigned int *in_num,
> > > > +                     struct vhost_log *log, unsigned int *log_num,
> > > > +                     unsigned int *ndesc)
> > >
> > > >  {
> > > >       bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER);
> > > >       struct vring_desc desc;
> > > > @@ -2921,16 +2914,40 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
> > > >       vq->last_avail_idx++;
> > > >       vq->next_avail_head += c;
> > > >
> > > > +     if (ndesc)
> > > > +             *ndesc = c;
> > > > +
> > > >       /* Assume notifications from guest are disabled at this point,
> > > >        * if they aren't we would need to update avail_event index. */
> > > >       BUG_ON(!(vq->used_flags & VRING_USED_F_NO_NOTIFY));
> > > >       return head;
> > > >  }
> > > > +EXPORT_SYMBOL_GPL(vhost_get_vq_desc_n);
> > > > +
> > > > +/* This looks in the virtqueue and for the first available buffer, and converts
> > > > + * it to an iovec for convenient access.  Since descriptors consist of some
> > > > + * number of output then some number of input descriptors, it's actually two
> > > > + * iovecs, but we pack them into one and note how many of each there were.
> > > > + *
> > > > + * This function returns the descriptor number found, or vq->num (which is
> > > > + * never a valid descriptor number) if none was found.  A negative code is
> > > > + * returned on error.
> > > > + */
> > > > +int vhost_get_vq_desc(struct vhost_virtqueue *vq,
> > > > +                   struct iovec iov[], unsigned int iov_size,
> > > > +                   unsigned int *out_num, unsigned int *in_num,
> > > > +                   struct vhost_log *log, unsigned int *log_num)
> > > > +{
> > > > +     return vhost_get_vq_desc_n(vq, iov, iov_size, out_num, in_num,
> > > > +                                log, log_num, NULL);
> > > > +}
> > > >  EXPORT_SYMBOL_GPL(vhost_get_vq_desc);
> > > >
> > > >  /* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */
> > > > -void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int n)
> > > > +void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int n,
> > > > +                        unsigned int ndesc)
> > >
> > > ndesc is number of descriptors? And n is what, in that case?
> >
> > The semantic of n is not changed which is the number of buffers, ndesc
> > is the number of descriptors.
>
> History is not that relevant. To make the core readable pls
> change the names to readable ones.
>
> Specifically n is really nbufs, maybe?

Right.

Thanks
Re: [PATCH net] vhost: rewind next_avail_head while discarding descriptors
Posted by Michael S. Tsirkin 2 weeks, 6 days ago
On Mon, Nov 17, 2025 at 12:26:51PM +0800, Jason Wang wrote:
> On Fri, Nov 14, 2025 at 2:25 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> >
> > On Fri, Nov 14, 2025 at 09:53:12AM +0800, Jason Wang wrote:
> > > On Thu, Nov 13, 2025 at 4:13 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > >
> > > > On Thu, Nov 13, 2025 at 09:54:20AM +0800, Jason Wang wrote:
> > > > > When discarding descriptors with IN_ORDER, we should rewind
> > > > > next_avail_head otherwise it would run out of sync with
> > > > > last_avail_idx. This would cause driver to report
> > > > > "id X is not a head".
> > > > >
> > > > > Fixing this by returning the number of descriptors that is used for
> > > > > each buffer via vhost_get_vq_desc_n() so caller can use the value
> > > > > while discarding descriptors.
> > > > >
> > > > > Fixes: 67a873df0c41 ("vhost: basic in order support")
> > > > > Cc: stable@vger.kernel.org
> > > > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > > >
> > > > Wow that change really caused a lot of fallout.
> > > >
> > > > Thanks for the patch! Yet something to improve:
> > > >
> > > >
> > > > > ---
> > > > >  drivers/vhost/net.c   | 53 ++++++++++++++++++++++++++-----------------
> > > > >  drivers/vhost/vhost.c | 43 ++++++++++++++++++++++++-----------
> > > > >  drivers/vhost/vhost.h |  9 +++++++-
> > > > >  3 files changed, 70 insertions(+), 35 deletions(-)
> > > > >
> > > > > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> > > > > index 35ded4330431..8f7f50acb6d6 100644
> > > > > --- a/drivers/vhost/net.c
> > > > > +++ b/drivers/vhost/net.c
> > > > > @@ -592,14 +592,15 @@ static void vhost_net_busy_poll(struct vhost_net *net,
> > > > >  static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
> > > > >                                   struct vhost_net_virtqueue *tnvq,
> > > > >                                   unsigned int *out_num, unsigned int *in_num,
> > > > > -                                 struct msghdr *msghdr, bool *busyloop_intr)
> > > > > +                                 struct msghdr *msghdr, bool *busyloop_intr,
> > > > > +                                 unsigned int *ndesc)
> > > > >  {
> > > > >       struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX];
> > > > >       struct vhost_virtqueue *rvq = &rnvq->vq;
> > > > >       struct vhost_virtqueue *tvq = &tnvq->vq;
> > > > >
> > > > > -     int r = vhost_get_vq_desc(tvq, tvq->iov, ARRAY_SIZE(tvq->iov),
> > > > > -                               out_num, in_num, NULL, NULL);
> > > > > +     int r = vhost_get_vq_desc_n(tvq, tvq->iov, ARRAY_SIZE(tvq->iov),
> > > > > +                                 out_num, in_num, NULL, NULL, ndesc);
> > > > >
> > > > >       if (r == tvq->num && tvq->busyloop_timeout) {
> > > > >               /* Flush batched packets first */
> > > > > @@ -610,8 +611,8 @@ static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
> > > > >
> > > > >               vhost_net_busy_poll(net, rvq, tvq, busyloop_intr, false);
> > > > >
> > > > > -             r = vhost_get_vq_desc(tvq, tvq->iov, ARRAY_SIZE(tvq->iov),
> > > > > -                                   out_num, in_num, NULL, NULL);
> > > > > +             r = vhost_get_vq_desc_n(tvq, tvq->iov, ARRAY_SIZE(tvq->iov),
> > > > > +                                     out_num, in_num, NULL, NULL, ndesc);
> > > > >       }
> > > > >
> > > > >       return r;
> > > > > @@ -642,12 +643,14 @@ static int get_tx_bufs(struct vhost_net *net,
> > > > >                      struct vhost_net_virtqueue *nvq,
> > > > >                      struct msghdr *msg,
> > > > >                      unsigned int *out, unsigned int *in,
> > > > > -                    size_t *len, bool *busyloop_intr)
> > > > > +                    size_t *len, bool *busyloop_intr,
> > > > > +                    unsigned int *ndesc)
> > > > >  {
> > > > >       struct vhost_virtqueue *vq = &nvq->vq;
> > > > >       int ret;
> > > > >
> > > > > -     ret = vhost_net_tx_get_vq_desc(net, nvq, out, in, msg, busyloop_intr);
> > > > > +     ret = vhost_net_tx_get_vq_desc(net, nvq, out, in, msg,
> > > > > +                                    busyloop_intr, ndesc);
> > > > >
> > > > >       if (ret < 0 || ret == vq->num)
> > > > >               return ret;
> > > > > @@ -766,6 +769,7 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
> > > > >       int sent_pkts = 0;
> > > > >       bool sock_can_batch = (sock->sk->sk_sndbuf == INT_MAX);
> > > > >       bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER);
> > > > > +     unsigned int ndesc = 0;
> > > > >
> > > > >       do {
> > > > >               bool busyloop_intr = false;
> > > > > @@ -774,7 +778,7 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
> > > > >                       vhost_tx_batch(net, nvq, sock, &msg);
> > > > >
> > > > >               head = get_tx_bufs(net, nvq, &msg, &out, &in, &len,
> > > > > -                                &busyloop_intr);
> > > > > +                                &busyloop_intr, &ndesc);
> > > > >               /* On error, stop handling until the next kick. */
> > > > >               if (unlikely(head < 0))
> > > > >                       break;
> > > > > @@ -806,7 +810,7 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
> > > > >                               goto done;
> > > > >                       } else if (unlikely(err != -ENOSPC)) {
> > > > >                               vhost_tx_batch(net, nvq, sock, &msg);
> > > > > -                             vhost_discard_vq_desc(vq, 1);
> > > > > +                             vhost_discard_vq_desc(vq, 1, ndesc);
> > > > >                               vhost_net_enable_vq(net, vq);
> > > > >                               break;
> > > > >                       }
> > > > > @@ -829,7 +833,7 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
> > > > >               err = sock->ops->sendmsg(sock, &msg, len);
> > > > >               if (unlikely(err < 0)) {
> > > > >                       if (err == -EAGAIN || err == -ENOMEM || err == -ENOBUFS) {
> > > > > -                             vhost_discard_vq_desc(vq, 1);
> > > > > +                             vhost_discard_vq_desc(vq, 1, ndesc);
> > > > >                               vhost_net_enable_vq(net, vq);
> > > > >                               break;
> > > > >                       }
> > > > > @@ -868,6 +872,7 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
> > > > >       int err;
> > > > >       struct vhost_net_ubuf_ref *ubufs;
> > > > >       struct ubuf_info_msgzc *ubuf;
> > > > > +     unsigned int ndesc = 0;
> > > > >       bool zcopy_used;
> > > > >       int sent_pkts = 0;
> > > > >
> > > > > @@ -879,7 +884,7 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
> > > > >
> > > > >               busyloop_intr = false;
> > > > >               head = get_tx_bufs(net, nvq, &msg, &out, &in, &len,
> > > > > -                                &busyloop_intr);
> > > > > +                                &busyloop_intr, &ndesc);
> > > > >               /* On error, stop handling until the next kick. */
> > > > >               if (unlikely(head < 0))
> > > > >                       break;
> > > > > @@ -941,7 +946,7 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
> > > > >                                       vq->heads[ubuf->desc].len = VHOST_DMA_DONE_LEN;
> > > > >                       }
> > > > >                       if (retry) {
> > > > > -                             vhost_discard_vq_desc(vq, 1);
> > > > > +                             vhost_discard_vq_desc(vq, 1, ndesc);
> > > > >                               vhost_net_enable_vq(net, vq);
> > > > >                               break;
> > > > >                       }
> > > > > @@ -1045,11 +1050,12 @@ static int get_rx_bufs(struct vhost_net_virtqueue *nvq,
> > > > >                      unsigned *iovcount,
> > > > >                      struct vhost_log *log,
> > > > >                      unsigned *log_num,
> > > > > -                    unsigned int quota)
> > > > > +                    unsigned int quota,
> > > > > +                    unsigned int *ndesc)
> > > > >  {
> > > > >       struct vhost_virtqueue *vq = &nvq->vq;
> > > > >       bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER);
> > > > > -     unsigned int out, in;
> > > > > +     unsigned int out, in, desc_num, n = 0;
> > > > >       int seg = 0;
> > > > >       int headcount = 0;
> > > > >       unsigned d;
> > > > > @@ -1064,9 +1070,9 @@ static int get_rx_bufs(struct vhost_net_virtqueue *nvq,
> > > > >                       r = -ENOBUFS;
> > > > >                       goto err;
> > > > >               }
> > > > > -             r = vhost_get_vq_desc(vq, vq->iov + seg,
> > > > > -                                   ARRAY_SIZE(vq->iov) - seg, &out,
> > > > > -                                   &in, log, log_num);
> > > > > +             r = vhost_get_vq_desc_n(vq, vq->iov + seg,
> > > > > +                                     ARRAY_SIZE(vq->iov) - seg, &out,
> > > > > +                                     &in, log, log_num, &desc_num);
> > > > >               if (unlikely(r < 0))
> > > > >                       goto err;
> > > > >
> > > > > @@ -1093,6 +1099,7 @@ static int get_rx_bufs(struct vhost_net_virtqueue *nvq,
> > > > >               ++headcount;
> > > > >               datalen -= len;
> > > > >               seg += in;
> > > > > +             n += desc_num;
> > > > >       }
> > > > >
> > > > >       *iovcount = seg;
> > > > > @@ -1113,9 +1120,11 @@ static int get_rx_bufs(struct vhost_net_virtqueue *nvq,
> > > > >               nheads[0] = headcount;
> > > > >       }
> > > > >
> > > > > +     *ndesc = n;
> > > > > +
> > > > >       return headcount;
> > > > >  err:
> > > > > -     vhost_discard_vq_desc(vq, headcount);
> > > > > +     vhost_discard_vq_desc(vq, headcount, n);
> > > >
> > > > So here ndesc and n are the same, but below in vhost_discard_vq_desc
> > > > they are different. Fun.
> > >
> > > Not necessarily the same, a buffer could contain more than 1 descriptor.
> >
> >
> > *ndesc = n kinda guarantees it's the same, no?
> 
> I misread your comment, in the error path the ndesc is left unused.




> Would this be a problem?
> >
> > > >
> > > > >       return r;
> > > > >  }
> > > > >
> > > > > @@ -1151,6 +1160,7 @@ static void handle_rx(struct vhost_net *net)
> > > > >       struct iov_iter fixup;
> > > > >       __virtio16 num_buffers;
> > > > >       int recv_pkts = 0;
> > > > > +     unsigned int ndesc;
> > > > >
> > > > >       mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_RX);
> > > > >       sock = vhost_vq_get_backend(vq);
> > > > > @@ -1182,7 +1192,8 @@ static void handle_rx(struct vhost_net *net)
> > > > >               headcount = get_rx_bufs(nvq, vq->heads + count,
> > > > >                                       vq->nheads + count,
> > > > >                                       vhost_len, &in, vq_log, &log,
> > > > > -                                     likely(mergeable) ? UIO_MAXIOV : 1);
> > > > > +                                     likely(mergeable) ? UIO_MAXIOV : 1,
> > > > > +                                     &ndesc);
> > > > >               /* On error, stop handling until the next kick. */
> > > > >               if (unlikely(headcount < 0))
> > > > >                       goto out;
> > > > > @@ -1228,7 +1239,7 @@ static void handle_rx(struct vhost_net *net)
> > > > >               if (unlikely(err != sock_len)) {
> > > > >                       pr_debug("Discarded rx packet: "
> > > > >                                " len %d, expected %zd\n", err, sock_len);
> > > > > -                     vhost_discard_vq_desc(vq, headcount);
> > > > > +                     vhost_discard_vq_desc(vq, headcount, ndesc);
> > > > >                       continue;
> > > > >               }
> > > > >               /* Supply virtio_net_hdr if VHOST_NET_F_VIRTIO_NET_HDR */
> > > > > @@ -1252,7 +1263,7 @@ static void handle_rx(struct vhost_net *net)
> > > > >                   copy_to_iter(&num_buffers, sizeof num_buffers,
> > > > >                                &fixup) != sizeof num_buffers) {
> > > > >                       vq_err(vq, "Failed num_buffers write");
> > > > > -                     vhost_discard_vq_desc(vq, headcount);
> > > > > +                     vhost_discard_vq_desc(vq, headcount, ndesc);
> > > > >                       goto out;
> > > > >               }
> > > > >               nvq->done_idx += headcount;
> > > > > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> > > > > index 8570fdf2e14a..b56568807588 100644
> > > > > --- a/drivers/vhost/vhost.c
> > > > > +++ b/drivers/vhost/vhost.c
> > > > > @@ -2792,18 +2792,11 @@ static int get_indirect(struct vhost_virtqueue *vq,
> > > > >       return 0;
> > > > >  }
> > > > >
> > > > > -/* This looks in the virtqueue and for the first available buffer, and converts
> > > > > - * it to an iovec for convenient access.  Since descriptors consist of some
> > > > > - * number of output then some number of input descriptors, it's actually two
> > > > > - * iovecs, but we pack them into one and note how many of each there were.
> > > > > - *
> > > > > - * This function returns the descriptor number found, or vq->num (which is
> > > > > - * never a valid descriptor number) if none was found.  A negative code is
> > > > > - * returned on error. */
> > > >
> > > > A new module API with no docs at all is not good.
> > > > Please add documentation to this one. vhost_get_vq_desc
> > > > is a subset and could refer to it.
> > >
> > > Fixed.
> > >
> > > >
> > > > > -int vhost_get_vq_desc(struct vhost_virtqueue *vq,
> > > > > -                   struct iovec iov[], unsigned int iov_size,
> > > > > -                   unsigned int *out_num, unsigned int *in_num,
> > > > > -                   struct vhost_log *log, unsigned int *log_num)
> > > > > +int vhost_get_vq_desc_n(struct vhost_virtqueue *vq,
> > > > > +                     struct iovec iov[], unsigned int iov_size,
> > > > > +                     unsigned int *out_num, unsigned int *in_num,
> > > > > +                     struct vhost_log *log, unsigned int *log_num,
> > > > > +                     unsigned int *ndesc)
> > > >
> > > > >  {
> > > > >       bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER);
> > > > >       struct vring_desc desc;
> > > > > @@ -2921,16 +2914,40 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
> > > > >       vq->last_avail_idx++;
> > > > >       vq->next_avail_head += c;
> > > > >
> > > > > +     if (ndesc)
> > > > > +             *ndesc = c;
> > > > > +
> > > > >       /* Assume notifications from guest are disabled at this point,
> > > > >        * if they aren't we would need to update avail_event index. */
> > > > >       BUG_ON(!(vq->used_flags & VRING_USED_F_NO_NOTIFY));
> > > > >       return head;
> > > > >  }
> > > > > +EXPORT_SYMBOL_GPL(vhost_get_vq_desc_n);
> > > > > +
> > > > > +/* This looks in the virtqueue and for the first available buffer, and converts
> > > > > + * it to an iovec for convenient access.  Since descriptors consist of some
> > > > > + * number of output then some number of input descriptors, it's actually two
> > > > > + * iovecs, but we pack them into one and note how many of each there were.
> > > > > + *
> > > > > + * This function returns the descriptor number found, or vq->num (which is
> > > > > + * never a valid descriptor number) if none was found.  A negative code is
> > > > > + * returned on error.
> > > > > + */
> > > > > +int vhost_get_vq_desc(struct vhost_virtqueue *vq,
> > > > > +                   struct iovec iov[], unsigned int iov_size,
> > > > > +                   unsigned int *out_num, unsigned int *in_num,
> > > > > +                   struct vhost_log *log, unsigned int *log_num)
> > > > > +{
> > > > > +     return vhost_get_vq_desc_n(vq, iov, iov_size, out_num, in_num,
> > > > > +                                log, log_num, NULL);
> > > > > +}
> > > > >  EXPORT_SYMBOL_GPL(vhost_get_vq_desc);
> > > > >
> > > > >  /* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */
> > > > > -void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int n)
> > > > > +void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int n,
> > > > > +                        unsigned int ndesc)
> > > >
> > > > ndesc is number of descriptors? And n is what, in that case?
> > >
> > > The semantic of n is not changed which is the number of buffers, ndesc
> > > is the number of descriptors.
> >
> > History is not that relevant. To make the core readable pls
> > change the names to readable ones.
> >
> > Specifically n is really nbufs, maybe?
> 
> Right.
> 
> Thanks

All I am asking for is that in the API the parameter is named in a way
that makes it clear what it is counting.

vhost_get_vq_desc_n is the function you want to document, making it
clear what is returned in ndesc and how it's different from the return
value.



-- 
MST

Re: [PATCH net] vhost: rewind next_avail_head while discarding descriptors
Posted by Jason Wang 2 weeks, 4 days ago
On Mon, Nov 17, 2025 at 4:49 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Mon, Nov 17, 2025 at 12:26:51PM +0800, Jason Wang wrote:
> > On Fri, Nov 14, 2025 at 2:25 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > >
> > > On Fri, Nov 14, 2025 at 09:53:12AM +0800, Jason Wang wrote:
> > > > On Thu, Nov 13, 2025 at 4:13 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > >
> > > > > On Thu, Nov 13, 2025 at 09:54:20AM +0800, Jason Wang wrote:
> > > > > > When discarding descriptors with IN_ORDER, we should rewind
> > > > > > next_avail_head otherwise it would run out of sync with
> > > > > > last_avail_idx. This would cause driver to report
> > > > > > "id X is not a head".
> > > > > >
> > > > > > Fixing this by returning the number of descriptors that is used for
> > > > > > each buffer via vhost_get_vq_desc_n() so caller can use the value
> > > > > > while discarding descriptors.
> > > > > >
> > > > > > Fixes: 67a873df0c41 ("vhost: basic in order support")
> > > > > > Cc: stable@vger.kernel.org
> > > > > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > > > >
> > > > > Wow that change really caused a lot of fallout.
> > > > >
> > > > > Thanks for the patch! Yet something to improve:
> > > > >
> > > > >
> > > > > > ---
> > > > > >  drivers/vhost/net.c   | 53 ++++++++++++++++++++++++++-----------------
> > > > > >  drivers/vhost/vhost.c | 43 ++++++++++++++++++++++++-----------
> > > > > >  drivers/vhost/vhost.h |  9 +++++++-
> > > > > >  3 files changed, 70 insertions(+), 35 deletions(-)
> > > > > >
> > > > > > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> > > > > > index 35ded4330431..8f7f50acb6d6 100644
> > > > > > --- a/drivers/vhost/net.c
> > > > > > +++ b/drivers/vhost/net.c
> > > > > > @@ -592,14 +592,15 @@ static void vhost_net_busy_poll(struct vhost_net *net,
> > > > > >  static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
> > > > > >                                   struct vhost_net_virtqueue *tnvq,
> > > > > >                                   unsigned int *out_num, unsigned int *in_num,
> > > > > > -                                 struct msghdr *msghdr, bool *busyloop_intr)
> > > > > > +                                 struct msghdr *msghdr, bool *busyloop_intr,
> > > > > > +                                 unsigned int *ndesc)
> > > > > >  {
> > > > > >       struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX];
> > > > > >       struct vhost_virtqueue *rvq = &rnvq->vq;
> > > > > >       struct vhost_virtqueue *tvq = &tnvq->vq;
> > > > > >
> > > > > > -     int r = vhost_get_vq_desc(tvq, tvq->iov, ARRAY_SIZE(tvq->iov),
> > > > > > -                               out_num, in_num, NULL, NULL);
> > > > > > +     int r = vhost_get_vq_desc_n(tvq, tvq->iov, ARRAY_SIZE(tvq->iov),
> > > > > > +                                 out_num, in_num, NULL, NULL, ndesc);
> > > > > >
> > > > > >       if (r == tvq->num && tvq->busyloop_timeout) {
> > > > > >               /* Flush batched packets first */
> > > > > > @@ -610,8 +611,8 @@ static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
> > > > > >
> > > > > >               vhost_net_busy_poll(net, rvq, tvq, busyloop_intr, false);
> > > > > >
> > > > > > -             r = vhost_get_vq_desc(tvq, tvq->iov, ARRAY_SIZE(tvq->iov),
> > > > > > -                                   out_num, in_num, NULL, NULL);
> > > > > > +             r = vhost_get_vq_desc_n(tvq, tvq->iov, ARRAY_SIZE(tvq->iov),
> > > > > > +                                     out_num, in_num, NULL, NULL, ndesc);
> > > > > >       }
> > > > > >
> > > > > >       return r;
> > > > > > @@ -642,12 +643,14 @@ static int get_tx_bufs(struct vhost_net *net,
> > > > > >                      struct vhost_net_virtqueue *nvq,
> > > > > >                      struct msghdr *msg,
> > > > > >                      unsigned int *out, unsigned int *in,
> > > > > > -                    size_t *len, bool *busyloop_intr)
> > > > > > +                    size_t *len, bool *busyloop_intr,
> > > > > > +                    unsigned int *ndesc)
> > > > > >  {
> > > > > >       struct vhost_virtqueue *vq = &nvq->vq;
> > > > > >       int ret;
> > > > > >
> > > > > > -     ret = vhost_net_tx_get_vq_desc(net, nvq, out, in, msg, busyloop_intr);
> > > > > > +     ret = vhost_net_tx_get_vq_desc(net, nvq, out, in, msg,
> > > > > > +                                    busyloop_intr, ndesc);
> > > > > >
> > > > > >       if (ret < 0 || ret == vq->num)
> > > > > >               return ret;
> > > > > > @@ -766,6 +769,7 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
> > > > > >       int sent_pkts = 0;
> > > > > >       bool sock_can_batch = (sock->sk->sk_sndbuf == INT_MAX);
> > > > > >       bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER);
> > > > > > +     unsigned int ndesc = 0;
> > > > > >
> > > > > >       do {
> > > > > >               bool busyloop_intr = false;
> > > > > > @@ -774,7 +778,7 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
> > > > > >                       vhost_tx_batch(net, nvq, sock, &msg);
> > > > > >
> > > > > >               head = get_tx_bufs(net, nvq, &msg, &out, &in, &len,
> > > > > > -                                &busyloop_intr);
> > > > > > +                                &busyloop_intr, &ndesc);
> > > > > >               /* On error, stop handling until the next kick. */
> > > > > >               if (unlikely(head < 0))
> > > > > >                       break;
> > > > > > @@ -806,7 +810,7 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
> > > > > >                               goto done;
> > > > > >                       } else if (unlikely(err != -ENOSPC)) {
> > > > > >                               vhost_tx_batch(net, nvq, sock, &msg);
> > > > > > -                             vhost_discard_vq_desc(vq, 1);
> > > > > > +                             vhost_discard_vq_desc(vq, 1, ndesc);
> > > > > >                               vhost_net_enable_vq(net, vq);
> > > > > >                               break;
> > > > > >                       }
> > > > > > @@ -829,7 +833,7 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
> > > > > >               err = sock->ops->sendmsg(sock, &msg, len);
> > > > > >               if (unlikely(err < 0)) {
> > > > > >                       if (err == -EAGAIN || err == -ENOMEM || err == -ENOBUFS) {
> > > > > > -                             vhost_discard_vq_desc(vq, 1);
> > > > > > +                             vhost_discard_vq_desc(vq, 1, ndesc);
> > > > > >                               vhost_net_enable_vq(net, vq);
> > > > > >                               break;
> > > > > >                       }
> > > > > > @@ -868,6 +872,7 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
> > > > > >       int err;
> > > > > >       struct vhost_net_ubuf_ref *ubufs;
> > > > > >       struct ubuf_info_msgzc *ubuf;
> > > > > > +     unsigned int ndesc = 0;
> > > > > >       bool zcopy_used;
> > > > > >       int sent_pkts = 0;
> > > > > >
> > > > > > @@ -879,7 +884,7 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
> > > > > >
> > > > > >               busyloop_intr = false;
> > > > > >               head = get_tx_bufs(net, nvq, &msg, &out, &in, &len,
> > > > > > -                                &busyloop_intr);
> > > > > > +                                &busyloop_intr, &ndesc);
> > > > > >               /* On error, stop handling until the next kick. */
> > > > > >               if (unlikely(head < 0))
> > > > > >                       break;
> > > > > > @@ -941,7 +946,7 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
> > > > > >                                       vq->heads[ubuf->desc].len = VHOST_DMA_DONE_LEN;
> > > > > >                       }
> > > > > >                       if (retry) {
> > > > > > -                             vhost_discard_vq_desc(vq, 1);
> > > > > > +                             vhost_discard_vq_desc(vq, 1, ndesc);
> > > > > >                               vhost_net_enable_vq(net, vq);
> > > > > >                               break;
> > > > > >                       }
> > > > > > @@ -1045,11 +1050,12 @@ static int get_rx_bufs(struct vhost_net_virtqueue *nvq,
> > > > > >                      unsigned *iovcount,
> > > > > >                      struct vhost_log *log,
> > > > > >                      unsigned *log_num,
> > > > > > -                    unsigned int quota)
> > > > > > +                    unsigned int quota,
> > > > > > +                    unsigned int *ndesc)
> > > > > >  {
> > > > > >       struct vhost_virtqueue *vq = &nvq->vq;
> > > > > >       bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER);
> > > > > > -     unsigned int out, in;
> > > > > > +     unsigned int out, in, desc_num, n = 0;
> > > > > >       int seg = 0;
> > > > > >       int headcount = 0;
> > > > > >       unsigned d;
> > > > > > @@ -1064,9 +1070,9 @@ static int get_rx_bufs(struct vhost_net_virtqueue *nvq,
> > > > > >                       r = -ENOBUFS;
> > > > > >                       goto err;
> > > > > >               }
> > > > > > -             r = vhost_get_vq_desc(vq, vq->iov + seg,
> > > > > > -                                   ARRAY_SIZE(vq->iov) - seg, &out,
> > > > > > -                                   &in, log, log_num);
> > > > > > +             r = vhost_get_vq_desc_n(vq, vq->iov + seg,
> > > > > > +                                     ARRAY_SIZE(vq->iov) - seg, &out,
> > > > > > +                                     &in, log, log_num, &desc_num);
> > > > > >               if (unlikely(r < 0))
> > > > > >                       goto err;
> > > > > >
> > > > > > @@ -1093,6 +1099,7 @@ static int get_rx_bufs(struct vhost_net_virtqueue *nvq,
> > > > > >               ++headcount;
> > > > > >               datalen -= len;
> > > > > >               seg += in;
> > > > > > +             n += desc_num;
> > > > > >       }
> > > > > >
> > > > > >       *iovcount = seg;
> > > > > > @@ -1113,9 +1120,11 @@ static int get_rx_bufs(struct vhost_net_virtqueue *nvq,
> > > > > >               nheads[0] = headcount;
> > > > > >       }
> > > > > >
> > > > > > +     *ndesc = n;
> > > > > > +
> > > > > >       return headcount;
> > > > > >  err:
> > > > > > -     vhost_discard_vq_desc(vq, headcount);
> > > > > > +     vhost_discard_vq_desc(vq, headcount, n);
> > > > >
> > > > > So here ndesc and n are the same, but below in vhost_discard_vq_desc
> > > > > they are different. Fun.
> > > >
> > > > Not necessarily the same, a buffer could contain more than 1 descriptor.
> > >
> > >
> > > *ndesc = n kinda guarantees it's the same, no?
> >
> > I misread your comment, in the error path the ndesc is left unused.
>
>
>
>
> > Would this be a problem?
> > >
> > > > >
> > > > > >       return r;
> > > > > >  }
> > > > > >
> > > > > > @@ -1151,6 +1160,7 @@ static void handle_rx(struct vhost_net *net)
> > > > > >       struct iov_iter fixup;
> > > > > >       __virtio16 num_buffers;
> > > > > >       int recv_pkts = 0;
> > > > > > +     unsigned int ndesc;
> > > > > >
> > > > > >       mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_RX);
> > > > > >       sock = vhost_vq_get_backend(vq);
> > > > > > @@ -1182,7 +1192,8 @@ static void handle_rx(struct vhost_net *net)
> > > > > >               headcount = get_rx_bufs(nvq, vq->heads + count,
> > > > > >                                       vq->nheads + count,
> > > > > >                                       vhost_len, &in, vq_log, &log,
> > > > > > -                                     likely(mergeable) ? UIO_MAXIOV : 1);
> > > > > > +                                     likely(mergeable) ? UIO_MAXIOV : 1,
> > > > > > +                                     &ndesc);
> > > > > >               /* On error, stop handling until the next kick. */
> > > > > >               if (unlikely(headcount < 0))
> > > > > >                       goto out;
> > > > > > @@ -1228,7 +1239,7 @@ static void handle_rx(struct vhost_net *net)
> > > > > >               if (unlikely(err != sock_len)) {
> > > > > >                       pr_debug("Discarded rx packet: "
> > > > > >                                " len %d, expected %zd\n", err, sock_len);
> > > > > > -                     vhost_discard_vq_desc(vq, headcount);
> > > > > > +                     vhost_discard_vq_desc(vq, headcount, ndesc);
> > > > > >                       continue;
> > > > > >               }
> > > > > >               /* Supply virtio_net_hdr if VHOST_NET_F_VIRTIO_NET_HDR */
> > > > > > @@ -1252,7 +1263,7 @@ static void handle_rx(struct vhost_net *net)
> > > > > >                   copy_to_iter(&num_buffers, sizeof num_buffers,
> > > > > >                                &fixup) != sizeof num_buffers) {
> > > > > >                       vq_err(vq, "Failed num_buffers write");
> > > > > > -                     vhost_discard_vq_desc(vq, headcount);
> > > > > > +                     vhost_discard_vq_desc(vq, headcount, ndesc);
> > > > > >                       goto out;
> > > > > >               }
> > > > > >               nvq->done_idx += headcount;
> > > > > > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> > > > > > index 8570fdf2e14a..b56568807588 100644
> > > > > > --- a/drivers/vhost/vhost.c
> > > > > > +++ b/drivers/vhost/vhost.c
> > > > > > @@ -2792,18 +2792,11 @@ static int get_indirect(struct vhost_virtqueue *vq,
> > > > > >       return 0;
> > > > > >  }
> > > > > >
> > > > > > -/* This looks in the virtqueue and for the first available buffer, and converts
> > > > > > - * it to an iovec for convenient access.  Since descriptors consist of some
> > > > > > - * number of output then some number of input descriptors, it's actually two
> > > > > > - * iovecs, but we pack them into one and note how many of each there were.
> > > > > > - *
> > > > > > - * This function returns the descriptor number found, or vq->num (which is
> > > > > > - * never a valid descriptor number) if none was found.  A negative code is
> > > > > > - * returned on error. */
> > > > >
> > > > > A new module API with no docs at all is not good.
> > > > > Please add documentation to this one. vhost_get_vq_desc
> > > > > is a subset and could refer to it.
> > > >
> > > > Fixed.
> > > >
> > > > >
> > > > > > -int vhost_get_vq_desc(struct vhost_virtqueue *vq,
> > > > > > -                   struct iovec iov[], unsigned int iov_size,
> > > > > > -                   unsigned int *out_num, unsigned int *in_num,
> > > > > > -                   struct vhost_log *log, unsigned int *log_num)
> > > > > > +int vhost_get_vq_desc_n(struct vhost_virtqueue *vq,
> > > > > > +                     struct iovec iov[], unsigned int iov_size,
> > > > > > +                     unsigned int *out_num, unsigned int *in_num,
> > > > > > +                     struct vhost_log *log, unsigned int *log_num,
> > > > > > +                     unsigned int *ndesc)
> > > > >
> > > > > >  {
> > > > > >       bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER);
> > > > > >       struct vring_desc desc;
> > > > > > @@ -2921,16 +2914,40 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
> > > > > >       vq->last_avail_idx++;
> > > > > >       vq->next_avail_head += c;
> > > > > >
> > > > > > +     if (ndesc)
> > > > > > +             *ndesc = c;
> > > > > > +
> > > > > >       /* Assume notifications from guest are disabled at this point,
> > > > > >        * if they aren't we would need to update avail_event index. */
> > > > > >       BUG_ON(!(vq->used_flags & VRING_USED_F_NO_NOTIFY));
> > > > > >       return head;
> > > > > >  }
> > > > > > +EXPORT_SYMBOL_GPL(vhost_get_vq_desc_n);
> > > > > > +
> > > > > > +/* This looks in the virtqueue and for the first available buffer, and converts
> > > > > > + * it to an iovec for convenient access.  Since descriptors consist of some
> > > > > > + * number of output then some number of input descriptors, it's actually two
> > > > > > + * iovecs, but we pack them into one and note how many of each there were.
> > > > > > + *
> > > > > > + * This function returns the descriptor number found, or vq->num (which is
> > > > > > + * never a valid descriptor number) if none was found.  A negative code is
> > > > > > + * returned on error.
> > > > > > + */
> > > > > > +int vhost_get_vq_desc(struct vhost_virtqueue *vq,
> > > > > > +                   struct iovec iov[], unsigned int iov_size,
> > > > > > +                   unsigned int *out_num, unsigned int *in_num,
> > > > > > +                   struct vhost_log *log, unsigned int *log_num)
> > > > > > +{
> > > > > > +     return vhost_get_vq_desc_n(vq, iov, iov_size, out_num, in_num,
> > > > > > +                                log, log_num, NULL);
> > > > > > +}
> > > > > >  EXPORT_SYMBOL_GPL(vhost_get_vq_desc);
> > > > > >
> > > > > >  /* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */
> > > > > > -void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int n)
> > > > > > +void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int n,
> > > > > > +                        unsigned int ndesc)
> > > > >
> > > > > ndesc is number of descriptors? And n is what, in that case?
> > > >
> > > > The semantic of n is not changed which is the number of buffers, ndesc
> > > > is the number of descriptors.
> > >
> > > History is not that relevant. To make the core readable pls
> > > change the names to readable ones.
> > >
> > > Specifically n is really nbufs, maybe?
> >
> > Right.
> >
> > Thanks
>
> All I am asking for is that in the API the parameter is named in a way
> that makes it clear what it is counting.
>
> vhost_get_vq_desc_n is the function you want to document, making it
> clear what is returned in ndesc and how it's different from the return
> value.

Will do.

Thanks

>
>
>
> --
> MST
>