[v3] virtio_ring in order support

[PATCH V3 19/19] virtio_ring: add in order support

Posted by Jason Wang 7 months, 3 weeks ago

This patch implements in order support for both split virtqueue and
packed virtqueue.

Benchmark with KVM guest + testpmd on the host shows:

For split virtqueue: no obvious differences were noticed

For packed virtqueue:

1) RX gets 3.1% PPS improvements from 6.3 Mpps to 6.5 Mpps
2) TX gets 4.6% PPS improvements from 8.6 Mpps to 9.0 Mpps

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/virtio/virtio_ring.c | 423 +++++++++++++++++++++++++++++++++--
 1 file changed, 402 insertions(+), 21 deletions(-)

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 27a9459a0555..21d456392ba0 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -70,11 +70,14 @@
 enum vq_layout {
 	SPLIT = 0,
 	PACKED,
+	SPLIT_IN_ORDER,
+	PACKED_IN_ORDER,
 	VQ_TYPE_MAX,
 };
 
 struct vring_desc_state_split {
 	void *data;			/* Data for callback. */
+	u32 total_len;			/* Buffer Length */
 
 	/* Indirect desc table and extra table, if any. These two will be
 	 * allocated together. So we won't stress more to the memory allocator.
@@ -84,6 +87,7 @@ struct vring_desc_state_split {
 
 struct vring_desc_state_packed {
 	void *data;			/* Data for callback. */
+	u32 total_len;			/* Buffer Length */
 
 	/* Indirect desc table and extra table, if any. These two will be
 	 * allocated together. So we won't stress more to the memory allocator.
@@ -206,6 +210,12 @@ struct vring_virtqueue {
 
 	/* Head of free buffer list. */
 	unsigned int free_head;
+
+	/* Head of the batched used buffers, vq->num means no batching */
+	unsigned int batch_head;
+
+	unsigned int batch_len;
+
 	/* Number we've added since last sync. */
 	unsigned int num_added;
 
@@ -256,10 +266,14 @@ static void vring_free(struct virtqueue *_vq);
 
 #define to_vvq(_vq) container_of_const(_vq, struct vring_virtqueue, vq)
 
-
 static inline bool virtqueue_is_packed(const struct vring_virtqueue *vq)
 {
-	return vq->layout == PACKED;
+	return vq->layout == PACKED || vq->layout == PACKED_IN_ORDER;
+}
+
+static inline bool virtqueue_is_in_order(const struct vring_virtqueue *vq)
+{
+	return vq->layout == SPLIT_IN_ORDER || vq->layout == PACKED_IN_ORDER;
 }
 
 static bool virtqueue_use_indirect(const struct vring_virtqueue *vq,
@@ -570,7 +584,7 @@ static inline int virtqueue_add_split(struct vring_virtqueue *vq,
 	struct vring_desc_extra *extra;
 	struct scatterlist *sg;
 	struct vring_desc *desc;
-	unsigned int i, n, c, avail, descs_used, err_idx;
+	unsigned int i, n, c, avail, descs_used, err_idx, total_len = 0;
 	int head;
 	bool indirect;
 
@@ -646,6 +660,7 @@ static inline int virtqueue_add_split(struct vring_virtqueue *vq,
 			i = virtqueue_add_desc_split(vq, desc, extra, i, addr, len,
 						     flags,
 						     premapped);
+			total_len += len;
 		}
 	}
 	for (; n < (out_sgs + in_sgs); n++) {
@@ -665,6 +680,7 @@ static inline int virtqueue_add_split(struct vring_virtqueue *vq,
 			 */
 			i = virtqueue_add_desc_split(vq, desc, extra, i, addr, len,
 						     flags, premapped);
+			total_len += len;
 		}
 	}
 
@@ -687,7 +703,12 @@ static inline int virtqueue_add_split(struct vring_virtqueue *vq,
 	vq->vq.num_free -= descs_used;
 
 	/* Update free pointer */
-	if (indirect)
+	if (virtqueue_is_in_order(vq)) {
+		vq->free_head += descs_used;
+		if (vq->free_head >= vq->split.vring.num)
+			vq->free_head -= vq->split.vring.num;
+		vq->split.desc_state[head].total_len = total_len;;
+	} else if (indirect)
 		vq->free_head = vq->split.desc_extra[head].next;
 	else
 		vq->free_head = i;
@@ -860,6 +881,14 @@ static bool more_used_split(const struct vring_virtqueue *vq)
 	return virtqueue_poll_split(vq, vq->last_used_idx);
 }
 
+static bool more_used_split_in_order(const struct vring_virtqueue *vq)
+{
+	if (vq->batch_head != vq->packed.vring.num)
+		return true;
+
+	return virtqueue_poll_split(vq, vq->last_used_idx);
+}
+
 static void *virtqueue_get_buf_ctx_split(struct vring_virtqueue *vq,
 					 unsigned int *len,
 					 void **ctx)
@@ -917,6 +946,73 @@ static void *virtqueue_get_buf_ctx_split(struct vring_virtqueue *vq,
 	return ret;
 }
 
+static void *virtqueue_get_buf_ctx_split_in_order(struct vring_virtqueue *vq,
+						  unsigned int *len,
+						  void **ctx)
+{
+	void *ret;
+	unsigned int num = vq->split.vring.num;
+	u16 last_used;
+
+	START_USE(vq);
+
+	if (unlikely(vq->broken)) {
+		END_USE(vq);
+		return NULL;
+	}
+
+	last_used = (vq->last_used_idx & (vq->split.vring.num - 1));
+
+	if (vq->batch_head == num) {
+		if (!more_used_split(vq)) {
+			pr_debug("No more buffers in queue\n");
+			END_USE(vq);
+			return NULL;
+		}
+
+		/* Only get used array entries after they have been
+		 * exposed by host. */
+		virtio_rmb(vq->weak_barriers);
+		vq->batch_head = virtio32_to_cpu(vq->vq.vdev,
+				 vq->split.vring.used->ring[last_used].id);
+		vq->batch_len = virtio32_to_cpu(vq->vq.vdev,
+				vq->split.vring.used->ring[last_used].len);
+	}
+
+	if (vq->batch_head == last_used) {
+		vq->batch_head = num;
+		*len = vq->batch_len;
+	} else
+		*len = vq->split.desc_state[last_used].total_len;
+
+	if (unlikely(last_used >= num)) {
+		BAD_RING(vq, "id %u out of range\n", last_used);
+		return NULL;
+	}
+	if (unlikely(!vq->split.desc_state[last_used].data)) {
+		BAD_RING(vq, "id %u is not a head!\n", last_used);
+		return NULL;
+	}
+
+	/* detach_buf_split clears data, so grab it now. */
+	ret = vq->split.desc_state[last_used].data;
+	detach_buf_split_in_order(vq, last_used, ctx);
+
+	vq->last_used_idx++;
+	/* If we expect an interrupt for the next entry, tell host
+	 * by writing event index and flush out the write before
+	 * the read in the next get_buf call. */
+	if (!(vq->split.avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT))
+		virtio_store_mb(vq->weak_barriers,
+				&vring_used_event(&vq->split.vring),
+				cpu_to_virtio16(vq->vq.vdev, vq->last_used_idx));
+
+	LAST_ADD_TIME_INVALID(vq);
+
+	END_USE(vq);
+	return ret;
+}
+
 static void virtqueue_disable_cb_split(struct vring_virtqueue *vq)
 {
 	if (!(vq->split.avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT)) {
@@ -1010,7 +1106,10 @@ static void *virtqueue_detach_unused_buf_split(struct vring_virtqueue *vq)
 			continue;
 		/* detach_buf_split clears data, so grab it now. */
 		buf = vq->split.desc_state[i].data;
-		detach_buf_split(vq, i, NULL);
+		if (virtqueue_is_in_order(vq))
+			detach_buf_split_in_order(vq, i, NULL);
+		else
+			detach_buf_split(vq, i, NULL);
 		vq->split.avail_idx_shadow--;
 		vq->split.vring.avail->idx = cpu_to_virtio16(vq->vq.vdev,
 				vq->split.avail_idx_shadow);
@@ -1073,6 +1172,7 @@ static void virtqueue_vring_attach_split(struct vring_virtqueue *vq,
 
 	/* Put everything in free lists. */
 	vq->free_head = 0;
+	vq->batch_head = vq->split.vring.num;
 }
 
 static int vring_alloc_state_extra_split(struct vring_virtqueue_split *vring_split)
@@ -1183,7 +1283,6 @@ static struct virtqueue *__vring_new_virtqueue_split(unsigned int index,
 	if (!vq)
 		return NULL;
 
-	vq->layout = SPLIT;
 	vq->vq.callback = callback;
 	vq->vq.vdev = vdev;
 	vq->vq.name = name;
@@ -1203,6 +1302,8 @@ static struct virtqueue *__vring_new_virtqueue_split(unsigned int index,
 	vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) &&
 		!context;
 	vq->event = virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX);
+	vq->layout = virtio_has_feature(vdev, VIRTIO_F_IN_ORDER) ?
+		     SPLIT_IN_ORDER : SPLIT;
 
 	if (virtio_has_feature(vdev, VIRTIO_F_ORDER_PLATFORM))
 		vq->weak_barriers = false;
@@ -1366,13 +1467,14 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
 					 unsigned int in_sgs,
 					 void *data,
 					 bool premapped,
-					 gfp_t gfp)
+					 gfp_t gfp,
+					 u16 id)
 {
 	struct vring_desc_extra *extra;
 	struct vring_packed_desc *desc;
 	struct scatterlist *sg;
-	unsigned int i, n, err_idx, len;
-	u16 head, id;
+	unsigned int i, n, err_idx, len, total_len = 0;
+	u16 head;
 	dma_addr_t addr;
 
 	head = vq->packed.next_avail_idx;
@@ -1390,8 +1492,6 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
 	}
 
 	i = 0;
-	id = vq->free_head;
-	BUG_ON(id == vq->packed.vring.num);
 
 	for (n = 0; n < out_sgs + in_sgs; n++) {
 		for (sg = sgs[n]; sg; sg = sg_next(sg)) {
@@ -1411,6 +1511,7 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
 				extra[i].flags = n < out_sgs ?  0 : VRING_DESC_F_WRITE;
 			}
 
+			total_len += len;
 			i++;
 		}
 	}
@@ -1464,6 +1565,7 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
 	vq->packed.desc_state[id].data = data;
 	vq->packed.desc_state[id].indir_desc = desc;
 	vq->packed.desc_state[id].last = id;
+	vq->packed.desc_state[id].total_len = total_len;
 
 	vq->num_added += 1;
 
@@ -1516,8 +1618,11 @@ static inline int virtqueue_add_packed(struct vring_virtqueue *vq,
 	BUG_ON(total_sg == 0);
 
 	if (virtqueue_use_indirect(vq, total_sg)) {
+		id = vq->free_head;
+		BUG_ON(id == vq->packed.vring.num);
 		err = virtqueue_add_indirect_packed(vq, sgs, total_sg, out_sgs,
-						    in_sgs, data, premapped, gfp);
+						    in_sgs, data, premapped,
+						    gfp, id);
 		if (err != -ENOMEM) {
 			END_USE(vq);
 			return err;
@@ -1638,6 +1743,152 @@ static inline int virtqueue_add_packed(struct vring_virtqueue *vq,
 	return -EIO;
 }
 
+static inline int virtqueue_add_packed_in_order(struct vring_virtqueue *vq,
+						struct scatterlist *sgs[],
+						unsigned int total_sg,
+						unsigned int out_sgs,
+						unsigned int in_sgs,
+						void *data,
+						void *ctx,
+						bool premapped,
+						gfp_t gfp)
+{
+	struct vring_packed_desc *desc;
+	struct scatterlist *sg;
+	unsigned int i, n, c, err_idx, total_len = 0;
+	__le16 head_flags, flags;
+	u16 head, avail_used_flags;
+	int err;
+
+	START_USE(vq);
+
+	BUG_ON(data == NULL);
+	BUG_ON(ctx && vq->indirect);
+
+	if (unlikely(vq->broken)) {
+		END_USE(vq);
+		return -EIO;
+	}
+
+	LAST_ADD_TIME_UPDATE(vq);
+
+	BUG_ON(total_sg == 0);
+
+	if (virtqueue_use_indirect(vq, total_sg)) {
+		err = virtqueue_add_indirect_packed(vq, sgs, total_sg, out_sgs,
+						    in_sgs, data, premapped, gfp,
+						    vq->packed.next_avail_idx);
+		if (err != -ENOMEM) {
+			END_USE(vq);
+			return err;
+		}
+
+		/* fall back on direct */
+	}
+
+	head = vq->packed.next_avail_idx;
+	avail_used_flags = vq->packed.avail_used_flags;
+
+	WARN_ON_ONCE(total_sg > vq->packed.vring.num && !vq->indirect);
+
+	desc = vq->packed.vring.desc;
+	i = head;
+
+	if (unlikely(vq->vq.num_free < total_sg)) {
+		pr_debug("Can't add buf len %i - avail = %i\n",
+			 total_sg, vq->vq.num_free);
+		END_USE(vq);
+		return -ENOSPC;
+	}
+
+	c = 0;
+	for (n = 0; n < out_sgs + in_sgs; n++) {
+		for (sg = sgs[n]; sg; sg = sg_next(sg)) {
+			dma_addr_t addr;
+			u32 len;
+
+			if (vring_map_one_sg(vq, sg, n < out_sgs ?
+					     DMA_TO_DEVICE : DMA_FROM_DEVICE,
+					     &addr, &len, premapped))
+				goto unmap_release;
+
+			flags = cpu_to_le16(vq->packed.avail_used_flags |
+				    (++c == total_sg ? 0 : VRING_DESC_F_NEXT) |
+				    (n < out_sgs ? 0 : VRING_DESC_F_WRITE));
+			if (i == head)
+				head_flags = flags;
+			else
+				desc[i].flags = flags;
+
+
+			desc[i].addr = cpu_to_le64(addr);
+			desc[i].len = cpu_to_le32(len);
+			desc[i].id = cpu_to_le16(head);
+
+			if (unlikely(vq->use_dma_api)) {
+				vq->packed.desc_extra[i].addr = premapped ?
+				      DMA_MAPPING_ERROR: addr;
+				vq->packed.desc_extra[i].len = len;
+				vq->packed.desc_extra[i].flags =
+					le16_to_cpu(flags);
+			}
+
+			if ((unlikely(++i >= vq->packed.vring.num))) {
+				i = 0;
+				vq->packed.avail_used_flags ^=
+					1 << VRING_PACKED_DESC_F_AVAIL |
+					1 << VRING_PACKED_DESC_F_USED;
+				vq->packed.avail_wrap_counter ^= 1;
+			}
+
+			total_len += len;
+		}
+	}
+
+	/* We're using some buffers from the free list. */
+	vq->vq.num_free -= total_sg;
+
+	/* Update free pointer */
+	vq->packed.next_avail_idx = i;
+
+	/* Store token. */
+	vq->packed.desc_state[head].num = total_sg;
+	vq->packed.desc_state[head].data = data;
+	vq->packed.desc_state[head].indir_desc = ctx;
+	vq->packed.desc_state[head].total_len = total_len;
+
+	/*
+	 * A driver MUST NOT make the first descriptor in the list
+	 * available before all subsequent descriptors comprising
+	 * the list are made available.
+	 */
+	virtio_wmb(vq->weak_barriers);
+	vq->packed.vring.desc[head].flags = head_flags;
+	vq->num_added += total_sg;
+
+	pr_debug("Added buffer head %i to %p\n", head, vq);
+	END_USE(vq);
+
+	return 0;
+
+unmap_release:
+	err_idx = i;
+	i = head;
+	vq->packed.avail_used_flags = avail_used_flags;
+
+	for (n = 0; n < total_sg; n++) {
+		if (i == err_idx)
+			break;
+		vring_unmap_extra_packed(vq, &vq->packed.desc_extra[i]);
+		i++;
+		if (i >= vq->packed.vring.num)
+			i = 0;
+	}
+
+	END_USE(vq);
+	return -EIO;
+}
+
 static bool virtqueue_kick_prepare_packed(struct vring_virtqueue *vq)
 {
 	u16 new, old, off_wrap, flags, wrap_counter, event_idx;
@@ -1758,7 +2009,7 @@ static inline bool is_used_desc_packed(const struct vring_virtqueue *vq,
 	return avail == used && used == used_wrap_counter;
 }
 
-static bool virtqueue_poll_packed(const struct vring_virtqueue *vq, u16 off_wrap)
+static bool __virtqueue_poll_packed(const struct vring_virtqueue *vq, u16 off_wrap)
 {
 	bool wrap_counter;
 	u16 used_idx;
@@ -1769,6 +2020,11 @@ static bool virtqueue_poll_packed(const struct vring_virtqueue *vq, u16 off_wrap
 	return is_used_desc_packed(vq, used_idx, wrap_counter);
 }
 
+static bool virtqueue_poll_packed(const struct vring_virtqueue *vq, u16 off_wrap)
+{
+	return __virtqueue_poll_packed(vq, off_wrap);
+}
+
 static bool more_used_packed(const struct vring_virtqueue *vq)
 {
 	return virtqueue_poll_packed(vq, READ_ONCE(vq->last_used_idx));
@@ -1798,10 +2054,84 @@ static void update_last_used_idx_packed(struct vring_virtqueue *vq,
 				cpu_to_le16(vq->last_used_idx));
 }
 
+static bool more_used_packed_in_order(const struct vring_virtqueue *vq)
+{
+	if (vq->batch_head != vq->packed.vring.num)
+		return true;
+
+	return virtqueue_poll_packed(vq, READ_ONCE(vq->last_used_idx));
+}
+
+static bool __more_used_packed(const struct vring_virtqueue *vq)
+{
+	return __virtqueue_poll_packed(vq, READ_ONCE(vq->last_used_idx));
+}
+
+static void *virtqueue_get_buf_ctx_packed_in_order(struct vring_virtqueue *vq,
+						   unsigned int *len,
+						   void **ctx)
+{
+	unsigned int num = vq->packed.vring.num;
+	u16 last_used, id, last_used_idx;
+	bool used_wrap_counter;
+	void *ret;
+
+	START_USE(vq);
+
+	if (unlikely(vq->broken)) {
+		END_USE(vq);
+		return NULL;
+	}
+
+	last_used_idx = vq->last_used_idx;
+	used_wrap_counter = packed_used_wrap_counter(last_used_idx);
+	last_used = packed_last_used(last_used_idx);
+
+	if (vq->batch_head == num) {
+		if (!__more_used_packed(vq)) {
+			pr_debug("No more buffers in queue\n");
+			END_USE(vq);
+			return NULL;
+		}
+		/* Only get used elements after they have been exposed by host. */
+		virtio_rmb(vq->weak_barriers);
+		vq->batch_head = le16_to_cpu(vq->packed.vring.desc[last_used].id);
+		vq->batch_len = le32_to_cpu(vq->packed.vring.desc[last_used].len);
+	}
+
+	if (vq->batch_head == last_used) {
+		vq->batch_head = num;
+		*len = vq->batch_len;
+	} else
+		*len = vq->packed.desc_state[last_used].total_len;
+
+	if (unlikely(last_used >= num)) {
+		BAD_RING(vq, "id %u out of range\n", id);
+		return NULL;
+	}
+	if (unlikely(!vq->packed.desc_state[last_used].data)) {
+		BAD_RING(vq, "id %u is not a head!\n", id);
+		return NULL;
+	}
+
+	/* detach_buf_packed clears data, so grab it now. */
+	ret = vq->packed.desc_state[last_used].data;
+	detach_buf_packed_in_order(vq, last_used, ctx);
+
+	update_last_used_idx_packed(vq, last_used, last_used,
+				    used_wrap_counter);
+
+	LAST_ADD_TIME_INVALID(vq);
+
+	END_USE(vq);
+	return ret;
+}
+
 static void *virtqueue_get_buf_ctx_packed(struct vring_virtqueue *vq,
 					  unsigned int *len,
 					  void **ctx)
 {
+	unsigned int num = vq->packed.vring.num;
 	u16 last_used, id, last_used_idx;
 	bool used_wrap_counter;
 	void *ret;
@@ -1813,7 +2143,7 @@ static void *virtqueue_get_buf_ctx_packed(struct vring_virtqueue *vq,
 		return NULL;
 	}
 
-	if (!more_used_packed(vq)) {
+	if (!__more_used_packed(vq)) {
 		pr_debug("No more buffers in queue\n");
 		END_USE(vq);
 		return NULL;
@@ -1828,7 +2158,7 @@ static void *virtqueue_get_buf_ctx_packed(struct vring_virtqueue *vq,
 	id = le16_to_cpu(vq->packed.vring.desc[last_used].id);
 	*len = le32_to_cpu(vq->packed.vring.desc[last_used].len);
 
-	if (unlikely(id >= vq->packed.vring.num)) {
+	if (unlikely(id >= num)) {
 		BAD_RING(vq, "id %u out of range\n", id);
 		return NULL;
 	}
@@ -1948,6 +2278,7 @@ static bool virtqueue_enable_cb_delayed_packed(struct vring_virtqueue *vq)
 	last_used_idx = READ_ONCE(vq->last_used_idx);
 	wrap_counter = packed_used_wrap_counter(last_used_idx);
 	used_idx = packed_last_used(last_used_idx);
+
 	if (is_used_desc_packed(vq, used_idx, wrap_counter)) {
 		END_USE(vq);
 		return false;
@@ -1969,7 +2300,7 @@ static void *virtqueue_detach_unused_buf_packed(struct vring_virtqueue *vq)
 			continue;
 		/* detach_buf clears data, so grab it now. */
 		buf = vq->packed.desc_state[i].data;
-		detach_buf_packed(vq, i, NULL);
+		detach_buf_packed_in_order(vq, i, NULL);
 		END_USE(vq);
 		return buf;
 	}
@@ -1995,6 +2326,8 @@ static struct vring_desc_extra *vring_alloc_desc_extra(unsigned int num)
 	for (i = 0; i < num - 1; i++)
 		desc_extra[i].next = i + 1;
 
+	desc_extra[num - 1].next = 0;
+
 	return desc_extra;
 }
 
@@ -2126,8 +2459,12 @@ static void virtqueue_vring_attach_packed(struct vring_virtqueue *vq,
 {
 	vq->packed = *vring_packed;
 
-	/* Put everything in free lists. */
-	vq->free_head = 0;
+	if (virtqueue_is_in_order(vq))
+		vq->batch_head = vq->split.vring.num;
+	else {
+		/* Put everything in free lists. */
+		vq->free_head = 0;
+	}
 }
 
 static void virtqueue_reset_packed(struct vring_virtqueue *vq)
@@ -2174,13 +2511,14 @@ static struct virtqueue *__vring_new_virtqueue_packed(unsigned int index,
 #else
 	vq->broken = false;
 #endif
-	vq->layout = PACKED;
 	vq->dma_dev = dma_dev;
 	vq->use_dma_api = vring_use_dma_api(vdev);
 
 	vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) &&
 		!context;
 	vq->event = virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX);
+	vq->layout = virtio_has_feature(vdev, VIRTIO_F_IN_ORDER) ?
+		     PACKED_IN_ORDER : PACKED;
 
 	if (virtio_has_feature(vdev, VIRTIO_F_ORDER_PLATFORM))
 		vq->weak_barriers = false;
@@ -2290,9 +2628,39 @@ static const struct virtqueue_ops packed_ops = {
 	.reset = virtqueue_reset_packed,
 };
 
+static const struct virtqueue_ops split_in_order_ops = {
+	.add = virtqueue_add_split,
+	.get = virtqueue_get_buf_ctx_split_in_order,
+	.kick_prepare = virtqueue_kick_prepare_split,
+	.disable_cb = virtqueue_disable_cb_split,
+	.enable_cb_delayed = virtqueue_enable_cb_delayed_split,
+	.enable_cb_prepare = virtqueue_enable_cb_prepare_split,
+	.poll = virtqueue_poll_split,
+	.detach_unused_buf = virtqueue_detach_unused_buf_split,
+	.more_used = more_used_split_in_order,
+	.resize = virtqueue_resize_split,
+	.reset = virtqueue_reset_split,
+};
+
+static const struct virtqueue_ops packed_in_order_ops = {
+	.add = virtqueue_add_packed_in_order,
+	.get = virtqueue_get_buf_ctx_packed_in_order,
+	.kick_prepare = virtqueue_kick_prepare_packed,
+	.disable_cb = virtqueue_disable_cb_packed,
+	.enable_cb_delayed = virtqueue_enable_cb_delayed_packed,
+	.enable_cb_prepare = virtqueue_enable_cb_prepare_packed,
+	.poll = virtqueue_poll_packed,
+	.detach_unused_buf = virtqueue_detach_unused_buf_packed,
+	.more_used = more_used_packed_in_order,
+	.resize = virtqueue_resize_packed,
+	.reset = virtqueue_reset_packed,
+};
+
 static const struct virtqueue_ops *const all_ops[VQ_TYPE_MAX] = {
 	[SPLIT] = &split_ops,
-	[PACKED] = &packed_ops
+	[PACKED] = &packed_ops,
+	[SPLIT_IN_ORDER] = &split_in_order_ops,
+	[PACKED_IN_ORDER] = &packed_in_order_ops,
 };
 
 static int virtqueue_disable_and_recycle(struct virtqueue *_vq,
@@ -2336,7 +2704,6 @@ static int virtqueue_enable_after_reset(struct virtqueue *_vq)
 /*
  * Generic functions and exported symbols.
  */
-
 #define VIRTQUEUE_CALL(vq, op, ...)					\
 	({								\
 	typeof(all_ops[SPLIT]->op(vq, ##__VA_ARGS__)) ret;		\
@@ -2347,6 +2714,12 @@ static int virtqueue_enable_after_reset(struct virtqueue *_vq)
 	case PACKED:							\
 		ret = all_ops[PACKED]->op(vq, ##__VA_ARGS__);		\
 		break;							\
+	case SPLIT_IN_ORDER:						\
+		ret = all_ops[SPLIT_IN_ORDER]->op(vq, ##__VA_ARGS__);	\
+		break;							\
+	case PACKED_IN_ORDER:						\
+		ret = all_ops[PACKED_IN_ORDER]->op(vq, ##__VA_ARGS__);	\
+		break;							\
 	default:							\
 		BUG();							\
 		break;							\
@@ -2363,6 +2736,12 @@ static int virtqueue_enable_after_reset(struct virtqueue *_vq)
         case PACKED:					  \
                 all_ops[PACKED]->op(vq, ##__VA_ARGS__);   \
                 break;					  \
+        case SPLIT_IN_ORDER:						\
+                all_ops[SPLIT_IN_ORDER]->op(vq, ##__VA_ARGS__);		\
+                break;							\
+        case PACKED_IN_ORDER:						\
+                all_ops[PACKED_IN_ORDER]->op(vq, ##__VA_ARGS__);	\
+                break;							\
         default:					  \
                 BUG();					  \
                 break;					  \
@@ -3073,6 +3452,8 @@ void vring_transport_features(struct virtio_device *vdev)
 			break;
 		case VIRTIO_F_NOTIFICATION_DATA:
 			break;
+		case VIRTIO_F_IN_ORDER:
+			break;
 		default:
 			/* We don't understand this bit. */
 			__virtio_clear_bit(vdev, i);
-- 
2.34.1

Re: [PATCH V3 19/19] virtio_ring: add in order support

Posted by Michael S. Tsirkin 7 months, 1 week ago

On Mon, Jun 16, 2025 at 04:25:17PM +0800, Jason Wang wrote:
> This patch implements in order support for both split virtqueue and
> packed virtqueue.

I'd like to see more motivation for this work, documented.
It's not really performance, not as it stands, see below:

> 
> Benchmark with KVM guest + testpmd on the host shows:
> 
> For split virtqueue: no obvious differences were noticed
> 
> For packed virtqueue:
> 
> 1) RX gets 3.1% PPS improvements from 6.3 Mpps to 6.5 Mpps
> 2) TX gets 4.6% PPS improvements from 8.6 Mpps to 9.0 Mpps
>

That's a very modest improvement for a lot of code.
I also note you put in some batching just for in-order.
Which could also explain the gains maybe?
What if you just put in a simple implementation with no
batching tricks? do you still see a gain?
Does any hardware implement this? Maybe that can demonstrate
bigger gains.

 
> Signed-off-by: Jason Wang <jasowang@redhat.com>
> ---
>  drivers/virtio/virtio_ring.c | 423 +++++++++++++++++++++++++++++++++--
>  1 file changed, 402 insertions(+), 21 deletions(-)
> 
> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> index 27a9459a0555..21d456392ba0 100644
> --- a/drivers/virtio/virtio_ring.c
> +++ b/drivers/virtio/virtio_ring.c
> @@ -70,11 +70,14 @@
>  enum vq_layout {
>  	SPLIT = 0,
>  	PACKED,
> +	SPLIT_IN_ORDER,
> +	PACKED_IN_ORDER,
>  	VQ_TYPE_MAX,
>  };
>  
>  struct vring_desc_state_split {
>  	void *data;			/* Data for callback. */
> +	u32 total_len;			/* Buffer Length */
>  
>  	/* Indirect desc table and extra table, if any. These two will be
>  	 * allocated together. So we won't stress more to the memory allocator.
> @@ -84,6 +87,7 @@ struct vring_desc_state_split {
>  
>  struct vring_desc_state_packed {
>  	void *data;			/* Data for callback. */
> +	u32 total_len;			/* Buffer Length */
>  
>  	/* Indirect desc table and extra table, if any. These two will be
>  	 * allocated together. So we won't stress more to the memory allocator.

We are bloating up the cache footprint for everyone,
so there's a chance of regressions.
Pls include benchmark for in order off, to make sure we
are not regressing.
How big was the ring?
Worth trying with a biggish one, where there is more cache
pressure.


Why not have a separate state for in-order?



> @@ -206,6 +210,12 @@ struct vring_virtqueue {
>  
>  	/* Head of free buffer list. */
>  	unsigned int free_head;
> +
> +	/* Head of the batched used buffers, vq->num means no batching */
> +	unsigned int batch_head;
> +
> +	unsigned int batch_len;
> +

Are these two only used for in-order? Please document that.
I also want some documentation about the batching trickery
used please.
What is batched, when, how is batching flushed, why are we
only batching in-order ...




>  	/* Number we've added since last sync. */
>  	unsigned int num_added;
>  
> @@ -256,10 +266,14 @@ static void vring_free(struct virtqueue *_vq);
>  
>  #define to_vvq(_vq) container_of_const(_vq, struct vring_virtqueue, vq)
>  
> -
>  static inline bool virtqueue_is_packed(const struct vring_virtqueue *vq)
>  {
> -	return vq->layout == PACKED;
> +	return vq->layout == PACKED || vq->layout == PACKED_IN_ORDER;
> +}
> +
> +static inline bool virtqueue_is_in_order(const struct vring_virtqueue *vq)
> +{
> +	return vq->layout == SPLIT_IN_ORDER || vq->layout == PACKED_IN_ORDER;
>  }
>  
>  static bool virtqueue_use_indirect(const struct vring_virtqueue *vq,
> @@ -570,7 +584,7 @@ static inline int virtqueue_add_split(struct vring_virtqueue *vq,
>  	struct vring_desc_extra *extra;
>  	struct scatterlist *sg;
>  	struct vring_desc *desc;
> -	unsigned int i, n, c, avail, descs_used, err_idx;
> +	unsigned int i, n, c, avail, descs_used, err_idx, total_len = 0;


I would add a comment here:

/* Total length for in-order */
unsigned int total_len = 0;

>  	int head;
>  	bool indirect;
>  
> @@ -646,6 +660,7 @@ static inline int virtqueue_add_split(struct vring_virtqueue *vq,
>  			i = virtqueue_add_desc_split(vq, desc, extra, i, addr, len,
>  						     flags,
>  						     premapped);
> +			total_len += len;
>  		}
>  	}
>  	for (; n < (out_sgs + in_sgs); n++) {




> @@ -665,6 +680,7 @@ static inline int virtqueue_add_split(struct vring_virtqueue *vq,
>  			 */
>  			i = virtqueue_add_desc_split(vq, desc, extra, i, addr, len,
>  						     flags, premapped);
> +			total_len += len;
>  		}
>  	}
>  
> @@ -687,7 +703,12 @@ static inline int virtqueue_add_split(struct vring_virtqueue *vq,
>  	vq->vq.num_free -= descs_used;
>  
>  	/* Update free pointer */
> -	if (indirect)
> +	if (virtqueue_is_in_order(vq)) {
> +		vq->free_head += descs_used;
> +		if (vq->free_head >= vq->split.vring.num)
> +			vq->free_head -= vq->split.vring.num;
> +		vq->split.desc_state[head].total_len = total_len;;
> +	} else if (indirect)
>  		vq->free_head = vq->split.desc_extra[head].next;
>  	else
>  		vq->free_head = i;
> @@ -860,6 +881,14 @@ static bool more_used_split(const struct vring_virtqueue *vq)
>  	return virtqueue_poll_split(vq, vq->last_used_idx);
>  }
>  
> +static bool more_used_split_in_order(const struct vring_virtqueue *vq)
> +{
> +	if (vq->batch_head != vq->packed.vring.num)
> +		return true;
> +
> +	return virtqueue_poll_split(vq, vq->last_used_idx);
> +}
> +
>  static void *virtqueue_get_buf_ctx_split(struct vring_virtqueue *vq,
>  					 unsigned int *len,
>  					 void **ctx)
> @@ -917,6 +946,73 @@ static void *virtqueue_get_buf_ctx_split(struct vring_virtqueue *vq,
>  	return ret;
>  }
>  
> +static void *virtqueue_get_buf_ctx_split_in_order(struct vring_virtqueue *vq,
> +						  unsigned int *len,
> +						  void **ctx)
> +{
> +	void *ret;
> +	unsigned int num = vq->split.vring.num;
> +	u16 last_used;
> +
> +	START_USE(vq);
> +
> +	if (unlikely(vq->broken)) {
> +		END_USE(vq);
> +		return NULL;
> +	}
> +
> +	last_used = (vq->last_used_idx & (vq->split.vring.num - 1));
> +
> +	if (vq->batch_head == num) {
> +		if (!more_used_split(vq)) {
> +			pr_debug("No more buffers in queue\n");
> +			END_USE(vq);
> +			return NULL;
> +		}
> +
> +		/* Only get used array entries after they have been
> +		 * exposed by host. */
> +		virtio_rmb(vq->weak_barriers);
> +		vq->batch_head = virtio32_to_cpu(vq->vq.vdev,
> +				 vq->split.vring.used->ring[last_used].id);
> +		vq->batch_len = virtio32_to_cpu(vq->vq.vdev,
> +				vq->split.vring.used->ring[last_used].len);
> +	}
> +
> +	if (vq->batch_head == last_used) {
> +		vq->batch_head = num;
> +		*len = vq->batch_len;
> +	} else
> +		*len = vq->split.desc_state[last_used].total_len;
> +
> +	if (unlikely(last_used >= num)) {
> +		BAD_RING(vq, "id %u out of range\n", last_used);
> +		return NULL;
> +	}
> +	if (unlikely(!vq->split.desc_state[last_used].data)) {
> +		BAD_RING(vq, "id %u is not a head!\n", last_used);
> +		return NULL;
> +	}
> +
> +	/* detach_buf_split clears data, so grab it now. */
> +	ret = vq->split.desc_state[last_used].data;
> +	detach_buf_split_in_order(vq, last_used, ctx);
> +
> +	vq->last_used_idx++;
> +	/* If we expect an interrupt for the next entry, tell host
> +	 * by writing event index and flush out the write before
> +	 * the read in the next get_buf call. */
> +	if (!(vq->split.avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT))
> +		virtio_store_mb(vq->weak_barriers,
> +				&vring_used_event(&vq->split.vring),
> +				cpu_to_virtio16(vq->vq.vdev, vq->last_used_idx));
> +
> +	LAST_ADD_TIME_INVALID(vq);
> +
> +	END_USE(vq);
> +	return ret;
> +}
> +
>  static void virtqueue_disable_cb_split(struct vring_virtqueue *vq)
>  {
>  	if (!(vq->split.avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT)) {
> @@ -1010,7 +1106,10 @@ static void *virtqueue_detach_unused_buf_split(struct vring_virtqueue *vq)
>  			continue;
>  		/* detach_buf_split clears data, so grab it now. */
>  		buf = vq->split.desc_state[i].data;
> -		detach_buf_split(vq, i, NULL);
> +		if (virtqueue_is_in_order(vq))
> +			detach_buf_split_in_order(vq, i, NULL);
> +		else
> +			detach_buf_split(vq, i, NULL);
>  		vq->split.avail_idx_shadow--;
>  		vq->split.vring.avail->idx = cpu_to_virtio16(vq->vq.vdev,
>  				vq->split.avail_idx_shadow);
> @@ -1073,6 +1172,7 @@ static void virtqueue_vring_attach_split(struct vring_virtqueue *vq,
>  
>  	/* Put everything in free lists. */
>  	vq->free_head = 0;
> +	vq->batch_head = vq->split.vring.num;
>  }
>  
>  static int vring_alloc_state_extra_split(struct vring_virtqueue_split *vring_split)
> @@ -1183,7 +1283,6 @@ static struct virtqueue *__vring_new_virtqueue_split(unsigned int index,
>  	if (!vq)
>  		return NULL;
>  
> -	vq->layout = SPLIT;
>  	vq->vq.callback = callback;
>  	vq->vq.vdev = vdev;
>  	vq->vq.name = name;
> @@ -1203,6 +1302,8 @@ static struct virtqueue *__vring_new_virtqueue_split(unsigned int index,
>  	vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) &&
>  		!context;
>  	vq->event = virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX);
> +	vq->layout = virtio_has_feature(vdev, VIRTIO_F_IN_ORDER) ?
> +		     SPLIT_IN_ORDER : SPLIT;
>  
>  	if (virtio_has_feature(vdev, VIRTIO_F_ORDER_PLATFORM))
>  		vq->weak_barriers = false;
> @@ -1366,13 +1467,14 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
>  					 unsigned int in_sgs,
>  					 void *data,
>  					 bool premapped,
> -					 gfp_t gfp)
> +					 gfp_t gfp,
> +					 u16 id)
>  {
>  	struct vring_desc_extra *extra;
>  	struct vring_packed_desc *desc;
>  	struct scatterlist *sg;
> -	unsigned int i, n, err_idx, len;
> -	u16 head, id;
> +	unsigned int i, n, err_idx, len, total_len = 0;
> +	u16 head;
>  	dma_addr_t addr;
>  
>  	head = vq->packed.next_avail_idx;
> @@ -1390,8 +1492,6 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
>  	}
>  
>  	i = 0;
> -	id = vq->free_head;
> -	BUG_ON(id == vq->packed.vring.num);
>  
>  	for (n = 0; n < out_sgs + in_sgs; n++) {
>  		for (sg = sgs[n]; sg; sg = sg_next(sg)) {
> @@ -1411,6 +1511,7 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
>  				extra[i].flags = n < out_sgs ?  0 : VRING_DESC_F_WRITE;
>  			}
>  
> +			total_len += len;
>  			i++;
>  		}
>  	}
> @@ -1464,6 +1565,7 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
>  	vq->packed.desc_state[id].data = data;
>  	vq->packed.desc_state[id].indir_desc = desc;
>  	vq->packed.desc_state[id].last = id;
> +	vq->packed.desc_state[id].total_len = total_len;
>  
>  	vq->num_added += 1;
>  
> @@ -1516,8 +1618,11 @@ static inline int virtqueue_add_packed(struct vring_virtqueue *vq,
>  	BUG_ON(total_sg == 0);
>  
>  	if (virtqueue_use_indirect(vq, total_sg)) {
> +		id = vq->free_head;
> +		BUG_ON(id == vq->packed.vring.num);
>  		err = virtqueue_add_indirect_packed(vq, sgs, total_sg, out_sgs,
> -						    in_sgs, data, premapped, gfp);
> +						    in_sgs, data, premapped,
> +						    gfp, id);
>  		if (err != -ENOMEM) {
>  			END_USE(vq);
>  			return err;
> @@ -1638,6 +1743,152 @@ static inline int virtqueue_add_packed(struct vring_virtqueue *vq,
>  	return -EIO;
>  }
>  
> +static inline int virtqueue_add_packed_in_order(struct vring_virtqueue *vq,
> +						struct scatterlist *sgs[],
> +						unsigned int total_sg,
> +						unsigned int out_sgs,
> +						unsigned int in_sgs,
> +						void *data,
> +						void *ctx,
> +						bool premapped,
> +						gfp_t gfp)
> +{
> +	struct vring_packed_desc *desc;
> +	struct scatterlist *sg;
> +	unsigned int i, n, c, err_idx, total_len = 0;
> +	__le16 head_flags, flags;
> +	u16 head, avail_used_flags;
> +	int err;
> +
> +	START_USE(vq);
> +
> +	BUG_ON(data == NULL);
> +	BUG_ON(ctx && vq->indirect);
> +
> +	if (unlikely(vq->broken)) {
> +		END_USE(vq);
> +		return -EIO;
> +	}
> +
> +	LAST_ADD_TIME_UPDATE(vq);
> +
> +	BUG_ON(total_sg == 0);
> +
> +	if (virtqueue_use_indirect(vq, total_sg)) {
> +		err = virtqueue_add_indirect_packed(vq, sgs, total_sg, out_sgs,
> +						    in_sgs, data, premapped, gfp,
> +						    vq->packed.next_avail_idx);
> +		if (err != -ENOMEM) {
> +			END_USE(vq);
> +			return err;
> +		}
> +
> +		/* fall back on direct */
> +	}
> +
> +	head = vq->packed.next_avail_idx;
> +	avail_used_flags = vq->packed.avail_used_flags;
> +
> +	WARN_ON_ONCE(total_sg > vq->packed.vring.num && !vq->indirect);
> +
> +	desc = vq->packed.vring.desc;
> +	i = head;
> +
> +	if (unlikely(vq->vq.num_free < total_sg)) {
> +		pr_debug("Can't add buf len %i - avail = %i\n",
> +			 total_sg, vq->vq.num_free);
> +		END_USE(vq);
> +		return -ENOSPC;
> +	}
> +
> +	c = 0;
> +	for (n = 0; n < out_sgs + in_sgs; n++) {
> +		for (sg = sgs[n]; sg; sg = sg_next(sg)) {
> +			dma_addr_t addr;
> +			u32 len;
> +
> +			if (vring_map_one_sg(vq, sg, n < out_sgs ?
> +					     DMA_TO_DEVICE : DMA_FROM_DEVICE,
> +					     &addr, &len, premapped))
> +				goto unmap_release;
> +
> +			flags = cpu_to_le16(vq->packed.avail_used_flags |
> +				    (++c == total_sg ? 0 : VRING_DESC_F_NEXT) |
> +				    (n < out_sgs ? 0 : VRING_DESC_F_WRITE));
> +			if (i == head)
> +				head_flags = flags;
> +			else
> +				desc[i].flags = flags;
> +
> +
> +			desc[i].addr = cpu_to_le64(addr);
> +			desc[i].len = cpu_to_le32(len);
> +			desc[i].id = cpu_to_le16(head);
> +
> +			if (unlikely(vq->use_dma_api)) {
> +				vq->packed.desc_extra[i].addr = premapped ?
> +				      DMA_MAPPING_ERROR: addr;
> +				vq->packed.desc_extra[i].len = len;
> +				vq->packed.desc_extra[i].flags =
> +					le16_to_cpu(flags);
> +			}
> +
> +			if ((unlikely(++i >= vq->packed.vring.num))) {
> +				i = 0;
> +				vq->packed.avail_used_flags ^=
> +					1 << VRING_PACKED_DESC_F_AVAIL |
> +					1 << VRING_PACKED_DESC_F_USED;
> +				vq->packed.avail_wrap_counter ^= 1;
> +			}
> +
> +			total_len += len;
> +		}
> +	}
> +
> +	/* We're using some buffers from the free list. */
> +	vq->vq.num_free -= total_sg;
> +
> +	/* Update free pointer */
> +	vq->packed.next_avail_idx = i;
> +
> +	/* Store token. */
> +	vq->packed.desc_state[head].num = total_sg;
> +	vq->packed.desc_state[head].data = data;
> +	vq->packed.desc_state[head].indir_desc = ctx;
> +	vq->packed.desc_state[head].total_len = total_len;
> +
> +	/*
> +	 * A driver MUST NOT make the first descriptor in the list
> +	 * available before all subsequent descriptors comprising
> +	 * the list are made available.
> +	 */
> +	virtio_wmb(vq->weak_barriers);
> +	vq->packed.vring.desc[head].flags = head_flags;
> +	vq->num_added += total_sg;
> +
> +	pr_debug("Added buffer head %i to %p\n", head, vq);
> +	END_USE(vq);
> +
> +	return 0;
> +
> +unmap_release:
> +	err_idx = i;
> +	i = head;
> +	vq->packed.avail_used_flags = avail_used_flags;
> +
> +	for (n = 0; n < total_sg; n++) {
> +		if (i == err_idx)
> +			break;
> +		vring_unmap_extra_packed(vq, &vq->packed.desc_extra[i]);
> +		i++;
> +		if (i >= vq->packed.vring.num)
> +			i = 0;
> +	}
> +
> +	END_USE(vq);
> +	return -EIO;
> +}
> +
>  static bool virtqueue_kick_prepare_packed(struct vring_virtqueue *vq)
>  {
>  	u16 new, old, off_wrap, flags, wrap_counter, event_idx;
> @@ -1758,7 +2009,7 @@ static inline bool is_used_desc_packed(const struct vring_virtqueue *vq,
>  	return avail == used && used == used_wrap_counter;
>  }
>  
> -static bool virtqueue_poll_packed(const struct vring_virtqueue *vq, u16 off_wrap)
> +static bool __virtqueue_poll_packed(const struct vring_virtqueue *vq, u16 off_wrap)
>  {
>  	bool wrap_counter;
>  	u16 used_idx;
> @@ -1769,6 +2020,11 @@ static bool virtqueue_poll_packed(const struct vring_virtqueue *vq, u16 off_wrap
>  	return is_used_desc_packed(vq, used_idx, wrap_counter);
>  }
>  
> +static bool virtqueue_poll_packed(const struct vring_virtqueue *vq, u16 off_wrap)
> +{
> +	return __virtqueue_poll_packed(vq, off_wrap);
> +}
> +
>  static bool more_used_packed(const struct vring_virtqueue *vq)
>  {
>  	return virtqueue_poll_packed(vq, READ_ONCE(vq->last_used_idx));
> @@ -1798,10 +2054,84 @@ static void update_last_used_idx_packed(struct vring_virtqueue *vq,
>  				cpu_to_le16(vq->last_used_idx));
>  }
>  
> +static bool more_used_packed_in_order(const struct vring_virtqueue *vq)
> +{
> +	if (vq->batch_head != vq->packed.vring.num)
> +		return true;
> +
> +	return virtqueue_poll_packed(vq, READ_ONCE(vq->last_used_idx));
> +}
> +
> +static bool __more_used_packed(const struct vring_virtqueue *vq)
> +{
> +	return __virtqueue_poll_packed(vq, READ_ONCE(vq->last_used_idx));
> +}
> +
> +static void *virtqueue_get_buf_ctx_packed_in_order(struct vring_virtqueue *vq,
> +						   unsigned int *len,
> +						   void **ctx)
> +{
> +	unsigned int num = vq->packed.vring.num;
> +	u16 last_used, id, last_used_idx;
> +	bool used_wrap_counter;
> +	void *ret;
> +
> +	START_USE(vq);
> +
> +	if (unlikely(vq->broken)) {
> +		END_USE(vq);
> +		return NULL;
> +	}
> +
> +	last_used_idx = vq->last_used_idx;
> +	used_wrap_counter = packed_used_wrap_counter(last_used_idx);
> +	last_used = packed_last_used(last_used_idx);
> +
> +	if (vq->batch_head == num) {
> +		if (!__more_used_packed(vq)) {
> +			pr_debug("No more buffers in queue\n");
> +			END_USE(vq);
> +			return NULL;
> +		}
> +		/* Only get used elements after they have been exposed by host. */
> +		virtio_rmb(vq->weak_barriers);
> +		vq->batch_head = le16_to_cpu(vq->packed.vring.desc[last_used].id);
> +		vq->batch_len = le32_to_cpu(vq->packed.vring.desc[last_used].len);
> +	}
> +
> +	if (vq->batch_head == last_used) {
> +		vq->batch_head = num;
> +		*len = vq->batch_len;
> +	} else
> +		*len = vq->packed.desc_state[last_used].total_len;
> +
> +	if (unlikely(last_used >= num)) {
> +		BAD_RING(vq, "id %u out of range\n", id);
> +		return NULL;
> +	}
> +	if (unlikely(!vq->packed.desc_state[last_used].data)) {
> +		BAD_RING(vq, "id %u is not a head!\n", id);
> +		return NULL;
> +	}
> +
> +	/* detach_buf_packed clears data, so grab it now. */
> +	ret = vq->packed.desc_state[last_used].data;
> +	detach_buf_packed_in_order(vq, last_used, ctx);
> +
> +	update_last_used_idx_packed(vq, last_used, last_used,
> +				    used_wrap_counter);
> +
> +	LAST_ADD_TIME_INVALID(vq);
> +
> +	END_USE(vq);
> +	return ret;
> +}
> +
>  static void *virtqueue_get_buf_ctx_packed(struct vring_virtqueue *vq,
>  					  unsigned int *len,
>  					  void **ctx)
>  {
> +	unsigned int num = vq->packed.vring.num;
>  	u16 last_used, id, last_used_idx;
>  	bool used_wrap_counter;
>  	void *ret;
> @@ -1813,7 +2143,7 @@ static void *virtqueue_get_buf_ctx_packed(struct vring_virtqueue *vq,
>  		return NULL;
>  	}
>  
> -	if (!more_used_packed(vq)) {
> +	if (!__more_used_packed(vq)) {
>  		pr_debug("No more buffers in queue\n");
>  		END_USE(vq);
>  		return NULL;
> @@ -1828,7 +2158,7 @@ static void *virtqueue_get_buf_ctx_packed(struct vring_virtqueue *vq,
>  	id = le16_to_cpu(vq->packed.vring.desc[last_used].id);
>  	*len = le32_to_cpu(vq->packed.vring.desc[last_used].len);
>  
> -	if (unlikely(id >= vq->packed.vring.num)) {
> +	if (unlikely(id >= num)) {
>  		BAD_RING(vq, "id %u out of range\n", id);
>  		return NULL;
>  	}
> @@ -1948,6 +2278,7 @@ static bool virtqueue_enable_cb_delayed_packed(struct vring_virtqueue *vq)
>  	last_used_idx = READ_ONCE(vq->last_used_idx);
>  	wrap_counter = packed_used_wrap_counter(last_used_idx);
>  	used_idx = packed_last_used(last_used_idx);
> +
>  	if (is_used_desc_packed(vq, used_idx, wrap_counter)) {
>  		END_USE(vq);
>  		return false;
> @@ -1969,7 +2300,7 @@ static void *virtqueue_detach_unused_buf_packed(struct vring_virtqueue *vq)
>  			continue;
>  		/* detach_buf clears data, so grab it now. */
>  		buf = vq->packed.desc_state[i].data;
> -		detach_buf_packed(vq, i, NULL);
> +		detach_buf_packed_in_order(vq, i, NULL);
>  		END_USE(vq);
>  		return buf;
>  	}
> @@ -1995,6 +2326,8 @@ static struct vring_desc_extra *vring_alloc_desc_extra(unsigned int num)
>  	for (i = 0; i < num - 1; i++)
>  		desc_extra[i].next = i + 1;
>  
> +	desc_extra[num - 1].next = 0;
> +
>  	return desc_extra;
>  }
>  
> @@ -2126,8 +2459,12 @@ static void virtqueue_vring_attach_packed(struct vring_virtqueue *vq,
>  {
>  	vq->packed = *vring_packed;
>  
> -	/* Put everything in free lists. */
> -	vq->free_head = 0;
> +	if (virtqueue_is_in_order(vq))
> +		vq->batch_head = vq->split.vring.num;
> +	else {
> +		/* Put everything in free lists. */
> +		vq->free_head = 0;
> +	}
>  }
>  
>  static void virtqueue_reset_packed(struct vring_virtqueue *vq)
> @@ -2174,13 +2511,14 @@ static struct virtqueue *__vring_new_virtqueue_packed(unsigned int index,
>  #else
>  	vq->broken = false;
>  #endif
> -	vq->layout = PACKED;
>  	vq->dma_dev = dma_dev;
>  	vq->use_dma_api = vring_use_dma_api(vdev);
>  
>  	vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) &&
>  		!context;
>  	vq->event = virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX);
> +	vq->layout = virtio_has_feature(vdev, VIRTIO_F_IN_ORDER) ?
> +		     PACKED_IN_ORDER : PACKED;
>  
>  	if (virtio_has_feature(vdev, VIRTIO_F_ORDER_PLATFORM))
>  		vq->weak_barriers = false;
> @@ -2290,9 +2628,39 @@ static const struct virtqueue_ops packed_ops = {
>  	.reset = virtqueue_reset_packed,
>  };
>  
> +static const struct virtqueue_ops split_in_order_ops = {
> +	.add = virtqueue_add_split,
> +	.get = virtqueue_get_buf_ctx_split_in_order,
> +	.kick_prepare = virtqueue_kick_prepare_split,
> +	.disable_cb = virtqueue_disable_cb_split,
> +	.enable_cb_delayed = virtqueue_enable_cb_delayed_split,
> +	.enable_cb_prepare = virtqueue_enable_cb_prepare_split,
> +	.poll = virtqueue_poll_split,
> +	.detach_unused_buf = virtqueue_detach_unused_buf_split,
> +	.more_used = more_used_split_in_order,
> +	.resize = virtqueue_resize_split,
> +	.reset = virtqueue_reset_split,
> +};
> +
> +static const struct virtqueue_ops packed_in_order_ops = {
> +	.add = virtqueue_add_packed_in_order,
> +	.get = virtqueue_get_buf_ctx_packed_in_order,
> +	.kick_prepare = virtqueue_kick_prepare_packed,
> +	.disable_cb = virtqueue_disable_cb_packed,
> +	.enable_cb_delayed = virtqueue_enable_cb_delayed_packed,
> +	.enable_cb_prepare = virtqueue_enable_cb_prepare_packed,
> +	.poll = virtqueue_poll_packed,
> +	.detach_unused_buf = virtqueue_detach_unused_buf_packed,
> +	.more_used = more_used_packed_in_order,
> +	.resize = virtqueue_resize_packed,
> +	.reset = virtqueue_reset_packed,
> +};
> +
>  static const struct virtqueue_ops *const all_ops[VQ_TYPE_MAX] = {
>  	[SPLIT] = &split_ops,
> -	[PACKED] = &packed_ops
> +	[PACKED] = &packed_ops,
> +	[SPLIT_IN_ORDER] = &split_in_order_ops,
> +	[PACKED_IN_ORDER] = &packed_in_order_ops,
>  };
>  
>  static int virtqueue_disable_and_recycle(struct virtqueue *_vq,
> @@ -2336,7 +2704,6 @@ static int virtqueue_enable_after_reset(struct virtqueue *_vq)
>  /*
>   * Generic functions and exported symbols.
>   */
> -
>  #define VIRTQUEUE_CALL(vq, op, ...)					\
>  	({								\
>  	typeof(all_ops[SPLIT]->op(vq, ##__VA_ARGS__)) ret;		\
> @@ -2347,6 +2714,12 @@ static int virtqueue_enable_after_reset(struct virtqueue *_vq)
>  	case PACKED:							\
>  		ret = all_ops[PACKED]->op(vq, ##__VA_ARGS__);		\
>  		break;							\
> +	case SPLIT_IN_ORDER:						\
> +		ret = all_ops[SPLIT_IN_ORDER]->op(vq, ##__VA_ARGS__);	\
> +		break;							\
> +	case PACKED_IN_ORDER:						\
> +		ret = all_ops[PACKED_IN_ORDER]->op(vq, ##__VA_ARGS__);	\
> +		break;							\
>  	default:							\
>  		BUG();							\
>  		break;							\
> @@ -2363,6 +2736,12 @@ static int virtqueue_enable_after_reset(struct virtqueue *_vq)
>          case PACKED:					  \
>                  all_ops[PACKED]->op(vq, ##__VA_ARGS__);   \
>                  break;					  \
> +        case SPLIT_IN_ORDER:						\
> +                all_ops[SPLIT_IN_ORDER]->op(vq, ##__VA_ARGS__);		\
> +                break;							\
> +        case PACKED_IN_ORDER:						\
> +                all_ops[PACKED_IN_ORDER]->op(vq, ##__VA_ARGS__);	\
> +                break;							\
>          default:					  \
>                  BUG();					  \
>                  break;					  \
> @@ -3073,6 +3452,8 @@ void vring_transport_features(struct virtio_device *vdev)
>  			break;
>  		case VIRTIO_F_NOTIFICATION_DATA:
>  			break;
> +		case VIRTIO_F_IN_ORDER:
> +			break;
>  		default:
>  			/* We don't understand this bit. */
>  			__virtio_clear_bit(vdev, i);
> -- 
> 2.34.1

Re: [PATCH V3 19/19] virtio_ring: add in order support

Posted by Jason Wang 7 months, 1 week ago

On Tue, Jul 1, 2025 at 2:57 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Mon, Jun 16, 2025 at 04:25:17PM +0800, Jason Wang wrote:
> > This patch implements in order support for both split virtqueue and
> > packed virtqueue.
>
> I'd like to see more motivation for this work, documented.
> It's not really performance, not as it stands, see below:
>
> >
> > Benchmark with KVM guest + testpmd on the host shows:
> >
> > For split virtqueue: no obvious differences were noticed
> >
> > For packed virtqueue:
> >
> > 1) RX gets 3.1% PPS improvements from 6.3 Mpps to 6.5 Mpps
> > 2) TX gets 4.6% PPS improvements from 8.6 Mpps to 9.0 Mpps
> >
>
> That's a very modest improvement for a lot of code.
> I also note you put in some batching just for in-order.
> Which could also explain the gains maybe?
> What if you just put in a simple implementation with no
> batching tricks? do you still see a gain?

It is used to implement the batch used updating.

"""
Some devices always use descriptors in the same order in which they
have been made available. These devices can offer the
VIRTIO_F_IN_ORDER feature. If negotiated, this knowledge allows
devices to notify the use of a batch of buffers to the driver by only
writing out a single used ring entry with the id corresponding to the
head entry of the descriptor chain describing the last buffer in the
batch.
"""

DPDK implements this behavior, so it's a must for the virtio driver.

> Does any hardware implement this? Maybe that can demonstrate
> bigger gains.

Maybe but I don't have one in my hand.

For performance, I think it should be sufficient as a starter. I can
say in the next version something like "more optimizations could be
done on top"

Note that the patch that introduces packed virtqueue, there's not even
any numbers:

commit 1ce9e6055fa0a9043405c5604cf19169ec5379ff
Author: Tiwei Bie <tiwei.bie@intel.com>
Date:   Wed Nov 21 18:03:27 2018 +0800

    virtio_ring: introduce packed ring support

    Introduce the packed ring support. Packed ring can only be
    created by vring_create_virtqueue() and each chunk of packed
    ring will be allocated individually. Packed ring can not be
    created on preallocated memory by vring_new_virtqueue() or
    the likes currently.

    Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
    Signed-off-by: David S. Miller <davem@davemloft.net>

>
>
> > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > ---
> >  drivers/virtio/virtio_ring.c | 423 +++++++++++++++++++++++++++++++++--
> >  1 file changed, 402 insertions(+), 21 deletions(-)
> >
> > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > index 27a9459a0555..21d456392ba0 100644
> > --- a/drivers/virtio/virtio_ring.c
> > +++ b/drivers/virtio/virtio_ring.c
> > @@ -70,11 +70,14 @@
> >  enum vq_layout {
> >       SPLIT = 0,
> >       PACKED,
> > +     SPLIT_IN_ORDER,
> > +     PACKED_IN_ORDER,
> >       VQ_TYPE_MAX,
> >  };
> >
> >  struct vring_desc_state_split {
> >       void *data;                     /* Data for callback. */
> > +     u32 total_len;                  /* Buffer Length */
> >
> >       /* Indirect desc table and extra table, if any. These two will be
> >        * allocated together. So we won't stress more to the memory allocator.
> > @@ -84,6 +87,7 @@ struct vring_desc_state_split {
> >
> >  struct vring_desc_state_packed {
> >       void *data;                     /* Data for callback. */
> > +     u32 total_len;                  /* Buffer Length */
> >
> >       /* Indirect desc table and extra table, if any. These two will be
> >        * allocated together. So we won't stress more to the memory allocator.
>
> We are bloating up the cache footprint for everyone,
> so there's a chance of regressions.
> Pls include benchmark for in order off, to make sure we
> are not regressing.

Ok.

> How big was the ring?

256.

> Worth trying with a biggish one, where there is more cache
> pressure.

Ok.

>
>
> Why not have a separate state for in-order?

It can work.

>
>
>
> > @@ -206,6 +210,12 @@ struct vring_virtqueue {
> >
> >       /* Head of free buffer list. */
> >       unsigned int free_head;
> > +
> > +     /* Head of the batched used buffers, vq->num means no batching */
> > +     unsigned int batch_head;
> > +
> > +     unsigned int batch_len;
> > +
>
> Are these two only used for in-order? Please document that.

Yes, I will do that.

> I also want some documentation about the batching trickery
> used please.
> What is batched, when, how is batching flushed, why are we
> only batching in-order ...

I'm not sure I get things like this, what you want seems to be the
behaviour of the device which has been stated by the spec or I may
miss something here.

>
>
>
>
> >       /* Number we've added since last sync. */
> >       unsigned int num_added;
> >
> > @@ -256,10 +266,14 @@ static void vring_free(struct virtqueue *_vq);
> >
> >  #define to_vvq(_vq) container_of_const(_vq, struct vring_virtqueue, vq)
> >
> > -
> >  static inline bool virtqueue_is_packed(const struct vring_virtqueue *vq)
> >  {
> > -     return vq->layout == PACKED;
> > +     return vq->layout == PACKED || vq->layout == PACKED_IN_ORDER;
> > +}
> > +
> > +static inline bool virtqueue_is_in_order(const struct vring_virtqueue *vq)
> > +{
> > +     return vq->layout == SPLIT_IN_ORDER || vq->layout == PACKED_IN_ORDER;
> >  }
> >
> >  static bool virtqueue_use_indirect(const struct vring_virtqueue *vq,
> > @@ -570,7 +584,7 @@ static inline int virtqueue_add_split(struct vring_virtqueue *vq,
> >       struct vring_desc_extra *extra;
> >       struct scatterlist *sg;
> >       struct vring_desc *desc;
> > -     unsigned int i, n, c, avail, descs_used, err_idx;
> > +     unsigned int i, n, c, avail, descs_used, err_idx, total_len = 0;
>
>
> I would add a comment here:
>
> /* Total length for in-order */
> unsigned int total_len = 0;

Ok.

Thanks

Re: [PATCH V3 19/19] virtio_ring: add in order support

Posted by Michael S. Tsirkin 7 months, 1 week ago

On Wed, Jul 02, 2025 at 05:29:18PM +0800, Jason Wang wrote:
> On Tue, Jul 1, 2025 at 2:57 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> >
> > On Mon, Jun 16, 2025 at 04:25:17PM +0800, Jason Wang wrote:
> > > This patch implements in order support for both split virtqueue and
> > > packed virtqueue.
> >
> > I'd like to see more motivation for this work, documented.
> > It's not really performance, not as it stands, see below:
> >
> > >
> > > Benchmark with KVM guest + testpmd on the host shows:
> > >
> > > For split virtqueue: no obvious differences were noticed
> > >
> > > For packed virtqueue:
> > >
> > > 1) RX gets 3.1% PPS improvements from 6.3 Mpps to 6.5 Mpps
> > > 2) TX gets 4.6% PPS improvements from 8.6 Mpps to 9.0 Mpps
> > >
> >
> > That's a very modest improvement for a lot of code.
> > I also note you put in some batching just for in-order.
> > Which could also explain the gains maybe?
> > What if you just put in a simple implementation with no
> > batching tricks? do you still see a gain?
> 
> It is used to implement the batch used updating.
> 
> """
> Some devices always use descriptors in the same order in which they
> have been made available. These devices can offer the
> VIRTIO_F_IN_ORDER feature. If negotiated, this knowledge allows
> devices to notify the use of a batch of buffers to the driver by only
> writing out a single used ring entry with the id corresponding to the
> head entry of the descriptor chain describing the last buffer in the
> batch.
> """
> 
> DPDK implements this behavior, so it's a must for the virtio driver.
> 
> > Does any hardware implement this? Maybe that can demonstrate
> > bigger gains.
> 
> Maybe but I don't have one in my hand.
> 
> For performance, I think it should be sufficient as a starter. I can
> say in the next version something like "more optimizations could be
> done on top"

What are some optimizations you have in mind?



> Note that the patch that introduces packed virtqueue, there's not even
> any numbers:
> 
> commit 1ce9e6055fa0a9043405c5604cf19169ec5379ff
> Author: Tiwei Bie <tiwei.bie@intel.com>
> Date:   Wed Nov 21 18:03:27 2018 +0800
> 
>     virtio_ring: introduce packed ring support
> 
>     Introduce the packed ring support. Packed ring can only be
>     created by vring_create_virtqueue() and each chunk of packed
>     ring will be allocated individually. Packed ring can not be
>     created on preallocated memory by vring_new_virtqueue() or
>     the likes currently.
> 
>     Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
>     Signed-off-by: David S. Miller <davem@davemloft.net>


I think the assumption there was that intel has hardware that
requires packed. That's why Dave merged this.

> >
> >
> > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > > ---
> > >  drivers/virtio/virtio_ring.c | 423 +++++++++++++++++++++++++++++++++--
> > >  1 file changed, 402 insertions(+), 21 deletions(-)
> > >
> > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > > index 27a9459a0555..21d456392ba0 100644
> > > --- a/drivers/virtio/virtio_ring.c
> > > +++ b/drivers/virtio/virtio_ring.c
> > > @@ -70,11 +70,14 @@
> > >  enum vq_layout {
> > >       SPLIT = 0,
> > >       PACKED,
> > > +     SPLIT_IN_ORDER,
> > > +     PACKED_IN_ORDER,
> > >       VQ_TYPE_MAX,
> > >  };
> > >
> > >  struct vring_desc_state_split {
> > >       void *data;                     /* Data for callback. */
> > > +     u32 total_len;                  /* Buffer Length */
> > >
> > >       /* Indirect desc table and extra table, if any. These two will be
> > >        * allocated together. So we won't stress more to the memory allocator.
> > > @@ -84,6 +87,7 @@ struct vring_desc_state_split {
> > >
> > >  struct vring_desc_state_packed {
> > >       void *data;                     /* Data for callback. */
> > > +     u32 total_len;                  /* Buffer Length */
> > >
> > >       /* Indirect desc table and extra table, if any. These two will be
> > >        * allocated together. So we won't stress more to the memory allocator.
> >
> > We are bloating up the cache footprint for everyone,
> > so there's a chance of regressions.
> > Pls include benchmark for in order off, to make sure we
> > are not regressing.
> 
> Ok.
> 
> > How big was the ring?
> 
> 256.

that is very modest, you want to fill at least one cache way,
preferably more.

> > Worth trying with a biggish one, where there is more cache
> > pressure.
> 
> Ok.
> 
> >
> >
> > Why not have a separate state for in-order?
> 
> It can work.
> 
> >
> >
> >
> > > @@ -206,6 +210,12 @@ struct vring_virtqueue {
> > >
> > >       /* Head of free buffer list. */
> > >       unsigned int free_head;
> > > +
> > > +     /* Head of the batched used buffers, vq->num means no batching */
> > > +     unsigned int batch_head;
> > > +
> > > +     unsigned int batch_len;
> > > +
> >
> > Are these two only used for in-order? Please document that.
> 
> Yes, I will do that.
> 
> > I also want some documentation about the batching trickery
> > used please.
> > What is batched, when, how is batching flushed, why are we
> > only batching in-order ...
> 
> I'm not sure I get things like this, what you want seems to be the
> behaviour of the device which has been stated by the spec or I may
> miss something here.

"a single used ring entry with the id corresponding to the
 head entry of the descriptor chain describing the last buffer in the
 batch"
?

so together they form this used ring entry describing the last buffer?
"head" is the id and "len" the length?

maybe

	/*
	 * With IN_ORDER, devices write a single used ring entry with
	 * the id corresponding to the head entry of the descriptor chain
	 * describing the last buffer in the batch
	 */
	struct used_entry {
		u32 id;
		u32 len;
	} batch_last;

?




> >
> >
> >
> >
> > >       /* Number we've added since last sync. */
> > >       unsigned int num_added;
> > >
> > > @@ -256,10 +266,14 @@ static void vring_free(struct virtqueue *_vq);
> > >
> > >  #define to_vvq(_vq) container_of_const(_vq, struct vring_virtqueue, vq)
> > >
> > > -
> > >  static inline bool virtqueue_is_packed(const struct vring_virtqueue *vq)
> > >  {
> > > -     return vq->layout == PACKED;
> > > +     return vq->layout == PACKED || vq->layout == PACKED_IN_ORDER;
> > > +}
> > > +
> > > +static inline bool virtqueue_is_in_order(const struct vring_virtqueue *vq)
> > > +{
> > > +     return vq->layout == SPLIT_IN_ORDER || vq->layout == PACKED_IN_ORDER;
> > >  }
> > >
> > >  static bool virtqueue_use_indirect(const struct vring_virtqueue *vq,
> > > @@ -570,7 +584,7 @@ static inline int virtqueue_add_split(struct vring_virtqueue *vq,
> > >       struct vring_desc_extra *extra;
> > >       struct scatterlist *sg;
> > >       struct vring_desc *desc;
> > > -     unsigned int i, n, c, avail, descs_used, err_idx;
> > > +     unsigned int i, n, c, avail, descs_used, err_idx, total_len = 0;
> >
> >
> > I would add a comment here:
> >
> > /* Total length for in-order */
> > unsigned int total_len = 0;
> 
> Ok.
> 
> Thanks

Re: [PATCH V3 19/19] virtio_ring: add in order support

Posted by Jason Wang 7 months, 1 week ago

On Wed, Jul 2, 2025 at 6:57 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Wed, Jul 02, 2025 at 05:29:18PM +0800, Jason Wang wrote:
> > On Tue, Jul 1, 2025 at 2:57 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > >
> > > On Mon, Jun 16, 2025 at 04:25:17PM +0800, Jason Wang wrote:
> > > > This patch implements in order support for both split virtqueue and
> > > > packed virtqueue.
> > >
> > > I'd like to see more motivation for this work, documented.
> > > It's not really performance, not as it stands, see below:
> > >
> > > >
> > > > Benchmark with KVM guest + testpmd on the host shows:
> > > >
> > > > For split virtqueue: no obvious differences were noticed
> > > >
> > > > For packed virtqueue:
> > > >
> > > > 1) RX gets 3.1% PPS improvements from 6.3 Mpps to 6.5 Mpps
> > > > 2) TX gets 4.6% PPS improvements from 8.6 Mpps to 9.0 Mpps
> > > >
> > >
> > > That's a very modest improvement for a lot of code.
> > > I also note you put in some batching just for in-order.
> > > Which could also explain the gains maybe?
> > > What if you just put in a simple implementation with no
> > > batching tricks? do you still see a gain?
> >
> > It is used to implement the batch used updating.
> >
> > """
> > Some devices always use descriptors in the same order in which they
> > have been made available. These devices can offer the
> > VIRTIO_F_IN_ORDER feature. If negotiated, this knowledge allows
> > devices to notify the use of a batch of buffers to the driver by only
> > writing out a single used ring entry with the id corresponding to the
> > head entry of the descriptor chain describing the last buffer in the
> > batch.
> > """
> >
> > DPDK implements this behavior, so it's a must for the virtio driver.
> >
> > > Does any hardware implement this? Maybe that can demonstrate
> > > bigger gains.
> >
> > Maybe but I don't have one in my hand.
> >
> > For performance, I think it should be sufficient as a starter. I can
> > say in the next version something like "more optimizations could be
> > done on top"
>
> What are some optimizations you have in mind?

One thing in my mind, spec currently said:

"""
 If negotiated, this knowledge allows devices to notify the use of a
batch of buffers to the driver by only writing out a single used
descriptor with the Buffer ID corresponding to the last descriptor in
the batch.
"""

If the device writes the last descriptor ID instead of the buffer ID
and skip the number of descriptors in the used ring. For split
virtqueue, the avail ring is not needed anymore. Device knows the
availability of buffers via avail_idx. In this way, we completely
eliminate the access of the available ring. This reduces the memory
access which is expensive for both:

1) kernel vhost-net where small user space memory access is expensive
2) hardware PCI transactions

Does this make sense?

>
>
>
> > Note that the patch that introduces packed virtqueue, there's not even
> > any numbers:
> >
> > commit 1ce9e6055fa0a9043405c5604cf19169ec5379ff
> > Author: Tiwei Bie <tiwei.bie@intel.com>
> > Date:   Wed Nov 21 18:03:27 2018 +0800
> >
> >     virtio_ring: introduce packed ring support
> >
> >     Introduce the packed ring support. Packed ring can only be
> >     created by vring_create_virtqueue() and each chunk of packed
> >     ring will be allocated individually. Packed ring can not be
> >     created on preallocated memory by vring_new_virtqueue() or
> >     the likes currently.
> >
> >     Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
> >     Signed-off-by: David S. Miller <davem@davemloft.net>
>
>
> I think the assumption there was that intel has hardware that
> requires packed. That's why Dave merged this.

I think it should according to Qemu patch:

commit c03213fdc9d7b680cc575cd1e725750702a10b09
Author: Jonah Palmer <jonah.palmer@oracle.com>
Date:   Wed Jul 10 08:55:18 2024 -0400

    vhost,vhost-user: Add VIRTIO_F_IN_ORDER to vhost feature bits

    Add support for the VIRTIO_F_IN_ORDER feature across a variety of vhost
    devices.

    The inclusion of VIRTIO_F_IN_ORDER in the feature bits arrays for these
    devices ensures that the backend is capable of offering and providing
    support for this feature, and that it can be disabled if the backend
    does not support it.

    Acked-by: Eugenio Pérez <eperezma@redhat.com>
    Signed-off-by: Jonah Palmer <jonah.palmer@oracle.com>
    Message-Id: <20240710125522.4168043-6-jonah.palmer@oracle.com>
    Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
    Signed-off-by: Michael S. Tsirkin <mst@redhat.com>

Adding Jonah for more thought here.

>
> > >
> > >
> > > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > > > ---
> > > >  drivers/virtio/virtio_ring.c | 423 +++++++++++++++++++++++++++++++++--
> > > >  1 file changed, 402 insertions(+), 21 deletions(-)
> > > >
> > > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > > > index 27a9459a0555..21d456392ba0 100644
> > > > --- a/drivers/virtio/virtio_ring.c
> > > > +++ b/drivers/virtio/virtio_ring.c
> > > > @@ -70,11 +70,14 @@
> > > >  enum vq_layout {
> > > >       SPLIT = 0,
> > > >       PACKED,
> > > > +     SPLIT_IN_ORDER,
> > > > +     PACKED_IN_ORDER,
> > > >       VQ_TYPE_MAX,
> > > >  };
> > > >
> > > >  struct vring_desc_state_split {
> > > >       void *data;                     /* Data for callback. */
> > > > +     u32 total_len;                  /* Buffer Length */
> > > >
> > > >       /* Indirect desc table and extra table, if any. These two will be
> > > >        * allocated together. So we won't stress more to the memory allocator.
> > > > @@ -84,6 +87,7 @@ struct vring_desc_state_split {
> > > >
> > > >  struct vring_desc_state_packed {
> > > >       void *data;                     /* Data for callback. */
> > > > +     u32 total_len;                  /* Buffer Length */
> > > >
> > > >       /* Indirect desc table and extra table, if any. These two will be
> > > >        * allocated together. So we won't stress more to the memory allocator.
> > >
> > > We are bloating up the cache footprint for everyone,
> > > so there's a chance of regressions.
> > > Pls include benchmark for in order off, to make sure we
> > > are not regressing.
> >
> > Ok.
> >
> > > How big was the ring?
> >
> > 256.
>
> that is very modest, you want to fill at least one cache way,
> preferably more.

I can test larger queue sizes.

>
> > > Worth trying with a biggish one, where there is more cache
> > > pressure.
> >
> > Ok.
> >
> > >
> > >
> > > Why not have a separate state for in-order?
> >
> > It can work.
> >
> > >
> > >
> > >
> > > > @@ -206,6 +210,12 @@ struct vring_virtqueue {
> > > >
> > > >       /* Head of free buffer list. */
> > > >       unsigned int free_head;
> > > > +
> > > > +     /* Head of the batched used buffers, vq->num means no batching */
> > > > +     unsigned int batch_head;
> > > > +
> > > > +     unsigned int batch_len;
> > > > +
> > >
> > > Are these two only used for in-order? Please document that.
> >
> > Yes, I will do that.
> >
> > > I also want some documentation about the batching trickery
> > > used please.
> > > What is batched, when, how is batching flushed, why are we
> > > only batching in-order ...
> >
> > I'm not sure I get things like this, what you want seems to be the
> > behaviour of the device which has been stated by the spec or I may
> > miss something here.
>
> "a single used ring entry with the id corresponding to the
>  head entry of the descriptor chain describing the last buffer in the
>  batch"
> ?

Exactly.

>
> so together they form this used ring entry describing the last buffer?
> "head" is the id and "len" the length?

Yes.

>
> maybe
>
>         /*
>          * With IN_ORDER, devices write a single used ring entry with
>          * the id corresponding to the head entry of the descriptor chain
>          * describing the last buffer in the batch
>          */
>         struct used_entry {
>                 u32 id;
>                 u32 len;
>         } batch_last;
>
> ?

This should be fine.

>
>
>
>
> > >
> > >
> > >
> > >
> > > >       /* Number we've added since last sync. */
> > > >       unsigned int num_added;
> > > >
> > > > @@ -256,10 +266,14 @@ static void vring_free(struct virtqueue *_vq);
> > > >
> > > >  #define to_vvq(_vq) container_of_const(_vq, struct vring_virtqueue, vq)
> > > >
> > > > -
> > > >  static inline bool virtqueue_is_packed(const struct vring_virtqueue *vq)
> > > >  {
> > > > -     return vq->layout == PACKED;
> > > > +     return vq->layout == PACKED || vq->layout == PACKED_IN_ORDER;
> > > > +}
> > > > +
> > > > +static inline bool virtqueue_is_in_order(const struct vring_virtqueue *vq)
> > > > +{
> > > > +     return vq->layout == SPLIT_IN_ORDER || vq->layout == PACKED_IN_ORDER;
> > > >  }
> > > >
> > > >  static bool virtqueue_use_indirect(const struct vring_virtqueue *vq,
> > > > @@ -570,7 +584,7 @@ static inline int virtqueue_add_split(struct vring_virtqueue *vq,
> > > >       struct vring_desc_extra *extra;
> > > >       struct scatterlist *sg;
> > > >       struct vring_desc *desc;
> > > > -     unsigned int i, n, c, avail, descs_used, err_idx;
> > > > +     unsigned int i, n, c, avail, descs_used, err_idx, total_len = 0;
> > >
> > >
> > > I would add a comment here:
> > >
> > > /* Total length for in-order */
> > > unsigned int total_len = 0;
> >
> > Ok.
> >
> > Thanks
>

Re: [PATCH V3 19/19] virtio_ring: add in order support

Posted by Jonah Palmer 7 months, 1 week ago


On 7/2/25 11:13 PM, Jason Wang wrote:
> On Wed, Jul 2, 2025 at 6:57 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>>
>> On Wed, Jul 02, 2025 at 05:29:18PM +0800, Jason Wang wrote:
>>> On Tue, Jul 1, 2025 at 2:57 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>>>>
>>>> On Mon, Jun 16, 2025 at 04:25:17PM +0800, Jason Wang wrote:
>>>>> This patch implements in order support for both split virtqueue and
>>>>> packed virtqueue.
>>>>
>>>> I'd like to see more motivation for this work, documented.
>>>> It's not really performance, not as it stands, see below:
>>>>
>>>>>
>>>>> Benchmark with KVM guest + testpmd on the host shows:
>>>>>
>>>>> For split virtqueue: no obvious differences were noticed
>>>>>
>>>>> For packed virtqueue:
>>>>>
>>>>> 1) RX gets 3.1% PPS improvements from 6.3 Mpps to 6.5 Mpps
>>>>> 2) TX gets 4.6% PPS improvements from 8.6 Mpps to 9.0 Mpps
>>>>>
>>>>
>>>> That's a very modest improvement for a lot of code.
>>>> I also note you put in some batching just for in-order.
>>>> Which could also explain the gains maybe?
>>>> What if you just put in a simple implementation with no
>>>> batching tricks? do you still see a gain?
>>>
>>> It is used to implement the batch used updating.
>>>
>>> """
>>> Some devices always use descriptors in the same order in which they
>>> have been made available. These devices can offer the
>>> VIRTIO_F_IN_ORDER feature. If negotiated, this knowledge allows
>>> devices to notify the use of a batch of buffers to the driver by only
>>> writing out a single used ring entry with the id corresponding to the
>>> head entry of the descriptor chain describing the last buffer in the
>>> batch.
>>> """
>>>
>>> DPDK implements this behavior, so it's a must for the virtio driver.
>>>
>>>> Does any hardware implement this? Maybe that can demonstrate
>>>> bigger gains.
>>>
>>> Maybe but I don't have one in my hand.
>>>
>>> For performance, I think it should be sufficient as a starter. I can
>>> say in the next version something like "more optimizations could be
>>> done on top"
>>
>> What are some optimizations you have in mind?
> 
> One thing in my mind, spec currently said:
> 
> """
>   If negotiated, this knowledge allows devices to notify the use of a
> batch of buffers to the driver by only writing out a single used
> descriptor with the Buffer ID corresponding to the last descriptor in
> the batch.
> """
> 
> If the device writes the last descriptor ID instead of the buffer ID
> and skip the number of descriptors in the used ring. For split
> virtqueue, the avail ring is not needed anymore. Device knows the
> availability of buffers via avail_idx. In this way, we completely
> eliminate the access of the available ring. This reduces the memory
> access which is expensive for both:
> 
> 1) kernel vhost-net where small user space memory access is expensive
> 2) hardware PCI transactions
> 
> Does this make sense?
> 
>>
>>
>>
>>> Note that the patch that introduces packed virtqueue, there's not even
>>> any numbers:
>>>
>>> commit 1ce9e6055fa0a9043405c5604cf19169ec5379ff
>>> Author: Tiwei Bie <tiwei.bie@intel.com>
>>> Date:   Wed Nov 21 18:03:27 2018 +0800
>>>
>>>      virtio_ring: introduce packed ring support
>>>
>>>      Introduce the packed ring support. Packed ring can only be
>>>      created by vring_create_virtqueue() and each chunk of packed
>>>      ring will be allocated individually. Packed ring can not be
>>>      created on preallocated memory by vring_new_virtqueue() or
>>>      the likes currently.
>>>
>>>      Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
>>>      Signed-off-by: David S. Miller <davem@davemloft.net>
>>
>>
>> I think the assumption there was that intel has hardware that
>> requires packed. That's why Dave merged this.
> 
> I think it should according to Qemu patch:
> 
> commit c03213fdc9d7b680cc575cd1e725750702a10b09
> Author: Jonah Palmer <jonah.palmer@oracle.com>
> Date:   Wed Jul 10 08:55:18 2024 -0400
> 
>      vhost,vhost-user: Add VIRTIO_F_IN_ORDER to vhost feature bits
> 
>      Add support for the VIRTIO_F_IN_ORDER feature across a variety of vhost
>      devices.
> 
>      The inclusion of VIRTIO_F_IN_ORDER in the feature bits arrays for these
>      devices ensures that the backend is capable of offering and providing
>      support for this feature, and that it can be disabled if the backend
>      does not support it.
> 
>      Acked-by: Eugenio Pérez <eperezma@redhat.com>
>      Signed-off-by: Jonah Palmer <jonah.palmer@oracle.com>
>      Message-Id: <20240710125522.4168043-6-jonah.palmer@oracle.com>
>      Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
>      Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> 
> Adding Jonah for more thought here.
> 

Hi. By "it should", are you referring to Intel having hardware requiring 
this feature? Sorry, just having a bit of trouble following.

In any case, this looks like a good first implementation that can be 
used as a foundation for future implementations to further improve its 
performance.

This was the thought process I had when I implemented this feature in 
Qemu. That is, get a solid framework down that supports this feature 
(which had some small, modest performance improvements for some devices) 
and then add on future patches (perhaps device-specific and/or general 
implementations) that take advantage of the fact that data follows this 
FIFO model.

As Jason mentioned, one such future implementation could remove the need 
for the use of the avail ring in the split VQ case since the device 
wouldn't need to read it to learn which descriptor comes next.

Another example could be with vhost-net / vhost-vdpa. Currently each 
queue tracks 3 separate indices and keeps a per-descriptor bookkeeping 
table to handle buffers completing out of order. If the backend knows 
data is FIFO, we might be able to drop these trackers and just use a 
head and tail counter with a single contiguous iovec ring. This could 
result in a smaller cache footprint and fewer DMAs.

Jonah

>>
>>>>
>>>>
>>>>> Signed-off-by: Jason Wang <jasowang@redhat.com>
>>>>> ---
>>>>>   drivers/virtio/virtio_ring.c | 423 +++++++++++++++++++++++++++++++++--
>>>>>   1 file changed, 402 insertions(+), 21 deletions(-)
>>>>>
>>>>> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
>>>>> index 27a9459a0555..21d456392ba0 100644
>>>>> --- a/drivers/virtio/virtio_ring.c
>>>>> +++ b/drivers/virtio/virtio_ring.c
>>>>> @@ -70,11 +70,14 @@
>>>>>   enum vq_layout {
>>>>>        SPLIT = 0,
>>>>>        PACKED,
>>>>> +     SPLIT_IN_ORDER,
>>>>> +     PACKED_IN_ORDER,
>>>>>        VQ_TYPE_MAX,
>>>>>   };
>>>>>
>>>>>   struct vring_desc_state_split {
>>>>>        void *data;                     /* Data for callback. */
>>>>> +     u32 total_len;                  /* Buffer Length */
>>>>>
>>>>>        /* Indirect desc table and extra table, if any. These two will be
>>>>>         * allocated together. So we won't stress more to the memory allocator.
>>>>> @@ -84,6 +87,7 @@ struct vring_desc_state_split {
>>>>>
>>>>>   struct vring_desc_state_packed {
>>>>>        void *data;                     /* Data for callback. */
>>>>> +     u32 total_len;                  /* Buffer Length */
>>>>>
>>>>>        /* Indirect desc table and extra table, if any. These two will be
>>>>>         * allocated together. So we won't stress more to the memory allocator.
>>>>
>>>> We are bloating up the cache footprint for everyone,
>>>> so there's a chance of regressions.
>>>> Pls include benchmark for in order off, to make sure we
>>>> are not regressing.
>>>
>>> Ok.
>>>
>>>> How big was the ring?
>>>
>>> 256.
>>
>> that is very modest, you want to fill at least one cache way,
>> preferably more.
> 
> I can test larger queue sizes.
> 
>>
>>>> Worth trying with a biggish one, where there is more cache
>>>> pressure.
>>>
>>> Ok.
>>>
>>>>
>>>>
>>>> Why not have a separate state for in-order?
>>>
>>> It can work.
>>>
>>>>
>>>>
>>>>
>>>>> @@ -206,6 +210,12 @@ struct vring_virtqueue {
>>>>>
>>>>>        /* Head of free buffer list. */
>>>>>        unsigned int free_head;
>>>>> +
>>>>> +     /* Head of the batched used buffers, vq->num means no batching */
>>>>> +     unsigned int batch_head;
>>>>> +
>>>>> +     unsigned int batch_len;
>>>>> +
>>>>
>>>> Are these two only used for in-order? Please document that.
>>>
>>> Yes, I will do that.
>>>
>>>> I also want some documentation about the batching trickery
>>>> used please.
>>>> What is batched, when, how is batching flushed, why are we
>>>> only batching in-order ...
>>>
>>> I'm not sure I get things like this, what you want seems to be the
>>> behaviour of the device which has been stated by the spec or I may
>>> miss something here.
>>
>> "a single used ring entry with the id corresponding to the
>>   head entry of the descriptor chain describing the last buffer in the
>>   batch"
>> ?
> 
> Exactly.
> 
>>
>> so together they form this used ring entry describing the last buffer?
>> "head" is the id and "len" the length?
> 
> Yes.
> 
>>
>> maybe
>>
>>          /*
>>           * With IN_ORDER, devices write a single used ring entry with
>>           * the id corresponding to the head entry of the descriptor chain
>>           * describing the last buffer in the batch
>>           */
>>          struct used_entry {
>>                  u32 id;
>>                  u32 len;
>>          } batch_last;
>>
>> ?
> 
> This should be fine.
> 
>>
>>
>>
>>
>>>>
>>>>
>>>>
>>>>
>>>>>        /* Number we've added since last sync. */
>>>>>        unsigned int num_added;
>>>>>
>>>>> @@ -256,10 +266,14 @@ static void vring_free(struct virtqueue *_vq);
>>>>>
>>>>>   #define to_vvq(_vq) container_of_const(_vq, struct vring_virtqueue, vq)
>>>>>
>>>>> -
>>>>>   static inline bool virtqueue_is_packed(const struct vring_virtqueue *vq)
>>>>>   {
>>>>> -     return vq->layout == PACKED;
>>>>> +     return vq->layout == PACKED || vq->layout == PACKED_IN_ORDER;
>>>>> +}
>>>>> +
>>>>> +static inline bool virtqueue_is_in_order(const struct vring_virtqueue *vq)
>>>>> +{
>>>>> +     return vq->layout == SPLIT_IN_ORDER || vq->layout == PACKED_IN_ORDER;
>>>>>   }
>>>>>
>>>>>   static bool virtqueue_use_indirect(const struct vring_virtqueue *vq,
>>>>> @@ -570,7 +584,7 @@ static inline int virtqueue_add_split(struct vring_virtqueue *vq,
>>>>>        struct vring_desc_extra *extra;
>>>>>        struct scatterlist *sg;
>>>>>        struct vring_desc *desc;
>>>>> -     unsigned int i, n, c, avail, descs_used, err_idx;
>>>>> +     unsigned int i, n, c, avail, descs_used, err_idx, total_len = 0;
>>>>
>>>>
>>>> I would add a comment here:
>>>>
>>>> /* Total length for in-order */
>>>> unsigned int total_len = 0;
>>>
>>> Ok.
>>>
>>> Thanks
>>
>

Re: [PATCH V3 19/19] virtio_ring: add in order support

Posted by Jason Wang 7 months, 1 week ago

On Thu, Jul 3, 2025 at 9:50 PM Jonah Palmer <jonah.palmer@oracle.com> wrote:
>
>
>
> On 7/2/25 11:13 PM, Jason Wang wrote:
> > On Wed, Jul 2, 2025 at 6:57 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> >>
> >> On Wed, Jul 02, 2025 at 05:29:18PM +0800, Jason Wang wrote:
> >>> On Tue, Jul 1, 2025 at 2:57 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> >>>>
> >>>> On Mon, Jun 16, 2025 at 04:25:17PM +0800, Jason Wang wrote:
> >>>>> This patch implements in order support for both split virtqueue and
> >>>>> packed virtqueue.
> >>>>
> >>>> I'd like to see more motivation for this work, documented.
> >>>> It's not really performance, not as it stands, see below:
> >>>>
> >>>>>
> >>>>> Benchmark with KVM guest + testpmd on the host shows:
> >>>>>
> >>>>> For split virtqueue: no obvious differences were noticed
> >>>>>
> >>>>> For packed virtqueue:
> >>>>>
> >>>>> 1) RX gets 3.1% PPS improvements from 6.3 Mpps to 6.5 Mpps
> >>>>> 2) TX gets 4.6% PPS improvements from 8.6 Mpps to 9.0 Mpps
> >>>>>
> >>>>
> >>>> That's a very modest improvement for a lot of code.
> >>>> I also note you put in some batching just for in-order.
> >>>> Which could also explain the gains maybe?
> >>>> What if you just put in a simple implementation with no
> >>>> batching tricks? do you still see a gain?
> >>>
> >>> It is used to implement the batch used updating.
> >>>
> >>> """
> >>> Some devices always use descriptors in the same order in which they
> >>> have been made available. These devices can offer the
> >>> VIRTIO_F_IN_ORDER feature. If negotiated, this knowledge allows
> >>> devices to notify the use of a batch of buffers to the driver by only
> >>> writing out a single used ring entry with the id corresponding to the
> >>> head entry of the descriptor chain describing the last buffer in the
> >>> batch.
> >>> """
> >>>
> >>> DPDK implements this behavior, so it's a must for the virtio driver.
> >>>
> >>>> Does any hardware implement this? Maybe that can demonstrate
> >>>> bigger gains.
> >>>
> >>> Maybe but I don't have one in my hand.
> >>>
> >>> For performance, I think it should be sufficient as a starter. I can
> >>> say in the next version something like "more optimizations could be
> >>> done on top"
> >>
> >> What are some optimizations you have in mind?
> >
> > One thing in my mind, spec currently said:
> >
> > """
> >   If negotiated, this knowledge allows devices to notify the use of a
> > batch of buffers to the driver by only writing out a single used
> > descriptor with the Buffer ID corresponding to the last descriptor in
> > the batch.
> > """
> >
> > If the device writes the last descriptor ID instead of the buffer ID
> > and skip the number of descriptors in the used ring. For split
> > virtqueue, the avail ring is not needed anymore. Device knows the
> > availability of buffers via avail_idx. In this way, we completely
> > eliminate the access of the available ring. This reduces the memory
> > access which is expensive for both:
> >
> > 1) kernel vhost-net where small user space memory access is expensive
> > 2) hardware PCI transactions
> >
> > Does this make sense?
> >
> >>
> >>
> >>
> >>> Note that the patch that introduces packed virtqueue, there's not even
> >>> any numbers:
> >>>
> >>> commit 1ce9e6055fa0a9043405c5604cf19169ec5379ff
> >>> Author: Tiwei Bie <tiwei.bie@intel.com>
> >>> Date:   Wed Nov 21 18:03:27 2018 +0800
> >>>
> >>>      virtio_ring: introduce packed ring support
> >>>
> >>>      Introduce the packed ring support. Packed ring can only be
> >>>      created by vring_create_virtqueue() and each chunk of packed
> >>>      ring will be allocated individually. Packed ring can not be
> >>>      created on preallocated memory by vring_new_virtqueue() or
> >>>      the likes currently.
> >>>
> >>>      Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
> >>>      Signed-off-by: David S. Miller <davem@davemloft.net>
> >>
> >>
> >> I think the assumption there was that intel has hardware that
> >> requires packed. That's why Dave merged this.
> >
> > I think it should according to Qemu patch:
> >
> > commit c03213fdc9d7b680cc575cd1e725750702a10b09
> > Author: Jonah Palmer <jonah.palmer@oracle.com>
> > Date:   Wed Jul 10 08:55:18 2024 -0400
> >
> >      vhost,vhost-user: Add VIRTIO_F_IN_ORDER to vhost feature bits
> >
> >      Add support for the VIRTIO_F_IN_ORDER feature across a variety of vhost
> >      devices.
> >
> >      The inclusion of VIRTIO_F_IN_ORDER in the feature bits arrays for these
> >      devices ensures that the backend is capable of offering and providing
> >      support for this feature, and that it can be disabled if the backend
> >      does not support it.
> >
> >      Acked-by: Eugenio Pérez <eperezma@redhat.com>
> >      Signed-off-by: Jonah Palmer <jonah.palmer@oracle.com>
> >      Message-Id: <20240710125522.4168043-6-jonah.palmer@oracle.com>
> >      Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
> >      Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> >
> > Adding Jonah for more thought here.
> >
>
> Hi. By "it should", are you referring to Intel having hardware requiring
> this feature? Sorry, just having a bit of trouble following.

I meant I guess you may have the hardware that supports IN_ORDER.

>
> In any case, this looks like a good first implementation that can be
> used as a foundation for future implementations to further improve its
> performance.

Right.

>
> This was the thought process I had when I implemented this feature in
> Qemu. That is, get a solid framework down that supports this feature
> (which had some small, modest performance improvements for some devices)
> and then add on future patches (perhaps device-specific and/or general
> implementations) that take advantage of the fact that data follows this
> FIFO model.
>
> As Jason mentioned, one such future implementation could remove the need
> for the use of the avail ring in the split VQ case since the device
> wouldn't need to read it to learn which descriptor comes next.

Right.

>
> Another example could be with vhost-net / vhost-vdpa. Currently each
> queue tracks 3 separate indices and keeps a per-descriptor bookkeeping
> table to handle buffers completing out of order. If the backend knows
> data is FIFO, we might be able to drop these trackers and just use a
> head and tail counter with a single contiguous iovec ring. This could
> result in a smaller cache footprint and fewer DMAs.

Exactly.

So I had a quick hack on vhost-net to implement this optimization
(bypasses the reading of available ring). It gives us 15.8%
improvement with this series. DPDK gives me 10% PPS as well.

https://github.com/jasowang/net/commit/719ce1d24e3369cde9c5fcc8c6d7518ca9022e5f

I think it should be sufficient to move forward even without real
hardware support. Maybe vhost-net part can go first.

Thanks

>
> Jonah
>
> >>
> >>>>
> >>>>
> >>>>> Signed-off-by: Jason Wang <jasowang@redhat.com>
> >>>>> ---
> >>>>>   drivers/virtio/virtio_ring.c | 423 +++++++++++++++++++++++++++++++++--
> >>>>>   1 file changed, 402 insertions(+), 21 deletions(-)
> >>>>>
> >>>>> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> >>>>> index 27a9459a0555..21d456392ba0 100644
> >>>>> --- a/drivers/virtio/virtio_ring.c
> >>>>> +++ b/drivers/virtio/virtio_ring.c
> >>>>> @@ -70,11 +70,14 @@
> >>>>>   enum vq_layout {
> >>>>>        SPLIT = 0,
> >>>>>        PACKED,
> >>>>> +     SPLIT_IN_ORDER,
> >>>>> +     PACKED_IN_ORDER,
> >>>>>        VQ_TYPE_MAX,
> >>>>>   };
> >>>>>
> >>>>>   struct vring_desc_state_split {
> >>>>>        void *data;                     /* Data for callback. */
> >>>>> +     u32 total_len;                  /* Buffer Length */
> >>>>>
> >>>>>        /* Indirect desc table and extra table, if any. These two will be
> >>>>>         * allocated together. So we won't stress more to the memory allocator.
> >>>>> @@ -84,6 +87,7 @@ struct vring_desc_state_split {
> >>>>>
> >>>>>   struct vring_desc_state_packed {
> >>>>>        void *data;                     /* Data for callback. */
> >>>>> +     u32 total_len;                  /* Buffer Length */
> >>>>>
> >>>>>        /* Indirect desc table and extra table, if any. These two will be
> >>>>>         * allocated together. So we won't stress more to the memory allocator.
> >>>>
> >>>> We are bloating up the cache footprint for everyone,
> >>>> so there's a chance of regressions.
> >>>> Pls include benchmark for in order off, to make sure we
> >>>> are not regressing.
> >>>
> >>> Ok.
> >>>
> >>>> How big was the ring?
> >>>
> >>> 256.
> >>
> >> that is very modest, you want to fill at least one cache way,
> >> preferably more.
> >
> > I can test larger queue sizes.
> >
> >>
> >>>> Worth trying with a biggish one, where there is more cache
> >>>> pressure.
> >>>
> >>> Ok.
> >>>
> >>>>
> >>>>
> >>>> Why not have a separate state for in-order?
> >>>
> >>> It can work.
> >>>
> >>>>
> >>>>
> >>>>
> >>>>> @@ -206,6 +210,12 @@ struct vring_virtqueue {
> >>>>>
> >>>>>        /* Head of free buffer list. */
> >>>>>        unsigned int free_head;
> >>>>> +
> >>>>> +     /* Head of the batched used buffers, vq->num means no batching */
> >>>>> +     unsigned int batch_head;
> >>>>> +
> >>>>> +     unsigned int batch_len;
> >>>>> +
> >>>>
> >>>> Are these two only used for in-order? Please document that.
> >>>
> >>> Yes, I will do that.
> >>>
> >>>> I also want some documentation about the batching trickery
> >>>> used please.
> >>>> What is batched, when, how is batching flushed, why are we
> >>>> only batching in-order ...
> >>>
> >>> I'm not sure I get things like this, what you want seems to be the
> >>> behaviour of the device which has been stated by the spec or I may
> >>> miss something here.
> >>
> >> "a single used ring entry with the id corresponding to the
> >>   head entry of the descriptor chain describing the last buffer in the
> >>   batch"
> >> ?
> >
> > Exactly.
> >
> >>
> >> so together they form this used ring entry describing the last buffer?
> >> "head" is the id and "len" the length?
> >
> > Yes.
> >
> >>
> >> maybe
> >>
> >>          /*
> >>           * With IN_ORDER, devices write a single used ring entry with
> >>           * the id corresponding to the head entry of the descriptor chain
> >>           * describing the last buffer in the batch
> >>           */
> >>          struct used_entry {
> >>                  u32 id;
> >>                  u32 len;
> >>          } batch_last;
> >>
> >> ?
> >
> > This should be fine.
> >
> >>
> >>
> >>
> >>
> >>>>
> >>>>
> >>>>
> >>>>
> >>>>>        /* Number we've added since last sync. */
> >>>>>        unsigned int num_added;
> >>>>>
> >>>>> @@ -256,10 +266,14 @@ static void vring_free(struct virtqueue *_vq);
> >>>>>
> >>>>>   #define to_vvq(_vq) container_of_const(_vq, struct vring_virtqueue, vq)
> >>>>>
> >>>>> -
> >>>>>   static inline bool virtqueue_is_packed(const struct vring_virtqueue *vq)
> >>>>>   {
> >>>>> -     return vq->layout == PACKED;
> >>>>> +     return vq->layout == PACKED || vq->layout == PACKED_IN_ORDER;
> >>>>> +}
> >>>>> +
> >>>>> +static inline bool virtqueue_is_in_order(const struct vring_virtqueue *vq)
> >>>>> +{
> >>>>> +     return vq->layout == SPLIT_IN_ORDER || vq->layout == PACKED_IN_ORDER;
> >>>>>   }
> >>>>>
> >>>>>   static bool virtqueue_use_indirect(const struct vring_virtqueue *vq,
> >>>>> @@ -570,7 +584,7 @@ static inline int virtqueue_add_split(struct vring_virtqueue *vq,
> >>>>>        struct vring_desc_extra *extra;
> >>>>>        struct scatterlist *sg;
> >>>>>        struct vring_desc *desc;
> >>>>> -     unsigned int i, n, c, avail, descs_used, err_idx;
> >>>>> +     unsigned int i, n, c, avail, descs_used, err_idx, total_len = 0;
> >>>>
> >>>>
> >>>> I would add a comment here:
> >>>>
> >>>> /* Total length for in-order */
> >>>> unsigned int total_len = 0;
> >>>
> >>> Ok.
> >>>
> >>> Thanks
> >>
> >
>