[v1] vDPA shadow virtqueue

[PATCH 18/31] vhost: Shadow virtqueue buffers forwarding

Posted by Eugenio Pérez 4 years ago

Initial version of shadow virtqueue that actually forward buffers. There
is no iommu support at the moment, and that will be addressed in future
patches of this series. Since all vhost-vdpa devices use forced IOMMU,
this means that SVQ is not usable at this point of the series on any
device.

For simplicity it only supports modern devices, that expects vring
in little endian, with split ring and no event idx or indirect
descriptors. Support for them will not be added in this series.

It reuses the VirtQueue code for the device part. The driver part is
based on Linux's virtio_ring driver, but with stripped functionality
and optimizations so it's easier to review.

However, forwarding buffers have some particular pieces: One of the most
unexpected ones is that a guest's buffer can expand through more than
one descriptor in SVQ. While this is handled gracefully by qemu's
emulated virtio devices, it may cause unexpected SVQ queue full. This
patch also solves it by checking for this condition at both guest's
kicks and device's calls. The code may be more elegant in the future if
SVQ code runs in its own iocontext.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
---
 hw/virtio/vhost-shadow-virtqueue.h |   2 +
 hw/virtio/vhost-shadow-virtqueue.c | 365 ++++++++++++++++++++++++++++-
 hw/virtio/vhost-vdpa.c             | 111 ++++++++-
 3 files changed, 462 insertions(+), 16 deletions(-)

diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
index 39aef5ffdf..19c934af49 100644
--- a/hw/virtio/vhost-shadow-virtqueue.h
+++ b/hw/virtio/vhost-shadow-virtqueue.h
@@ -33,6 +33,8 @@ uint16_t vhost_svq_get_num(const VhostShadowVirtqueue *svq);
 size_t vhost_svq_driver_area_size(const VhostShadowVirtqueue *svq);
 size_t vhost_svq_device_area_size(const VhostShadowVirtqueue *svq);
 
+void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev,
+                     VirtQueue *vq);
 void vhost_svq_stop(VhostShadowVirtqueue *svq);
 
 VhostShadowVirtqueue *vhost_svq_new(uint16_t qsize);
diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
index 7c168075d7..a1a404f68f 100644
--- a/hw/virtio/vhost-shadow-virtqueue.c
+++ b/hw/virtio/vhost-shadow-virtqueue.c
@@ -9,6 +9,8 @@
 
 #include "qemu/osdep.h"
 #include "hw/virtio/vhost-shadow-virtqueue.h"
+#include "hw/virtio/vhost.h"
+#include "hw/virtio/virtio-access.h"
 #include "standard-headers/linux/vhost_types.h"
 
 #include "qemu/error-report.h"
@@ -36,6 +38,33 @@ typedef struct VhostShadowVirtqueue {
 
     /* Guest's call notifier, where SVQ calls guest. */
     EventNotifier svq_call;
+
+    /* Virtio queue shadowing */
+    VirtQueue *vq;
+
+    /* Virtio device */
+    VirtIODevice *vdev;
+
+    /* Map for returning guest's descriptors */
+    VirtQueueElement **ring_id_maps;
+
+    /* Next VirtQueue element that guest made available */
+    VirtQueueElement *next_guest_avail_elem;
+
+    /* Next head to expose to device */
+    uint16_t avail_idx_shadow;
+
+    /* Next free descriptor */
+    uint16_t free_head;
+
+    /* Last seen used idx */
+    uint16_t shadow_used_idx;
+
+    /* Next head to consume from device */
+    uint16_t last_used_idx;
+
+    /* Cache for the exposed notification flag */
+    bool notification;
 } VhostShadowVirtqueue;
 
 #define INVALID_SVQ_KICK_FD -1
@@ -148,30 +177,294 @@ bool vhost_svq_ack_guest_features(uint64_t dev_features,
     return true;
 }
 
-/* Forward guest notifications */
-static void vhost_handle_guest_kick(EventNotifier *n)
+/**
+ * Number of descriptors that SVQ can make available from the guest.
+ *
+ * @svq   The svq
+ */
+static uint16_t vhost_svq_available_slots(const VhostShadowVirtqueue *svq)
 {
-    VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue,
-                                             svq_kick);
+    return svq->vring.num - (svq->avail_idx_shadow - svq->shadow_used_idx);
+}
+
+static void vhost_svq_set_notification(VhostShadowVirtqueue *svq, bool enable)
+{
+    uint16_t notification_flag;
 
-    if (unlikely(!event_notifier_test_and_clear(n))) {
+    if (svq->notification == enable) {
+        return;
+    }
+
+    notification_flag = cpu_to_le16(VRING_AVAIL_F_NO_INTERRUPT);
+
+    svq->notification = enable;
+    if (enable) {
+        svq->vring.avail->flags &= ~notification_flag;
+    } else {
+        svq->vring.avail->flags |= notification_flag;
+    }
+}
+
+static void vhost_vring_write_descs(VhostShadowVirtqueue *svq,
+                                    const struct iovec *iovec,
+                                    size_t num, bool more_descs, bool write)
+{
+    uint16_t i = svq->free_head, last = svq->free_head;
+    unsigned n;
+    uint16_t flags = write ? cpu_to_le16(VRING_DESC_F_WRITE) : 0;
+    vring_desc_t *descs = svq->vring.desc;
+
+    if (num == 0) {
+        return;
+    }
+
+    for (n = 0; n < num; n++) {
+        if (more_descs || (n + 1 < num)) {
+            descs[i].flags = flags | cpu_to_le16(VRING_DESC_F_NEXT);
+        } else {
+            descs[i].flags = flags;
+        }
+        descs[i].addr = cpu_to_le64((hwaddr)iovec[n].iov_base);
+        descs[i].len = cpu_to_le32(iovec[n].iov_len);
+
+        last = i;
+        i = cpu_to_le16(descs[i].next);
+    }
+
+    svq->free_head = le16_to_cpu(descs[last].next);
+}
+
+static unsigned vhost_svq_add_split(VhostShadowVirtqueue *svq,
+                                    VirtQueueElement *elem)
+{
+    int head;
+    unsigned avail_idx;
+    vring_avail_t *avail = svq->vring.avail;
+
+    head = svq->free_head;
+
+    /* We need some descriptors here */
+    assert(elem->out_num || elem->in_num);
+
+    vhost_vring_write_descs(svq, elem->out_sg, elem->out_num,
+                            elem->in_num > 0, false);
+    vhost_vring_write_descs(svq, elem->in_sg, elem->in_num, false, true);
+
+    /*
+     * Put entry in available array (but don't update avail->idx until they
+     * do sync).
+     */
+    avail_idx = svq->avail_idx_shadow & (svq->vring.num - 1);
+    avail->ring[avail_idx] = cpu_to_le16(head);
+    svq->avail_idx_shadow++;
+
+    /* Update avail index after the descriptor is wrote */
+    smp_wmb();
+    avail->idx = cpu_to_le16(svq->avail_idx_shadow);
+
+    return head;
+}
+
+static void vhost_svq_add(VhostShadowVirtqueue *svq, VirtQueueElement *elem)
+{
+    unsigned qemu_head = vhost_svq_add_split(svq, elem);
+
+    svq->ring_id_maps[qemu_head] = elem;
+}
+
+static void vhost_svq_kick(VhostShadowVirtqueue *svq)
+{
+    /* We need to expose available array entries before checking used flags */
+    smp_mb();
+    if (svq->vring.used->flags & VRING_USED_F_NO_NOTIFY) {
         return;
     }
 
     event_notifier_set(&svq->hdev_kick);
 }
 
-/* Forward vhost notifications */
+/**
+ * Forward available buffers.
+ *
+ * @svq Shadow VirtQueue
+ *
+ * Note that this function does not guarantee that all guest's available
+ * buffers are available to the device in SVQ avail ring. The guest may have
+ * exposed a GPA / GIOVA congiuous buffer, but it may not be contiguous in qemu
+ * vaddr.
+ *
+ * If that happens, guest's kick notifications will be disabled until device
+ * makes some buffers used.
+ */
+static void vhost_handle_guest_kick(VhostShadowVirtqueue *svq)
+{
+    /* Clear event notifier */
+    event_notifier_test_and_clear(&svq->svq_kick);
+
+    /* Make available as many buffers as possible */
+    do {
+        if (virtio_queue_get_notification(svq->vq)) {
+            virtio_queue_set_notification(svq->vq, false);
+        }
+
+        while (true) {
+            VirtQueueElement *elem;
+
+            if (svq->next_guest_avail_elem) {
+                elem = g_steal_pointer(&svq->next_guest_avail_elem);
+            } else {
+                elem = virtqueue_pop(svq->vq, sizeof(*elem));
+            }
+
+            if (!elem) {
+                break;
+            }
+
+            if (elem->out_num + elem->in_num >
+                vhost_svq_available_slots(svq)) {
+                /*
+                 * This condition is possible since a contiguous buffer in GPA
+                 * does not imply a contiguous buffer in qemu's VA
+                 * scatter-gather segments. If that happen, the buffer exposed
+                 * to the device needs to be a chain of descriptors at this
+                 * moment.
+                 *
+                 * SVQ cannot hold more available buffers if we are here:
+                 * queue the current guest descriptor and ignore further kicks
+                 * until some elements are used.
+                 */
+                svq->next_guest_avail_elem = elem;
+                return;
+            }
+
+            vhost_svq_add(svq, elem);
+            vhost_svq_kick(svq);
+        }
+
+        virtio_queue_set_notification(svq->vq, true);
+    } while (!virtio_queue_empty(svq->vq));
+}
+
+/**
+ * Handle guest's kick.
+ *
+ * @n guest kick event notifier, the one that guest set to notify svq.
+ */
+static void vhost_handle_guest_kick_notifier(EventNotifier *n)
+{
+    VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue,
+                                             svq_kick);
+    vhost_handle_guest_kick(svq);
+}
+
+static bool vhost_svq_more_used(VhostShadowVirtqueue *svq)
+{
+    if (svq->last_used_idx != svq->shadow_used_idx) {
+        return true;
+    }
+
+    svq->shadow_used_idx = cpu_to_le16(svq->vring.used->idx);
+
+    return svq->last_used_idx != svq->shadow_used_idx;
+}
+
+static VirtQueueElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq)
+{
+    vring_desc_t *descs = svq->vring.desc;
+    const vring_used_t *used = svq->vring.used;
+    vring_used_elem_t used_elem;
+    uint16_t last_used;
+
+    if (!vhost_svq_more_used(svq)) {
+        return NULL;
+    }
+
+    /* Only get used array entries after they have been exposed by dev */
+    smp_rmb();
+    last_used = svq->last_used_idx & (svq->vring.num - 1);
+    used_elem.id = le32_to_cpu(used->ring[last_used].id);
+    used_elem.len = le32_to_cpu(used->ring[last_used].len);
+
+    svq->last_used_idx++;
+    if (unlikely(used_elem.id >= svq->vring.num)) {
+        error_report("Device %s says index %u is used", svq->vdev->name,
+                     used_elem.id);
+        return NULL;
+    }
+
+    if (unlikely(!svq->ring_id_maps[used_elem.id])) {
+        error_report(
+            "Device %s says index %u is used, but it was not available",
+            svq->vdev->name, used_elem.id);
+        return NULL;
+    }
+
+    descs[used_elem.id].next = svq->free_head;
+    svq->free_head = used_elem.id;
+
+    svq->ring_id_maps[used_elem.id]->len = used_elem.len;
+    return g_steal_pointer(&svq->ring_id_maps[used_elem.id]);
+}
+
+static void vhost_svq_flush(VhostShadowVirtqueue *svq,
+                            bool check_for_avail_queue)
+{
+    VirtQueue *vq = svq->vq;
+
+    /* Make as many buffers as possible used. */
+    do {
+        unsigned i = 0;
+
+        vhost_svq_set_notification(svq, false);
+        while (true) {
+            g_autofree VirtQueueElement *elem = vhost_svq_get_buf(svq);
+            if (!elem) {
+                break;
+            }
+
+            if (unlikely(i >= svq->vring.num)) {
+                virtio_error(svq->vdev,
+                         "More than %u used buffers obtained in a %u size SVQ",
+                         i, svq->vring.num);
+                virtqueue_fill(vq, elem, elem->len, i);
+                virtqueue_flush(vq, i);
+                i = 0;
+            }
+            virtqueue_fill(vq, elem, elem->len, i++);
+        }
+
+        virtqueue_flush(vq, i);
+        event_notifier_set(&svq->svq_call);
+
+        if (check_for_avail_queue && svq->next_guest_avail_elem) {
+            /*
+             * Avail ring was full when vhost_svq_flush was called, so it's a
+             * good moment to make more descriptors available if possible
+             */
+            vhost_handle_guest_kick(svq);
+        }
+
+        vhost_svq_set_notification(svq, true);
+    } while (vhost_svq_more_used(svq));
+}
+
+/**
+ * Forward used buffers.
+ *
+ * @n hdev call event notifier, the one that device set to notify svq.
+ *
+ * Note that we are not making any buffers available in the loop, there is no
+ * way that it runs more than virtqueue size times.
+ */
 static void vhost_svq_handle_call(EventNotifier *n)
 {
     VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue,
                                              hdev_call);
 
-    if (unlikely(!event_notifier_test_and_clear(n))) {
-        return;
-    }
+    /* Clear event notifier */
+    event_notifier_test_and_clear(n);
 
-    event_notifier_set(&svq->svq_call);
+    vhost_svq_flush(svq, true);
 }
 
 /**
@@ -258,13 +551,38 @@ void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd)
      * need to explicitely check for them.
      */
     event_notifier_init_fd(&svq->svq_kick, svq_kick_fd);
-    event_notifier_set_handler(&svq->svq_kick, vhost_handle_guest_kick);
+    event_notifier_set_handler(&svq->svq_kick,
+                               vhost_handle_guest_kick_notifier);
 
     if (!check_old || event_notifier_test_and_clear(&tmp)) {
         event_notifier_set(&svq->hdev_kick);
     }
 }
 
+/**
+ * Start shadow virtqueue operation.
+ *
+ * @svq Shadow Virtqueue
+ * @vdev        VirtIO device
+ * @vq          Virtqueue to shadow
+ */
+void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev,
+                     VirtQueue *vq)
+{
+    svq->next_guest_avail_elem = NULL;
+    svq->avail_idx_shadow = 0;
+    svq->shadow_used_idx = 0;
+    svq->last_used_idx = 0;
+    svq->vdev = vdev;
+    svq->vq = vq;
+
+    memset(svq->vring.avail, 0, sizeof(*svq->vring.avail));
+    memset(svq->vring.used, 0, sizeof(*svq->vring.avail));
+    for (unsigned i = 0; i < svq->vring.num - 1; i++) {
+        svq->vring.desc[i].next = cpu_to_le16(i + 1);
+    }
+}
+
 /**
  * Stop shadow virtqueue operation.
  * @svq Shadow Virtqueue
@@ -272,6 +590,28 @@ void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd)
 void vhost_svq_stop(VhostShadowVirtqueue *svq)
 {
     event_notifier_set_handler(&svq->svq_kick, NULL);
+    g_autofree VirtQueueElement *next_avail_elem = NULL;
+
+    if (!svq->vq) {
+        return;
+    }
+
+    /* Send all pending used descriptors to guest */
+    vhost_svq_flush(svq, false);
+
+    for (unsigned i = 0; i < svq->vring.num; ++i) {
+        g_autofree VirtQueueElement *elem = NULL;
+        elem = g_steal_pointer(&svq->ring_id_maps[i]);
+        if (elem) {
+            virtqueue_detach_element(svq->vq, elem, elem->len);
+        }
+    }
+
+    next_avail_elem = g_steal_pointer(&svq->next_guest_avail_elem);
+    if (next_avail_elem) {
+        virtqueue_detach_element(svq->vq, next_avail_elem,
+                                 next_avail_elem->len);
+    }
 }
 
 /**
@@ -316,7 +656,7 @@ VhostShadowVirtqueue *vhost_svq_new(uint16_t qsize)
     memset(svq->vring.desc, 0, driver_size);
     svq->vring.used = qemu_memalign(qemu_real_host_page_size, device_size);
     memset(svq->vring.used, 0, device_size);
-
+    svq->ring_id_maps = g_new0(VirtQueueElement *, qsize);
     event_notifier_set_handler(&svq->hdev_call, vhost_svq_handle_call);
     return g_steal_pointer(&svq);
 
@@ -335,6 +675,7 @@ void vhost_svq_free(VhostShadowVirtqueue *vq)
     event_notifier_cleanup(&vq->hdev_kick);
     event_notifier_set_handler(&vq->hdev_call, NULL);
     event_notifier_cleanup(&vq->hdev_call);
+    g_free(vq->ring_id_maps);
     qemu_vfree(vq->vring.desc);
     qemu_vfree(vq->vring.used);
     g_free(vq);
diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index 53e14bafa0..0e5c00ed7e 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -752,9 +752,9 @@ static int vhost_vdpa_set_vring_call(struct vhost_dev *dev,
  * Note that this function does not rewind kick file descriptor if cannot set
  * call one.
  */
-static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
-                                VhostShadowVirtqueue *svq,
-                                unsigned idx)
+static int vhost_vdpa_svq_set_fds(struct vhost_dev *dev,
+                                  VhostShadowVirtqueue *svq,
+                                  unsigned idx)
 {
     struct vhost_vring_file file = {
         .index = dev->vq_index + idx,
@@ -767,7 +767,7 @@ static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
     r = vhost_vdpa_set_vring_dev_kick(dev, &file);
     if (unlikely(r != 0)) {
         error_report("Can't set device kick fd (%d)", -r);
-        return false;
+        return r;
     }
 
     event_notifier = vhost_svq_get_svq_call_notifier(svq);
@@ -777,6 +777,99 @@ static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
         error_report("Can't set device call fd (%d)", -r);
     }
 
+    return r;
+}
+
+/**
+ * Unmap SVQ area in the device
+ */
+static bool vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v, hwaddr iova,
+                                      hwaddr size)
+{
+    int r;
+
+    size = ROUND_UP(size, qemu_real_host_page_size);
+    r = vhost_vdpa_dma_unmap(v, iova, size);
+    return r == 0;
+}
+
+static bool vhost_vdpa_svq_unmap_rings(struct vhost_dev *dev,
+                                       const VhostShadowVirtqueue *svq)
+{
+    struct vhost_vdpa *v = dev->opaque;
+    struct vhost_vring_addr svq_addr;
+    size_t device_size = vhost_svq_device_area_size(svq);
+    size_t driver_size = vhost_svq_driver_area_size(svq);
+    bool ok;
+
+    vhost_svq_get_vring_addr(svq, &svq_addr);
+
+    ok = vhost_vdpa_svq_unmap_ring(v, svq_addr.desc_user_addr, driver_size);
+    if (unlikely(!ok)) {
+        return false;
+    }
+
+    return vhost_vdpa_svq_unmap_ring(v, svq_addr.used_user_addr, device_size);
+}
+
+/**
+ * Map shadow virtqueue rings in device
+ *
+ * @dev   The vhost device
+ * @svq   The shadow virtqueue
+ */
+static bool vhost_vdpa_svq_map_rings(struct vhost_dev *dev,
+                                     const VhostShadowVirtqueue *svq)
+{
+    struct vhost_vdpa *v = dev->opaque;
+    struct vhost_vring_addr svq_addr;
+    size_t device_size = vhost_svq_device_area_size(svq);
+    size_t driver_size = vhost_svq_driver_area_size(svq);
+    int r;
+
+    vhost_svq_get_vring_addr(svq, &svq_addr);
+
+    r = vhost_vdpa_dma_map(v, svq_addr.desc_user_addr, driver_size,
+                           (void *)svq_addr.desc_user_addr, true);
+    if (unlikely(r != 0)) {
+        return false;
+    }
+
+    r = vhost_vdpa_dma_map(v, svq_addr.used_user_addr, device_size,
+                           (void *)svq_addr.used_user_addr, false);
+    return r == 0;
+}
+
+static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
+                                VhostShadowVirtqueue *svq,
+                                unsigned idx)
+{
+    uint16_t vq_index = dev->vq_index + idx;
+    struct vhost_vring_state s = {
+        .index = vq_index,
+    };
+    int r;
+    bool ok;
+
+    r = vhost_vdpa_set_dev_vring_base(dev, &s);
+    if (unlikely(r)) {
+        error_report("Can't set vring base (%d)", r);
+        return false;
+    }
+
+    s.num = vhost_svq_get_num(svq);
+    r = vhost_vdpa_set_dev_vring_num(dev, &s);
+    if (unlikely(r)) {
+        error_report("Can't set vring num (%d)", r);
+        return false;
+    }
+
+    ok = vhost_vdpa_svq_map_rings(dev, svq);
+    if (unlikely(!ok)) {
+        return false;
+    }
+
+    r = vhost_vdpa_svq_set_fds(dev, svq, idx);
     return r == 0;
 }
 
@@ -788,14 +881,24 @@ static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started)
     if (started) {
         vhost_vdpa_host_notifiers_init(dev);
         for (unsigned i = 0; i < v->shadow_vqs->len; ++i) {
+            VirtQueue *vq = virtio_get_queue(dev->vdev, dev->vq_index + i);
             VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
             bool ok = vhost_vdpa_svq_setup(dev, svq, i);
             if (unlikely(!ok)) {
                 return -1;
             }
+            vhost_svq_start(svq, dev->vdev, vq);
         }
         vhost_vdpa_set_vring_ready(dev);
     } else {
+        for (unsigned i = 0; i < v->shadow_vqs->len; ++i) {
+            VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs,
+                                                          i);
+            bool ok = vhost_vdpa_svq_unmap_rings(dev, svq);
+            if (unlikely(!ok)) {
+                return -1;
+            }
+        }
         vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs);
     }
 
-- 
2.27.0

Re: [PATCH 18/31] vhost: Shadow virtqueue buffers forwarding

Posted by Jason Wang 4 years ago

在 2022/1/22 上午4:27, Eugenio Pérez 写道:
> Initial version of shadow virtqueue that actually forward buffers. There
> is no iommu support at the moment, and that will be addressed in future
> patches of this series. Since all vhost-vdpa devices use forced IOMMU,
> this means that SVQ is not usable at this point of the series on any
> device.
>
> For simplicity it only supports modern devices, that expects vring
> in little endian, with split ring and no event idx or indirect
> descriptors. Support for them will not be added in this series.
>
> It reuses the VirtQueue code for the device part. The driver part is
> based on Linux's virtio_ring driver, but with stripped functionality
> and optimizations so it's easier to review.
>
> However, forwarding buffers have some particular pieces: One of the most
> unexpected ones is that a guest's buffer can expand through more than
> one descriptor in SVQ. While this is handled gracefully by qemu's
> emulated virtio devices, it may cause unexpected SVQ queue full. This
> patch also solves it by checking for this condition at both guest's
> kicks and device's calls. The code may be more elegant in the future if
> SVQ code runs in its own iocontext.
>
> Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
> ---
>   hw/virtio/vhost-shadow-virtqueue.h |   2 +
>   hw/virtio/vhost-shadow-virtqueue.c | 365 ++++++++++++++++++++++++++++-
>   hw/virtio/vhost-vdpa.c             | 111 ++++++++-
>   3 files changed, 462 insertions(+), 16 deletions(-)
>
> diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
> index 39aef5ffdf..19c934af49 100644
> --- a/hw/virtio/vhost-shadow-virtqueue.h
> +++ b/hw/virtio/vhost-shadow-virtqueue.h
> @@ -33,6 +33,8 @@ uint16_t vhost_svq_get_num(const VhostShadowVirtqueue *svq);
>   size_t vhost_svq_driver_area_size(const VhostShadowVirtqueue *svq);
>   size_t vhost_svq_device_area_size(const VhostShadowVirtqueue *svq);
>   
> +void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev,
> +                     VirtQueue *vq);
>   void vhost_svq_stop(VhostShadowVirtqueue *svq);
>   
>   VhostShadowVirtqueue *vhost_svq_new(uint16_t qsize);
> diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
> index 7c168075d7..a1a404f68f 100644
> --- a/hw/virtio/vhost-shadow-virtqueue.c
> +++ b/hw/virtio/vhost-shadow-virtqueue.c
> @@ -9,6 +9,8 @@
>   
>   #include "qemu/osdep.h"
>   #include "hw/virtio/vhost-shadow-virtqueue.h"
> +#include "hw/virtio/vhost.h"
> +#include "hw/virtio/virtio-access.h"
>   #include "standard-headers/linux/vhost_types.h"
>   
>   #include "qemu/error-report.h"
> @@ -36,6 +38,33 @@ typedef struct VhostShadowVirtqueue {
>   
>       /* Guest's call notifier, where SVQ calls guest. */
>       EventNotifier svq_call;
> +
> +    /* Virtio queue shadowing */
> +    VirtQueue *vq;
> +
> +    /* Virtio device */
> +    VirtIODevice *vdev;
> +
> +    /* Map for returning guest's descriptors */
> +    VirtQueueElement **ring_id_maps;
> +
> +    /* Next VirtQueue element that guest made available */
> +    VirtQueueElement *next_guest_avail_elem;
> +
> +    /* Next head to expose to device */
> +    uint16_t avail_idx_shadow;
> +
> +    /* Next free descriptor */
> +    uint16_t free_head;
> +
> +    /* Last seen used idx */
> +    uint16_t shadow_used_idx;
> +
> +    /* Next head to consume from device */
> +    uint16_t last_used_idx;
> +
> +    /* Cache for the exposed notification flag */
> +    bool notification;
>   } VhostShadowVirtqueue;
>   
>   #define INVALID_SVQ_KICK_FD -1
> @@ -148,30 +177,294 @@ bool vhost_svq_ack_guest_features(uint64_t dev_features,
>       return true;
>   }
>   
> -/* Forward guest notifications */
> -static void vhost_handle_guest_kick(EventNotifier *n)
> +/**
> + * Number of descriptors that SVQ can make available from the guest.
> + *
> + * @svq   The svq
> + */
> +static uint16_t vhost_svq_available_slots(const VhostShadowVirtqueue *svq)
>   {
> -    VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue,
> -                                             svq_kick);
> +    return svq->vring.num - (svq->avail_idx_shadow - svq->shadow_used_idx);
> +}
> +
> +static void vhost_svq_set_notification(VhostShadowVirtqueue *svq, bool enable)
> +{
> +    uint16_t notification_flag;
>   
> -    if (unlikely(!event_notifier_test_and_clear(n))) {
> +    if (svq->notification == enable) {
> +        return;
> +    }
> +
> +    notification_flag = cpu_to_le16(VRING_AVAIL_F_NO_INTERRUPT);
> +
> +    svq->notification = enable;
> +    if (enable) {
> +        svq->vring.avail->flags &= ~notification_flag;
> +    } else {
> +        svq->vring.avail->flags |= notification_flag;
> +    }
> +}
> +
> +static void vhost_vring_write_descs(VhostShadowVirtqueue *svq,
> +                                    const struct iovec *iovec,
> +                                    size_t num, bool more_descs, bool write)
> +{
> +    uint16_t i = svq->free_head, last = svq->free_head;
> +    unsigned n;
> +    uint16_t flags = write ? cpu_to_le16(VRING_DESC_F_WRITE) : 0;
> +    vring_desc_t *descs = svq->vring.desc;
> +
> +    if (num == 0) {
> +        return;
> +    }
> +
> +    for (n = 0; n < num; n++) {
> +        if (more_descs || (n + 1 < num)) {
> +            descs[i].flags = flags | cpu_to_le16(VRING_DESC_F_NEXT);
> +        } else {
> +            descs[i].flags = flags;
> +        }
> +        descs[i].addr = cpu_to_le64((hwaddr)iovec[n].iov_base);
> +        descs[i].len = cpu_to_le32(iovec[n].iov_len);
> +
> +        last = i;
> +        i = cpu_to_le16(descs[i].next);
> +    }
> +
> +    svq->free_head = le16_to_cpu(descs[last].next);
> +}
> +
> +static unsigned vhost_svq_add_split(VhostShadowVirtqueue *svq,
> +                                    VirtQueueElement *elem)
> +{
> +    int head;
> +    unsigned avail_idx;
> +    vring_avail_t *avail = svq->vring.avail;
> +
> +    head = svq->free_head;
> +
> +    /* We need some descriptors here */
> +    assert(elem->out_num || elem->in_num);


Looks like this could be triggered by guest, we need fail instead assert 
here.


> +
> +    vhost_vring_write_descs(svq, elem->out_sg, elem->out_num,
> +                            elem->in_num > 0, false);
> +    vhost_vring_write_descs(svq, elem->in_sg, elem->in_num, false, true);
> +
> +    /*
> +     * Put entry in available array (but don't update avail->idx until they
> +     * do sync).
> +     */
> +    avail_idx = svq->avail_idx_shadow & (svq->vring.num - 1);
> +    avail->ring[avail_idx] = cpu_to_le16(head);
> +    svq->avail_idx_shadow++;
> +
> +    /* Update avail index after the descriptor is wrote */
> +    smp_wmb();
> +    avail->idx = cpu_to_le16(svq->avail_idx_shadow);
> +
> +    return head;
> +}
> +
> +static void vhost_svq_add(VhostShadowVirtqueue *svq, VirtQueueElement *elem)
> +{
> +    unsigned qemu_head = vhost_svq_add_split(svq, elem);
> +
> +    svq->ring_id_maps[qemu_head] = elem;
> +}
> +
> +static void vhost_svq_kick(VhostShadowVirtqueue *svq)
> +{
> +    /* We need to expose available array entries before checking used flags */
> +    smp_mb();
> +    if (svq->vring.used->flags & VRING_USED_F_NO_NOTIFY) {
>           return;
>       }
>   
>       event_notifier_set(&svq->hdev_kick);
>   }
>   
> -/* Forward vhost notifications */
> +/**
> + * Forward available buffers.
> + *
> + * @svq Shadow VirtQueue
> + *
> + * Note that this function does not guarantee that all guest's available
> + * buffers are available to the device in SVQ avail ring. The guest may have
> + * exposed a GPA / GIOVA congiuous buffer, but it may not be contiguous in qemu
> + * vaddr.
> + *
> + * If that happens, guest's kick notifications will be disabled until device
> + * makes some buffers used.
> + */
> +static void vhost_handle_guest_kick(VhostShadowVirtqueue *svq)
> +{
> +    /* Clear event notifier */
> +    event_notifier_test_and_clear(&svq->svq_kick);
> +
> +    /* Make available as many buffers as possible */
> +    do {
> +        if (virtio_queue_get_notification(svq->vq)) {
> +            virtio_queue_set_notification(svq->vq, false);


This looks like an optimization the should belong to 
virtio_queue_set_notification() itself.


> +        }
> +
> +        while (true) {
> +            VirtQueueElement *elem;
> +
> +            if (svq->next_guest_avail_elem) {
> +                elem = g_steal_pointer(&svq->next_guest_avail_elem);
> +            } else {
> +                elem = virtqueue_pop(svq->vq, sizeof(*elem));
> +            }
> +
> +            if (!elem) {
> +                break;
> +            }
> +
> +            if (elem->out_num + elem->in_num >
> +                vhost_svq_available_slots(svq)) {
> +                /*
> +                 * This condition is possible since a contiguous buffer in GPA
> +                 * does not imply a contiguous buffer in qemu's VA
> +                 * scatter-gather segments. If that happen, the buffer exposed
> +                 * to the device needs to be a chain of descriptors at this
> +                 * moment.
> +                 *
> +                 * SVQ cannot hold more available buffers if we are here:
> +                 * queue the current guest descriptor and ignore further kicks
> +                 * until some elements are used.
> +                 */
> +                svq->next_guest_avail_elem = elem;
> +                return;
> +            }
> +
> +            vhost_svq_add(svq, elem);
> +            vhost_svq_kick(svq);
> +        }
> +
> +        virtio_queue_set_notification(svq->vq, true);
> +    } while (!virtio_queue_empty(svq->vq));
> +}
> +
> +/**
> + * Handle guest's kick.
> + *
> + * @n guest kick event notifier, the one that guest set to notify svq.
> + */
> +static void vhost_handle_guest_kick_notifier(EventNotifier *n)
> +{
> +    VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue,
> +                                             svq_kick);
> +    vhost_handle_guest_kick(svq);
> +}
> +
> +static bool vhost_svq_more_used(VhostShadowVirtqueue *svq)
> +{
> +    if (svq->last_used_idx != svq->shadow_used_idx) {
> +        return true;
> +    }
> +
> +    svq->shadow_used_idx = cpu_to_le16(svq->vring.used->idx);
> +
> +    return svq->last_used_idx != svq->shadow_used_idx;
> +}
> +
> +static VirtQueueElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq)
> +{
> +    vring_desc_t *descs = svq->vring.desc;
> +    const vring_used_t *used = svq->vring.used;
> +    vring_used_elem_t used_elem;
> +    uint16_t last_used;
> +
> +    if (!vhost_svq_more_used(svq)) {
> +        return NULL;
> +    }
> +
> +    /* Only get used array entries after they have been exposed by dev */
> +    smp_rmb();
> +    last_used = svq->last_used_idx & (svq->vring.num - 1);
> +    used_elem.id = le32_to_cpu(used->ring[last_used].id);
> +    used_elem.len = le32_to_cpu(used->ring[last_used].len);
> +
> +    svq->last_used_idx++;
> +    if (unlikely(used_elem.id >= svq->vring.num)) {
> +        error_report("Device %s says index %u is used", svq->vdev->name,
> +                     used_elem.id);
> +        return NULL;
> +    }
> +
> +    if (unlikely(!svq->ring_id_maps[used_elem.id])) {
> +        error_report(
> +            "Device %s says index %u is used, but it was not available",
> +            svq->vdev->name, used_elem.id);
> +        return NULL;
> +    }
> +
> +    descs[used_elem.id].next = svq->free_head;
> +    svq->free_head = used_elem.id;
> +
> +    svq->ring_id_maps[used_elem.id]->len = used_elem.len;
> +    return g_steal_pointer(&svq->ring_id_maps[used_elem.id]);
> +}
> +
> +static void vhost_svq_flush(VhostShadowVirtqueue *svq,
> +                            bool check_for_avail_queue)
> +{
> +    VirtQueue *vq = svq->vq;
> +
> +    /* Make as many buffers as possible used. */
> +    do {
> +        unsigned i = 0;
> +
> +        vhost_svq_set_notification(svq, false);
> +        while (true) {
> +            g_autofree VirtQueueElement *elem = vhost_svq_get_buf(svq);
> +            if (!elem) {
> +                break;
> +            }
> +
> +            if (unlikely(i >= svq->vring.num)) {
> +                virtio_error(svq->vdev,
> +                         "More than %u used buffers obtained in a %u size SVQ",
> +                         i, svq->vring.num);
> +                virtqueue_fill(vq, elem, elem->len, i);
> +                virtqueue_flush(vq, i);


Let's simply use virtqueue_push() here?


> +                i = 0;


Do we need to bail out here?


> +            }
> +            virtqueue_fill(vq, elem, elem->len, i++);
> +        }
> +
> +        virtqueue_flush(vq, i);
> +        event_notifier_set(&svq->svq_call);
> +
> +        if (check_for_avail_queue && svq->next_guest_avail_elem) {
> +            /*
> +             * Avail ring was full when vhost_svq_flush was called, so it's a
> +             * good moment to make more descriptors available if possible
> +             */
> +            vhost_handle_guest_kick(svq);


Is there better to have a similar check as vhost_handle_guest_kick() did?

             if (elem->out_num + elem->in_num >
                 vhost_svq_available_slots(svq)) {


> +        }
> +
> +        vhost_svq_set_notification(svq, true);


A mb() is needed here? Otherwise we may lost a call here (where 
vhost_svq_more_used() is run before vhost_svq_set_notification()).


> +    } while (vhost_svq_more_used(svq));
> +}
> +
> +/**
> + * Forward used buffers.
> + *
> + * @n hdev call event notifier, the one that device set to notify svq.
> + *
> + * Note that we are not making any buffers available in the loop, there is no
> + * way that it runs more than virtqueue size times.
> + */
>   static void vhost_svq_handle_call(EventNotifier *n)
>   {
>       VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue,
>                                                hdev_call);
>   
> -    if (unlikely(!event_notifier_test_and_clear(n))) {
> -        return;
> -    }
> +    /* Clear event notifier */
> +    event_notifier_test_and_clear(n);


Any reason that we remove the above check?


>   
> -    event_notifier_set(&svq->svq_call);
> +    vhost_svq_flush(svq, true);
>   }
>   
>   /**
> @@ -258,13 +551,38 @@ void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd)
>        * need to explicitely check for them.
>        */
>       event_notifier_init_fd(&svq->svq_kick, svq_kick_fd);
> -    event_notifier_set_handler(&svq->svq_kick, vhost_handle_guest_kick);
> +    event_notifier_set_handler(&svq->svq_kick,
> +                               vhost_handle_guest_kick_notifier);
>   
>       if (!check_old || event_notifier_test_and_clear(&tmp)) {
>           event_notifier_set(&svq->hdev_kick);
>       }
>   }
>   
> +/**
> + * Start shadow virtqueue operation.
> + *
> + * @svq Shadow Virtqueue
> + * @vdev        VirtIO device
> + * @vq          Virtqueue to shadow
> + */
> +void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev,
> +                     VirtQueue *vq)
> +{
> +    svq->next_guest_avail_elem = NULL;
> +    svq->avail_idx_shadow = 0;
> +    svq->shadow_used_idx = 0;
> +    svq->last_used_idx = 0;
> +    svq->vdev = vdev;
> +    svq->vq = vq;
> +
> +    memset(svq->vring.avail, 0, sizeof(*svq->vring.avail));
> +    memset(svq->vring.used, 0, sizeof(*svq->vring.avail));
> +    for (unsigned i = 0; i < svq->vring.num - 1; i++) {
> +        svq->vring.desc[i].next = cpu_to_le16(i + 1);
> +    }
> +}
> +
>   /**
>    * Stop shadow virtqueue operation.
>    * @svq Shadow Virtqueue
> @@ -272,6 +590,28 @@ void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd)
>   void vhost_svq_stop(VhostShadowVirtqueue *svq)
>   {
>       event_notifier_set_handler(&svq->svq_kick, NULL);
> +    g_autofree VirtQueueElement *next_avail_elem = NULL;
> +
> +    if (!svq->vq) {
> +        return;
> +    }
> +
> +    /* Send all pending used descriptors to guest */
> +    vhost_svq_flush(svq, false);
> +
> +    for (unsigned i = 0; i < svq->vring.num; ++i) {
> +        g_autofree VirtQueueElement *elem = NULL;
> +        elem = g_steal_pointer(&svq->ring_id_maps[i]);
> +        if (elem) {
> +            virtqueue_detach_element(svq->vq, elem, elem->len);
> +        }
> +    }
> +
> +    next_avail_elem = g_steal_pointer(&svq->next_guest_avail_elem);
> +    if (next_avail_elem) {
> +        virtqueue_detach_element(svq->vq, next_avail_elem,
> +                                 next_avail_elem->len);
> +    }
>   }
>   
>   /**
> @@ -316,7 +656,7 @@ VhostShadowVirtqueue *vhost_svq_new(uint16_t qsize)
>       memset(svq->vring.desc, 0, driver_size);
>       svq->vring.used = qemu_memalign(qemu_real_host_page_size, device_size);
>       memset(svq->vring.used, 0, device_size);
> -
> +    svq->ring_id_maps = g_new0(VirtQueueElement *, qsize);
>       event_notifier_set_handler(&svq->hdev_call, vhost_svq_handle_call);
>       return g_steal_pointer(&svq);
>   
> @@ -335,6 +675,7 @@ void vhost_svq_free(VhostShadowVirtqueue *vq)
>       event_notifier_cleanup(&vq->hdev_kick);
>       event_notifier_set_handler(&vq->hdev_call, NULL);
>       event_notifier_cleanup(&vq->hdev_call);
> +    g_free(vq->ring_id_maps);
>       qemu_vfree(vq->vring.desc);
>       qemu_vfree(vq->vring.used);
>       g_free(vq);
> diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
> index 53e14bafa0..0e5c00ed7e 100644
> --- a/hw/virtio/vhost-vdpa.c
> +++ b/hw/virtio/vhost-vdpa.c
> @@ -752,9 +752,9 @@ static int vhost_vdpa_set_vring_call(struct vhost_dev *dev,
>    * Note that this function does not rewind kick file descriptor if cannot set
>    * call one.
>    */
> -static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
> -                                VhostShadowVirtqueue *svq,
> -                                unsigned idx)
> +static int vhost_vdpa_svq_set_fds(struct vhost_dev *dev,
> +                                  VhostShadowVirtqueue *svq,
> +                                  unsigned idx)
>   {
>       struct vhost_vring_file file = {
>           .index = dev->vq_index + idx,
> @@ -767,7 +767,7 @@ static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
>       r = vhost_vdpa_set_vring_dev_kick(dev, &file);
>       if (unlikely(r != 0)) {
>           error_report("Can't set device kick fd (%d)", -r);
> -        return false;
> +        return r;
>       }
>   
>       event_notifier = vhost_svq_get_svq_call_notifier(svq);
> @@ -777,6 +777,99 @@ static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
>           error_report("Can't set device call fd (%d)", -r);
>       }
>   
> +    return r;
> +}
> +
> +/**
> + * Unmap SVQ area in the device
> + */
> +static bool vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v, hwaddr iova,
> +                                      hwaddr size)
> +{
> +    int r;
> +
> +    size = ROUND_UP(size, qemu_real_host_page_size);
> +    r = vhost_vdpa_dma_unmap(v, iova, size);
> +    return r == 0;
> +}
> +
> +static bool vhost_vdpa_svq_unmap_rings(struct vhost_dev *dev,
> +                                       const VhostShadowVirtqueue *svq)
> +{
> +    struct vhost_vdpa *v = dev->opaque;
> +    struct vhost_vring_addr svq_addr;
> +    size_t device_size = vhost_svq_device_area_size(svq);
> +    size_t driver_size = vhost_svq_driver_area_size(svq);
> +    bool ok;
> +
> +    vhost_svq_get_vring_addr(svq, &svq_addr);
> +
> +    ok = vhost_vdpa_svq_unmap_ring(v, svq_addr.desc_user_addr, driver_size);
> +    if (unlikely(!ok)) {
> +        return false;
> +    }
> +
> +    return vhost_vdpa_svq_unmap_ring(v, svq_addr.used_user_addr, device_size);
> +}
> +
> +/**
> + * Map shadow virtqueue rings in device
> + *
> + * @dev   The vhost device
> + * @svq   The shadow virtqueue
> + */
> +static bool vhost_vdpa_svq_map_rings(struct vhost_dev *dev,
> +                                     const VhostShadowVirtqueue *svq)
> +{
> +    struct vhost_vdpa *v = dev->opaque;
> +    struct vhost_vring_addr svq_addr;
> +    size_t device_size = vhost_svq_device_area_size(svq);
> +    size_t driver_size = vhost_svq_driver_area_size(svq);
> +    int r;
> +
> +    vhost_svq_get_vring_addr(svq, &svq_addr);
> +
> +    r = vhost_vdpa_dma_map(v, svq_addr.desc_user_addr, driver_size,
> +                           (void *)svq_addr.desc_user_addr, true);
> +    if (unlikely(r != 0)) {
> +        return false;
> +    }
> +
> +    r = vhost_vdpa_dma_map(v, svq_addr.used_user_addr, device_size,
> +                           (void *)svq_addr.used_user_addr, false);


Do we need unmap the driver area if we fail here?

Thanks


> +    return r == 0;
> +}
> +
> +static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
> +                                VhostShadowVirtqueue *svq,
> +                                unsigned idx)
> +{
> +    uint16_t vq_index = dev->vq_index + idx;
> +    struct vhost_vring_state s = {
> +        .index = vq_index,
> +    };
> +    int r;
> +    bool ok;
> +
> +    r = vhost_vdpa_set_dev_vring_base(dev, &s);
> +    if (unlikely(r)) {
> +        error_report("Can't set vring base (%d)", r);
> +        return false;
> +    }
> +
> +    s.num = vhost_svq_get_num(svq);
> +    r = vhost_vdpa_set_dev_vring_num(dev, &s);
> +    if (unlikely(r)) {
> +        error_report("Can't set vring num (%d)", r);
> +        return false;
> +    }
> +
> +    ok = vhost_vdpa_svq_map_rings(dev, svq);
> +    if (unlikely(!ok)) {
> +        return false;
> +    }
> +
> +    r = vhost_vdpa_svq_set_fds(dev, svq, idx);
>       return r == 0;
>   }
>   
> @@ -788,14 +881,24 @@ static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started)
>       if (started) {
>           vhost_vdpa_host_notifiers_init(dev);
>           for (unsigned i = 0; i < v->shadow_vqs->len; ++i) {
> +            VirtQueue *vq = virtio_get_queue(dev->vdev, dev->vq_index + i);
>               VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
>               bool ok = vhost_vdpa_svq_setup(dev, svq, i);
>               if (unlikely(!ok)) {
>                   return -1;
>               }
> +            vhost_svq_start(svq, dev->vdev, vq);
>           }
>           vhost_vdpa_set_vring_ready(dev);
>       } else {
> +        for (unsigned i = 0; i < v->shadow_vqs->len; ++i) {
> +            VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs,
> +                                                          i);
> +            bool ok = vhost_vdpa_svq_unmap_rings(dev, svq);
> +            if (unlikely(!ok)) {
> +                return -1;
> +            }
> +        }
>           vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs);
>       }
>

Re: [PATCH 18/31] vhost: Shadow virtqueue buffers forwarding

Posted by Eugenio Perez Martin 4 years ago

On Sun, Jan 30, 2022 at 5:43 AM Jason Wang <jasowang@redhat.com> wrote:
>
>
> 在 2022/1/22 上午4:27, Eugenio Pérez 写道:
> > Initial version of shadow virtqueue that actually forward buffers. There
> > is no iommu support at the moment, and that will be addressed in future
> > patches of this series. Since all vhost-vdpa devices use forced IOMMU,
> > this means that SVQ is not usable at this point of the series on any
> > device.
> >
> > For simplicity it only supports modern devices, that expects vring
> > in little endian, with split ring and no event idx or indirect
> > descriptors. Support for them will not be added in this series.
> >
> > It reuses the VirtQueue code for the device part. The driver part is
> > based on Linux's virtio_ring driver, but with stripped functionality
> > and optimizations so it's easier to review.
> >
> > However, forwarding buffers have some particular pieces: One of the most
> > unexpected ones is that a guest's buffer can expand through more than
> > one descriptor in SVQ. While this is handled gracefully by qemu's
> > emulated virtio devices, it may cause unexpected SVQ queue full. This
> > patch also solves it by checking for this condition at both guest's
> > kicks and device's calls. The code may be more elegant in the future if
> > SVQ code runs in its own iocontext.
> >
> > Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
> > ---
> >   hw/virtio/vhost-shadow-virtqueue.h |   2 +
> >   hw/virtio/vhost-shadow-virtqueue.c | 365 ++++++++++++++++++++++++++++-
> >   hw/virtio/vhost-vdpa.c             | 111 ++++++++-
> >   3 files changed, 462 insertions(+), 16 deletions(-)
> >
> > diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
> > index 39aef5ffdf..19c934af49 100644
> > --- a/hw/virtio/vhost-shadow-virtqueue.h
> > +++ b/hw/virtio/vhost-shadow-virtqueue.h
> > @@ -33,6 +33,8 @@ uint16_t vhost_svq_get_num(const VhostShadowVirtqueue *svq);
> >   size_t vhost_svq_driver_area_size(const VhostShadowVirtqueue *svq);
> >   size_t vhost_svq_device_area_size(const VhostShadowVirtqueue *svq);
> >
> > +void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev,
> > +                     VirtQueue *vq);
> >   void vhost_svq_stop(VhostShadowVirtqueue *svq);
> >
> >   VhostShadowVirtqueue *vhost_svq_new(uint16_t qsize);
> > diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
> > index 7c168075d7..a1a404f68f 100644
> > --- a/hw/virtio/vhost-shadow-virtqueue.c
> > +++ b/hw/virtio/vhost-shadow-virtqueue.c
> > @@ -9,6 +9,8 @@
> >
> >   #include "qemu/osdep.h"
> >   #include "hw/virtio/vhost-shadow-virtqueue.h"
> > +#include "hw/virtio/vhost.h"
> > +#include "hw/virtio/virtio-access.h"
> >   #include "standard-headers/linux/vhost_types.h"
> >
> >   #include "qemu/error-report.h"
> > @@ -36,6 +38,33 @@ typedef struct VhostShadowVirtqueue {
> >
> >       /* Guest's call notifier, where SVQ calls guest. */
> >       EventNotifier svq_call;
> > +
> > +    /* Virtio queue shadowing */
> > +    VirtQueue *vq;
> > +
> > +    /* Virtio device */
> > +    VirtIODevice *vdev;
> > +
> > +    /* Map for returning guest's descriptors */
> > +    VirtQueueElement **ring_id_maps;
> > +
> > +    /* Next VirtQueue element that guest made available */
> > +    VirtQueueElement *next_guest_avail_elem;
> > +
> > +    /* Next head to expose to device */
> > +    uint16_t avail_idx_shadow;
> > +
> > +    /* Next free descriptor */
> > +    uint16_t free_head;
> > +
> > +    /* Last seen used idx */
> > +    uint16_t shadow_used_idx;
> > +
> > +    /* Next head to consume from device */
> > +    uint16_t last_used_idx;
> > +
> > +    /* Cache for the exposed notification flag */
> > +    bool notification;
> >   } VhostShadowVirtqueue;
> >
> >   #define INVALID_SVQ_KICK_FD -1
> > @@ -148,30 +177,294 @@ bool vhost_svq_ack_guest_features(uint64_t dev_features,
> >       return true;
> >   }
> >
> > -/* Forward guest notifications */
> > -static void vhost_handle_guest_kick(EventNotifier *n)
> > +/**
> > + * Number of descriptors that SVQ can make available from the guest.
> > + *
> > + * @svq   The svq
> > + */
> > +static uint16_t vhost_svq_available_slots(const VhostShadowVirtqueue *svq)
> >   {
> > -    VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue,
> > -                                             svq_kick);
> > +    return svq->vring.num - (svq->avail_idx_shadow - svq->shadow_used_idx);
> > +}
> > +
> > +static void vhost_svq_set_notification(VhostShadowVirtqueue *svq, bool enable)
> > +{
> > +    uint16_t notification_flag;
> >
> > -    if (unlikely(!event_notifier_test_and_clear(n))) {
> > +    if (svq->notification == enable) {
> > +        return;
> > +    }
> > +
> > +    notification_flag = cpu_to_le16(VRING_AVAIL_F_NO_INTERRUPT);
> > +
> > +    svq->notification = enable;
> > +    if (enable) {
> > +        svq->vring.avail->flags &= ~notification_flag;
> > +    } else {
> > +        svq->vring.avail->flags |= notification_flag;
> > +    }
> > +}
> > +
> > +static void vhost_vring_write_descs(VhostShadowVirtqueue *svq,
> > +                                    const struct iovec *iovec,
> > +                                    size_t num, bool more_descs, bool write)
> > +{
> > +    uint16_t i = svq->free_head, last = svq->free_head;
> > +    unsigned n;
> > +    uint16_t flags = write ? cpu_to_le16(VRING_DESC_F_WRITE) : 0;
> > +    vring_desc_t *descs = svq->vring.desc;
> > +
> > +    if (num == 0) {
> > +        return;
> > +    }
> > +
> > +    for (n = 0; n < num; n++) {
> > +        if (more_descs || (n + 1 < num)) {
> > +            descs[i].flags = flags | cpu_to_le16(VRING_DESC_F_NEXT);
> > +        } else {
> > +            descs[i].flags = flags;
> > +        }
> > +        descs[i].addr = cpu_to_le64((hwaddr)iovec[n].iov_base);
> > +        descs[i].len = cpu_to_le32(iovec[n].iov_len);
> > +
> > +        last = i;
> > +        i = cpu_to_le16(descs[i].next);
> > +    }
> > +
> > +    svq->free_head = le16_to_cpu(descs[last].next);
> > +}
> > +
> > +static unsigned vhost_svq_add_split(VhostShadowVirtqueue *svq,
> > +                                    VirtQueueElement *elem)
> > +{
> > +    int head;
> > +    unsigned avail_idx;
> > +    vring_avail_t *avail = svq->vring.avail;
> > +
> > +    head = svq->free_head;
> > +
> > +    /* We need some descriptors here */
> > +    assert(elem->out_num || elem->in_num);
>
>
> Looks like this could be triggered by guest, we need fail instead assert
> here.
>

My understanding was that virtqueue_pop already sanitized that case,
but I'm not able to find where now. I will recheck and, in case it's
not, I will move to a failure.

>
> > +
> > +    vhost_vring_write_descs(svq, elem->out_sg, elem->out_num,
> > +                            elem->in_num > 0, false);
> > +    vhost_vring_write_descs(svq, elem->in_sg, elem->in_num, false, true);
> > +
> > +    /*
> > +     * Put entry in available array (but don't update avail->idx until they
> > +     * do sync).
> > +     */
> > +    avail_idx = svq->avail_idx_shadow & (svq->vring.num - 1);
> > +    avail->ring[avail_idx] = cpu_to_le16(head);
> > +    svq->avail_idx_shadow++;
> > +
> > +    /* Update avail index after the descriptor is wrote */
> > +    smp_wmb();
> > +    avail->idx = cpu_to_le16(svq->avail_idx_shadow);
> > +
> > +    return head;
> > +}
> > +
> > +static void vhost_svq_add(VhostShadowVirtqueue *svq, VirtQueueElement *elem)
> > +{
> > +    unsigned qemu_head = vhost_svq_add_split(svq, elem);
> > +
> > +    svq->ring_id_maps[qemu_head] = elem;
> > +}
> > +
> > +static void vhost_svq_kick(VhostShadowVirtqueue *svq)
> > +{
> > +    /* We need to expose available array entries before checking used flags */
> > +    smp_mb();
> > +    if (svq->vring.used->flags & VRING_USED_F_NO_NOTIFY) {
> >           return;
> >       }
> >
> >       event_notifier_set(&svq->hdev_kick);
> >   }
> >
> > -/* Forward vhost notifications */
> > +/**
> > + * Forward available buffers.
> > + *
> > + * @svq Shadow VirtQueue
> > + *
> > + * Note that this function does not guarantee that all guest's available
> > + * buffers are available to the device in SVQ avail ring. The guest may have
> > + * exposed a GPA / GIOVA congiuous buffer, but it may not be contiguous in qemu
> > + * vaddr.
> > + *
> > + * If that happens, guest's kick notifications will be disabled until device
> > + * makes some buffers used.
> > + */
> > +static void vhost_handle_guest_kick(VhostShadowVirtqueue *svq)
> > +{
> > +    /* Clear event notifier */
> > +    event_notifier_test_and_clear(&svq->svq_kick);
> > +
> > +    /* Make available as many buffers as possible */
> > +    do {
> > +        if (virtio_queue_get_notification(svq->vq)) {
> > +            virtio_queue_set_notification(svq->vq, false);
>
>
> This looks like an optimization the should belong to
> virtio_queue_set_notification() itself.
>

Sure we can move.

>
> > +        }
> > +
> > +        while (true) {
> > +            VirtQueueElement *elem;
> > +
> > +            if (svq->next_guest_avail_elem) {
> > +                elem = g_steal_pointer(&svq->next_guest_avail_elem);
> > +            } else {
> > +                elem = virtqueue_pop(svq->vq, sizeof(*elem));
> > +            }
> > +
> > +            if (!elem) {
> > +                break;
> > +            }
> > +
> > +            if (elem->out_num + elem->in_num >
> > +                vhost_svq_available_slots(svq)) {
> > +                /*
> > +                 * This condition is possible since a contiguous buffer in GPA
> > +                 * does not imply a contiguous buffer in qemu's VA
> > +                 * scatter-gather segments. If that happen, the buffer exposed
> > +                 * to the device needs to be a chain of descriptors at this
> > +                 * moment.
> > +                 *
> > +                 * SVQ cannot hold more available buffers if we are here:
> > +                 * queue the current guest descriptor and ignore further kicks
> > +                 * until some elements are used.
> > +                 */
> > +                svq->next_guest_avail_elem = elem;
> > +                return;
> > +            }
> > +
> > +            vhost_svq_add(svq, elem);
> > +            vhost_svq_kick(svq);
> > +        }
> > +
> > +        virtio_queue_set_notification(svq->vq, true);
> > +    } while (!virtio_queue_empty(svq->vq));
> > +}
> > +
> > +/**
> > + * Handle guest's kick.
> > + *
> > + * @n guest kick event notifier, the one that guest set to notify svq.
> > + */
> > +static void vhost_handle_guest_kick_notifier(EventNotifier *n)
> > +{
> > +    VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue,
> > +                                             svq_kick);
> > +    vhost_handle_guest_kick(svq);
> > +}
> > +
> > +static bool vhost_svq_more_used(VhostShadowVirtqueue *svq)
> > +{
> > +    if (svq->last_used_idx != svq->shadow_used_idx) {
> > +        return true;
> > +    }
> > +
> > +    svq->shadow_used_idx = cpu_to_le16(svq->vring.used->idx);
> > +
> > +    return svq->last_used_idx != svq->shadow_used_idx;
> > +}
> > +
> > +static VirtQueueElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq)
> > +{
> > +    vring_desc_t *descs = svq->vring.desc;
> > +    const vring_used_t *used = svq->vring.used;
> > +    vring_used_elem_t used_elem;
> > +    uint16_t last_used;
> > +
> > +    if (!vhost_svq_more_used(svq)) {
> > +        return NULL;
> > +    }
> > +
> > +    /* Only get used array entries after they have been exposed by dev */
> > +    smp_rmb();
> > +    last_used = svq->last_used_idx & (svq->vring.num - 1);
> > +    used_elem.id = le32_to_cpu(used->ring[last_used].id);
> > +    used_elem.len = le32_to_cpu(used->ring[last_used].len);
> > +
> > +    svq->last_used_idx++;
> > +    if (unlikely(used_elem.id >= svq->vring.num)) {
> > +        error_report("Device %s says index %u is used", svq->vdev->name,
> > +                     used_elem.id);
> > +        return NULL;
> > +    }
> > +
> > +    if (unlikely(!svq->ring_id_maps[used_elem.id])) {
> > +        error_report(
> > +            "Device %s says index %u is used, but it was not available",
> > +            svq->vdev->name, used_elem.id);
> > +        return NULL;
> > +    }
> > +
> > +    descs[used_elem.id].next = svq->free_head;
> > +    svq->free_head = used_elem.id;
> > +
> > +    svq->ring_id_maps[used_elem.id]->len = used_elem.len;
> > +    return g_steal_pointer(&svq->ring_id_maps[used_elem.id]);
> > +}
> > +
> > +static void vhost_svq_flush(VhostShadowVirtqueue *svq,
> > +                            bool check_for_avail_queue)
> > +{
> > +    VirtQueue *vq = svq->vq;
> > +
> > +    /* Make as many buffers as possible used. */
> > +    do {
> > +        unsigned i = 0;
> > +
> > +        vhost_svq_set_notification(svq, false);
> > +        while (true) {
> > +            g_autofree VirtQueueElement *elem = vhost_svq_get_buf(svq);
> > +            if (!elem) {
> > +                break;
> > +            }
> > +
> > +            if (unlikely(i >= svq->vring.num)) {
> > +                virtio_error(svq->vdev,
> > +                         "More than %u used buffers obtained in a %u size SVQ",
> > +                         i, svq->vring.num);
> > +                virtqueue_fill(vq, elem, elem->len, i);
> > +                virtqueue_flush(vq, i);
>
>
> Let's simply use virtqueue_push() here?
>

virtqueue_push support to fill and flush only one element, instead of
batch. I'm fine with either but I think the less updates to the used
idx, the better.

>
> > +                i = 0;
>
>
> Do we need to bail out here?
>

Yes I guess we can simply return.

>
> > +            }
> > +            virtqueue_fill(vq, elem, elem->len, i++);
> > +        }
> > +
> > +        virtqueue_flush(vq, i);
> > +        event_notifier_set(&svq->svq_call);
> > +
> > +        if (check_for_avail_queue && svq->next_guest_avail_elem) {
> > +            /*
> > +             * Avail ring was full when vhost_svq_flush was called, so it's a
> > +             * good moment to make more descriptors available if possible
> > +             */
> > +            vhost_handle_guest_kick(svq);
>
>
> Is there better to have a similar check as vhost_handle_guest_kick() did?
>
>              if (elem->out_num + elem->in_num >
>                  vhost_svq_available_slots(svq)) {
>

It will be duplicated when we call vhost_handle_guest_kick, won't it?

>
> > +        }
> > +
> > +        vhost_svq_set_notification(svq, true);
>
>
> A mb() is needed here? Otherwise we may lost a call here (where
> vhost_svq_more_used() is run before vhost_svq_set_notification()).
>

I'm confused here then, I thought you said this is just a hint so
there was no need? [1]. I think the memory barrier is needed too.

>
> > +    } while (vhost_svq_more_used(svq));
> > +}
> > +
> > +/**
> > + * Forward used buffers.
> > + *
> > + * @n hdev call event notifier, the one that device set to notify svq.
> > + *
> > + * Note that we are not making any buffers available in the loop, there is no
> > + * way that it runs more than virtqueue size times.
> > + */
> >   static void vhost_svq_handle_call(EventNotifier *n)
> >   {
> >       VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue,
> >                                                hdev_call);
> >
> > -    if (unlikely(!event_notifier_test_and_clear(n))) {
> > -        return;
> > -    }
> > +    /* Clear event notifier */
> > +    event_notifier_test_and_clear(n);
>
>
> Any reason that we remove the above check?
>

This comes from the previous versions, where this made sure we missed
no used buffers in the process of switching to SVQ mode.

If we enable SVQ from the beginning I think we can rely on getting all
the device's used buffer notifications, so let me think a little bit
and I can move to check the eventfd.

>
> >
> > -    event_notifier_set(&svq->svq_call);
> > +    vhost_svq_flush(svq, true);
> >   }
> >
> >   /**
> > @@ -258,13 +551,38 @@ void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd)
> >        * need to explicitely check for them.
> >        */
> >       event_notifier_init_fd(&svq->svq_kick, svq_kick_fd);
> > -    event_notifier_set_handler(&svq->svq_kick, vhost_handle_guest_kick);
> > +    event_notifier_set_handler(&svq->svq_kick,
> > +                               vhost_handle_guest_kick_notifier);
> >
> >       if (!check_old || event_notifier_test_and_clear(&tmp)) {
> >           event_notifier_set(&svq->hdev_kick);
> >       }
> >   }
> >
> > +/**
> > + * Start shadow virtqueue operation.
> > + *
> > + * @svq Shadow Virtqueue
> > + * @vdev        VirtIO device
> > + * @vq          Virtqueue to shadow
> > + */
> > +void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev,
> > +                     VirtQueue *vq)
> > +{
> > +    svq->next_guest_avail_elem = NULL;
> > +    svq->avail_idx_shadow = 0;
> > +    svq->shadow_used_idx = 0;
> > +    svq->last_used_idx = 0;
> > +    svq->vdev = vdev;
> > +    svq->vq = vq;
> > +
> > +    memset(svq->vring.avail, 0, sizeof(*svq->vring.avail));
> > +    memset(svq->vring.used, 0, sizeof(*svq->vring.avail));
> > +    for (unsigned i = 0; i < svq->vring.num - 1; i++) {
> > +        svq->vring.desc[i].next = cpu_to_le16(i + 1);
> > +    }
> > +}
> > +
> >   /**
> >    * Stop shadow virtqueue operation.
> >    * @svq Shadow Virtqueue
> > @@ -272,6 +590,28 @@ void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd)
> >   void vhost_svq_stop(VhostShadowVirtqueue *svq)
> >   {
> >       event_notifier_set_handler(&svq->svq_kick, NULL);
> > +    g_autofree VirtQueueElement *next_avail_elem = NULL;
> > +
> > +    if (!svq->vq) {
> > +        return;
> > +    }
> > +
> > +    /* Send all pending used descriptors to guest */
> > +    vhost_svq_flush(svq, false);
> > +
> > +    for (unsigned i = 0; i < svq->vring.num; ++i) {
> > +        g_autofree VirtQueueElement *elem = NULL;
> > +        elem = g_steal_pointer(&svq->ring_id_maps[i]);
> > +        if (elem) {
> > +            virtqueue_detach_element(svq->vq, elem, elem->len);
> > +        }
> > +    }
> > +
> > +    next_avail_elem = g_steal_pointer(&svq->next_guest_avail_elem);
> > +    if (next_avail_elem) {
> > +        virtqueue_detach_element(svq->vq, next_avail_elem,
> > +                                 next_avail_elem->len);
> > +    }
> >   }
> >
> >   /**
> > @@ -316,7 +656,7 @@ VhostShadowVirtqueue *vhost_svq_new(uint16_t qsize)
> >       memset(svq->vring.desc, 0, driver_size);
> >       svq->vring.used = qemu_memalign(qemu_real_host_page_size, device_size);
> >       memset(svq->vring.used, 0, device_size);
> > -
> > +    svq->ring_id_maps = g_new0(VirtQueueElement *, qsize);
> >       event_notifier_set_handler(&svq->hdev_call, vhost_svq_handle_call);
> >       return g_steal_pointer(&svq);
> >
> > @@ -335,6 +675,7 @@ void vhost_svq_free(VhostShadowVirtqueue *vq)
> >       event_notifier_cleanup(&vq->hdev_kick);
> >       event_notifier_set_handler(&vq->hdev_call, NULL);
> >       event_notifier_cleanup(&vq->hdev_call);
> > +    g_free(vq->ring_id_maps);
> >       qemu_vfree(vq->vring.desc);
> >       qemu_vfree(vq->vring.used);
> >       g_free(vq);
> > diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
> > index 53e14bafa0..0e5c00ed7e 100644
> > --- a/hw/virtio/vhost-vdpa.c
> > +++ b/hw/virtio/vhost-vdpa.c
> > @@ -752,9 +752,9 @@ static int vhost_vdpa_set_vring_call(struct vhost_dev *dev,
> >    * Note that this function does not rewind kick file descriptor if cannot set
> >    * call one.
> >    */
> > -static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
> > -                                VhostShadowVirtqueue *svq,
> > -                                unsigned idx)
> > +static int vhost_vdpa_svq_set_fds(struct vhost_dev *dev,
> > +                                  VhostShadowVirtqueue *svq,
> > +                                  unsigned idx)
> >   {
> >       struct vhost_vring_file file = {
> >           .index = dev->vq_index + idx,
> > @@ -767,7 +767,7 @@ static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
> >       r = vhost_vdpa_set_vring_dev_kick(dev, &file);
> >       if (unlikely(r != 0)) {
> >           error_report("Can't set device kick fd (%d)", -r);
> > -        return false;
> > +        return r;
> >       }
> >
> >       event_notifier = vhost_svq_get_svq_call_notifier(svq);
> > @@ -777,6 +777,99 @@ static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
> >           error_report("Can't set device call fd (%d)", -r);
> >       }
> >
> > +    return r;
> > +}
> > +
> > +/**
> > + * Unmap SVQ area in the device
> > + */
> > +static bool vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v, hwaddr iova,
> > +                                      hwaddr size)
> > +{
> > +    int r;
> > +
> > +    size = ROUND_UP(size, qemu_real_host_page_size);
> > +    r = vhost_vdpa_dma_unmap(v, iova, size);
> > +    return r == 0;
> > +}
> > +
> > +static bool vhost_vdpa_svq_unmap_rings(struct vhost_dev *dev,
> > +                                       const VhostShadowVirtqueue *svq)
> > +{
> > +    struct vhost_vdpa *v = dev->opaque;
> > +    struct vhost_vring_addr svq_addr;
> > +    size_t device_size = vhost_svq_device_area_size(svq);
> > +    size_t driver_size = vhost_svq_driver_area_size(svq);
> > +    bool ok;
> > +
> > +    vhost_svq_get_vring_addr(svq, &svq_addr);
> > +
> > +    ok = vhost_vdpa_svq_unmap_ring(v, svq_addr.desc_user_addr, driver_size);
> > +    if (unlikely(!ok)) {
> > +        return false;
> > +    }
> > +
> > +    return vhost_vdpa_svq_unmap_ring(v, svq_addr.used_user_addr, device_size);
> > +}
> > +
> > +/**
> > + * Map shadow virtqueue rings in device
> > + *
> > + * @dev   The vhost device
> > + * @svq   The shadow virtqueue
> > + */
> > +static bool vhost_vdpa_svq_map_rings(struct vhost_dev *dev,
> > +                                     const VhostShadowVirtqueue *svq)
> > +{
> > +    struct vhost_vdpa *v = dev->opaque;
> > +    struct vhost_vring_addr svq_addr;
> > +    size_t device_size = vhost_svq_device_area_size(svq);
> > +    size_t driver_size = vhost_svq_driver_area_size(svq);
> > +    int r;
> > +
> > +    vhost_svq_get_vring_addr(svq, &svq_addr);
> > +
> > +    r = vhost_vdpa_dma_map(v, svq_addr.desc_user_addr, driver_size,
> > +                           (void *)svq_addr.desc_user_addr, true);
> > +    if (unlikely(r != 0)) {
> > +        return false;
> > +    }
> > +
> > +    r = vhost_vdpa_dma_map(v, svq_addr.used_user_addr, device_size,
> > +                           (void *)svq_addr.used_user_addr, false);
>
>
> Do we need unmap the driver area if we fail here?
>

Yes, this used to trust in unmap them at the disabling of SVQ. Now I
think we need to unmap as you say.

Thanks!

[1] https://lists.linuxfoundation.org/pipermail/virtualization/2021-March/053322.html

> Thanks
>
>
> > +    return r == 0;
> > +}
> > +
> > +static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
> > +                                VhostShadowVirtqueue *svq,
> > +                                unsigned idx)
> > +{
> > +    uint16_t vq_index = dev->vq_index + idx;
> > +    struct vhost_vring_state s = {
> > +        .index = vq_index,
> > +    };
> > +    int r;
> > +    bool ok;
> > +
> > +    r = vhost_vdpa_set_dev_vring_base(dev, &s);
> > +    if (unlikely(r)) {
> > +        error_report("Can't set vring base (%d)", r);
> > +        return false;
> > +    }
> > +
> > +    s.num = vhost_svq_get_num(svq);
> > +    r = vhost_vdpa_set_dev_vring_num(dev, &s);
> > +    if (unlikely(r)) {
> > +        error_report("Can't set vring num (%d)", r);
> > +        return false;
> > +    }
> > +
> > +    ok = vhost_vdpa_svq_map_rings(dev, svq);
> > +    if (unlikely(!ok)) {
> > +        return false;
> > +    }
> > +
> > +    r = vhost_vdpa_svq_set_fds(dev, svq, idx);
> >       return r == 0;
> >   }
> >
> > @@ -788,14 +881,24 @@ static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started)
> >       if (started) {
> >           vhost_vdpa_host_notifiers_init(dev);
> >           for (unsigned i = 0; i < v->shadow_vqs->len; ++i) {
> > +            VirtQueue *vq = virtio_get_queue(dev->vdev, dev->vq_index + i);
> >               VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
> >               bool ok = vhost_vdpa_svq_setup(dev, svq, i);
> >               if (unlikely(!ok)) {
> >                   return -1;
> >               }
> > +            vhost_svq_start(svq, dev->vdev, vq);
> >           }
> >           vhost_vdpa_set_vring_ready(dev);
> >       } else {
> > +        for (unsigned i = 0; i < v->shadow_vqs->len; ++i) {
> > +            VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs,
> > +                                                          i);
> > +            bool ok = vhost_vdpa_svq_unmap_rings(dev, svq);
> > +            if (unlikely(!ok)) {
> > +                return -1;
> > +            }
> > +        }
> >           vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs);
> >       }
> >
>

Re: [PATCH 18/31] vhost: Shadow virtqueue buffers forwarding

Posted by Jason Wang 4 years ago

在 2022/2/2 上午1:08, Eugenio Perez Martin 写道:
> On Sun, Jan 30, 2022 at 5:43 AM Jason Wang <jasowang@redhat.com> wrote:
>>
>> 在 2022/1/22 上午4:27, Eugenio Pérez 写道:
>>> Initial version of shadow virtqueue that actually forward buffers. There
>>> is no iommu support at the moment, and that will be addressed in future
>>> patches of this series. Since all vhost-vdpa devices use forced IOMMU,
>>> this means that SVQ is not usable at this point of the series on any
>>> device.
>>>
>>> For simplicity it only supports modern devices, that expects vring
>>> in little endian, with split ring and no event idx or indirect
>>> descriptors. Support for them will not be added in this series.
>>>
>>> It reuses the VirtQueue code for the device part. The driver part is
>>> based on Linux's virtio_ring driver, but with stripped functionality
>>> and optimizations so it's easier to review.
>>>
>>> However, forwarding buffers have some particular pieces: One of the most
>>> unexpected ones is that a guest's buffer can expand through more than
>>> one descriptor in SVQ. While this is handled gracefully by qemu's
>>> emulated virtio devices, it may cause unexpected SVQ queue full. This
>>> patch also solves it by checking for this condition at both guest's
>>> kicks and device's calls. The code may be more elegant in the future if
>>> SVQ code runs in its own iocontext.
>>>
>>> Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
>>> ---
>>>    hw/virtio/vhost-shadow-virtqueue.h |   2 +
>>>    hw/virtio/vhost-shadow-virtqueue.c | 365 ++++++++++++++++++++++++++++-
>>>    hw/virtio/vhost-vdpa.c             | 111 ++++++++-
>>>    3 files changed, 462 insertions(+), 16 deletions(-)
>>>
>>> diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
>>> index 39aef5ffdf..19c934af49 100644
>>> --- a/hw/virtio/vhost-shadow-virtqueue.h
>>> +++ b/hw/virtio/vhost-shadow-virtqueue.h
>>> @@ -33,6 +33,8 @@ uint16_t vhost_svq_get_num(const VhostShadowVirtqueue *svq);
>>>    size_t vhost_svq_driver_area_size(const VhostShadowVirtqueue *svq);
>>>    size_t vhost_svq_device_area_size(const VhostShadowVirtqueue *svq);
>>>
>>> +void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev,
>>> +                     VirtQueue *vq);
>>>    void vhost_svq_stop(VhostShadowVirtqueue *svq);
>>>
>>>    VhostShadowVirtqueue *vhost_svq_new(uint16_t qsize);
>>> diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
>>> index 7c168075d7..a1a404f68f 100644
>>> --- a/hw/virtio/vhost-shadow-virtqueue.c
>>> +++ b/hw/virtio/vhost-shadow-virtqueue.c
>>> @@ -9,6 +9,8 @@
>>>
>>>    #include "qemu/osdep.h"
>>>    #include "hw/virtio/vhost-shadow-virtqueue.h"
>>> +#include "hw/virtio/vhost.h"
>>> +#include "hw/virtio/virtio-access.h"
>>>    #include "standard-headers/linux/vhost_types.h"
>>>
>>>    #include "qemu/error-report.h"
>>> @@ -36,6 +38,33 @@ typedef struct VhostShadowVirtqueue {
>>>
>>>        /* Guest's call notifier, where SVQ calls guest. */
>>>        EventNotifier svq_call;
>>> +
>>> +    /* Virtio queue shadowing */
>>> +    VirtQueue *vq;
>>> +
>>> +    /* Virtio device */
>>> +    VirtIODevice *vdev;
>>> +
>>> +    /* Map for returning guest's descriptors */
>>> +    VirtQueueElement **ring_id_maps;
>>> +
>>> +    /* Next VirtQueue element that guest made available */
>>> +    VirtQueueElement *next_guest_avail_elem;
>>> +
>>> +    /* Next head to expose to device */
>>> +    uint16_t avail_idx_shadow;
>>> +
>>> +    /* Next free descriptor */
>>> +    uint16_t free_head;
>>> +
>>> +    /* Last seen used idx */
>>> +    uint16_t shadow_used_idx;
>>> +
>>> +    /* Next head to consume from device */
>>> +    uint16_t last_used_idx;
>>> +
>>> +    /* Cache for the exposed notification flag */
>>> +    bool notification;
>>>    } VhostShadowVirtqueue;
>>>
>>>    #define INVALID_SVQ_KICK_FD -1
>>> @@ -148,30 +177,294 @@ bool vhost_svq_ack_guest_features(uint64_t dev_features,
>>>        return true;
>>>    }
>>>
>>> -/* Forward guest notifications */
>>> -static void vhost_handle_guest_kick(EventNotifier *n)
>>> +/**
>>> + * Number of descriptors that SVQ can make available from the guest.
>>> + *
>>> + * @svq   The svq
>>> + */
>>> +static uint16_t vhost_svq_available_slots(const VhostShadowVirtqueue *svq)
>>>    {
>>> -    VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue,
>>> -                                             svq_kick);
>>> +    return svq->vring.num - (svq->avail_idx_shadow - svq->shadow_used_idx);
>>> +}
>>> +
>>> +static void vhost_svq_set_notification(VhostShadowVirtqueue *svq, bool enable)
>>> +{
>>> +    uint16_t notification_flag;
>>>
>>> -    if (unlikely(!event_notifier_test_and_clear(n))) {
>>> +    if (svq->notification == enable) {
>>> +        return;
>>> +    }
>>> +
>>> +    notification_flag = cpu_to_le16(VRING_AVAIL_F_NO_INTERRUPT);
>>> +
>>> +    svq->notification = enable;
>>> +    if (enable) {
>>> +        svq->vring.avail->flags &= ~notification_flag;
>>> +    } else {
>>> +        svq->vring.avail->flags |= notification_flag;
>>> +    }
>>> +}
>>> +
>>> +static void vhost_vring_write_descs(VhostShadowVirtqueue *svq,
>>> +                                    const struct iovec *iovec,
>>> +                                    size_t num, bool more_descs, bool write)
>>> +{
>>> +    uint16_t i = svq->free_head, last = svq->free_head;
>>> +    unsigned n;
>>> +    uint16_t flags = write ? cpu_to_le16(VRING_DESC_F_WRITE) : 0;
>>> +    vring_desc_t *descs = svq->vring.desc;
>>> +
>>> +    if (num == 0) {
>>> +        return;
>>> +    }
>>> +
>>> +    for (n = 0; n < num; n++) {
>>> +        if (more_descs || (n + 1 < num)) {
>>> +            descs[i].flags = flags | cpu_to_le16(VRING_DESC_F_NEXT);
>>> +        } else {
>>> +            descs[i].flags = flags;
>>> +        }
>>> +        descs[i].addr = cpu_to_le64((hwaddr)iovec[n].iov_base);
>>> +        descs[i].len = cpu_to_le32(iovec[n].iov_len);
>>> +
>>> +        last = i;
>>> +        i = cpu_to_le16(descs[i].next);
>>> +    }
>>> +
>>> +    svq->free_head = le16_to_cpu(descs[last].next);
>>> +}
>>> +
>>> +static unsigned vhost_svq_add_split(VhostShadowVirtqueue *svq,
>>> +                                    VirtQueueElement *elem)
>>> +{
>>> +    int head;
>>> +    unsigned avail_idx;
>>> +    vring_avail_t *avail = svq->vring.avail;
>>> +
>>> +    head = svq->free_head;
>>> +
>>> +    /* We need some descriptors here */
>>> +    assert(elem->out_num || elem->in_num);
>>
>> Looks like this could be triggered by guest, we need fail instead assert
>> here.
>>
> My understanding was that virtqueue_pop already sanitized that case,
> but I'm not able to find where now. I will recheck and, in case it's
> not, I will move to a failure.
>
>>> +
>>> +    vhost_vring_write_descs(svq, elem->out_sg, elem->out_num,
>>> +                            elem->in_num > 0, false);
>>> +    vhost_vring_write_descs(svq, elem->in_sg, elem->in_num, false, true);
>>> +
>>> +    /*
>>> +     * Put entry in available array (but don't update avail->idx until they
>>> +     * do sync).
>>> +     */
>>> +    avail_idx = svq->avail_idx_shadow & (svq->vring.num - 1);
>>> +    avail->ring[avail_idx] = cpu_to_le16(head);
>>> +    svq->avail_idx_shadow++;
>>> +
>>> +    /* Update avail index after the descriptor is wrote */
>>> +    smp_wmb();
>>> +    avail->idx = cpu_to_le16(svq->avail_idx_shadow);
>>> +
>>> +    return head;
>>> +}
>>> +
>>> +static void vhost_svq_add(VhostShadowVirtqueue *svq, VirtQueueElement *elem)
>>> +{
>>> +    unsigned qemu_head = vhost_svq_add_split(svq, elem);
>>> +
>>> +    svq->ring_id_maps[qemu_head] = elem;
>>> +}
>>> +
>>> +static void vhost_svq_kick(VhostShadowVirtqueue *svq)
>>> +{
>>> +    /* We need to expose available array entries before checking used flags */
>>> +    smp_mb();
>>> +    if (svq->vring.used->flags & VRING_USED_F_NO_NOTIFY) {
>>>            return;
>>>        }
>>>
>>>        event_notifier_set(&svq->hdev_kick);
>>>    }
>>>
>>> -/* Forward vhost notifications */
>>> +/**
>>> + * Forward available buffers.
>>> + *
>>> + * @svq Shadow VirtQueue
>>> + *
>>> + * Note that this function does not guarantee that all guest's available
>>> + * buffers are available to the device in SVQ avail ring. The guest may have
>>> + * exposed a GPA / GIOVA congiuous buffer, but it may not be contiguous in qemu
>>> + * vaddr.
>>> + *
>>> + * If that happens, guest's kick notifications will be disabled until device
>>> + * makes some buffers used.
>>> + */
>>> +static void vhost_handle_guest_kick(VhostShadowVirtqueue *svq)
>>> +{
>>> +    /* Clear event notifier */
>>> +    event_notifier_test_and_clear(&svq->svq_kick);
>>> +
>>> +    /* Make available as many buffers as possible */
>>> +    do {
>>> +        if (virtio_queue_get_notification(svq->vq)) {
>>> +            virtio_queue_set_notification(svq->vq, false);
>>
>> This looks like an optimization the should belong to
>> virtio_queue_set_notification() itself.
>>
> Sure we can move.
>
>>> +        }
>>> +
>>> +        while (true) {
>>> +            VirtQueueElement *elem;
>>> +
>>> +            if (svq->next_guest_avail_elem) {
>>> +                elem = g_steal_pointer(&svq->next_guest_avail_elem);
>>> +            } else {
>>> +                elem = virtqueue_pop(svq->vq, sizeof(*elem));
>>> +            }
>>> +
>>> +            if (!elem) {
>>> +                break;
>>> +            }
>>> +
>>> +            if (elem->out_num + elem->in_num >
>>> +                vhost_svq_available_slots(svq)) {
>>> +                /*
>>> +                 * This condition is possible since a contiguous buffer in GPA
>>> +                 * does not imply a contiguous buffer in qemu's VA
>>> +                 * scatter-gather segments. If that happen, the buffer exposed
>>> +                 * to the device needs to be a chain of descriptors at this
>>> +                 * moment.
>>> +                 *
>>> +                 * SVQ cannot hold more available buffers if we are here:
>>> +                 * queue the current guest descriptor and ignore further kicks
>>> +                 * until some elements are used.
>>> +                 */
>>> +                svq->next_guest_avail_elem = elem;
>>> +                return;
>>> +            }
>>> +
>>> +            vhost_svq_add(svq, elem);
>>> +            vhost_svq_kick(svq);
>>> +        }
>>> +
>>> +        virtio_queue_set_notification(svq->vq, true);
>>> +    } while (!virtio_queue_empty(svq->vq));
>>> +}
>>> +
>>> +/**
>>> + * Handle guest's kick.
>>> + *
>>> + * @n guest kick event notifier, the one that guest set to notify svq.
>>> + */
>>> +static void vhost_handle_guest_kick_notifier(EventNotifier *n)
>>> +{
>>> +    VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue,
>>> +                                             svq_kick);
>>> +    vhost_handle_guest_kick(svq);
>>> +}
>>> +
>>> +static bool vhost_svq_more_used(VhostShadowVirtqueue *svq)
>>> +{
>>> +    if (svq->last_used_idx != svq->shadow_used_idx) {
>>> +        return true;
>>> +    }
>>> +
>>> +    svq->shadow_used_idx = cpu_to_le16(svq->vring.used->idx);
>>> +
>>> +    return svq->last_used_idx != svq->shadow_used_idx;
>>> +}
>>> +
>>> +static VirtQueueElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq)
>>> +{
>>> +    vring_desc_t *descs = svq->vring.desc;
>>> +    const vring_used_t *used = svq->vring.used;
>>> +    vring_used_elem_t used_elem;
>>> +    uint16_t last_used;
>>> +
>>> +    if (!vhost_svq_more_used(svq)) {
>>> +        return NULL;
>>> +    }
>>> +
>>> +    /* Only get used array entries after they have been exposed by dev */
>>> +    smp_rmb();
>>> +    last_used = svq->last_used_idx & (svq->vring.num - 1);
>>> +    used_elem.id = le32_to_cpu(used->ring[last_used].id);
>>> +    used_elem.len = le32_to_cpu(used->ring[last_used].len);
>>> +
>>> +    svq->last_used_idx++;
>>> +    if (unlikely(used_elem.id >= svq->vring.num)) {
>>> +        error_report("Device %s says index %u is used", svq->vdev->name,
>>> +                     used_elem.id);
>>> +        return NULL;
>>> +    }
>>> +
>>> +    if (unlikely(!svq->ring_id_maps[used_elem.id])) {
>>> +        error_report(
>>> +            "Device %s says index %u is used, but it was not available",
>>> +            svq->vdev->name, used_elem.id);
>>> +        return NULL;
>>> +    }
>>> +
>>> +    descs[used_elem.id].next = svq->free_head;
>>> +    svq->free_head = used_elem.id;
>>> +
>>> +    svq->ring_id_maps[used_elem.id]->len = used_elem.len;
>>> +    return g_steal_pointer(&svq->ring_id_maps[used_elem.id]);
>>> +}
>>> +
>>> +static void vhost_svq_flush(VhostShadowVirtqueue *svq,
>>> +                            bool check_for_avail_queue)
>>> +{
>>> +    VirtQueue *vq = svq->vq;
>>> +
>>> +    /* Make as many buffers as possible used. */
>>> +    do {
>>> +        unsigned i = 0;
>>> +
>>> +        vhost_svq_set_notification(svq, false);
>>> +        while (true) {
>>> +            g_autofree VirtQueueElement *elem = vhost_svq_get_buf(svq);
>>> +            if (!elem) {
>>> +                break;
>>> +            }
>>> +
>>> +            if (unlikely(i >= svq->vring.num)) {
>>> +                virtio_error(svq->vdev,
>>> +                         "More than %u used buffers obtained in a %u size SVQ",
>>> +                         i, svq->vring.num);
>>> +                virtqueue_fill(vq, elem, elem->len, i);
>>> +                virtqueue_flush(vq, i);
>>
>> Let's simply use virtqueue_push() here?
>>
> virtqueue_push support to fill and flush only one element, instead of
> batch. I'm fine with either but I think the less updates to the used
> idx, the better.


Fine.


>
>>> +                i = 0;
>>
>> Do we need to bail out here?
>>
> Yes I guess we can simply return.
>
>>> +            }
>>> +            virtqueue_fill(vq, elem, elem->len, i++);
>>> +        }
>>> +
>>> +        virtqueue_flush(vq, i);
>>> +        event_notifier_set(&svq->svq_call);
>>> +
>>> +        if (check_for_avail_queue && svq->next_guest_avail_elem) {
>>> +            /*
>>> +             * Avail ring was full when vhost_svq_flush was called, so it's a
>>> +             * good moment to make more descriptors available if possible
>>> +             */
>>> +            vhost_handle_guest_kick(svq);
>>
>> Is there better to have a similar check as vhost_handle_guest_kick() did?
>>
>>               if (elem->out_num + elem->in_num >
>>                   vhost_svq_available_slots(svq)) {
>>
> It will be duplicated when we call vhost_handle_guest_kick, won't it?


Right, I mis-read the code.


>
>>> +        }
>>> +
>>> +        vhost_svq_set_notification(svq, true);
>>
>> A mb() is needed here? Otherwise we may lost a call here (where
>> vhost_svq_more_used() is run before vhost_svq_set_notification()).
>>
> I'm confused here then, I thought you said this is just a hint so
> there was no need? [1]. I think the memory barrier is needed too.


Yes, it's a hint but:

1) When we disable the notification, consider the notification disable 
is just a hint, device can still raise an interrupt, so the ordering is 
meaningless and a memory barrier is not necessary (the 
vhost_svq_set_notification(svq, false))

2) When we enable the notification, though it's a hint, the device can 
choose to implement it by enabling the interrupt, in this case, the 
notification enable should be done before checking the used. Otherwise, 
the checking of more used might be done before enable the notification:

1) driver check more used
2) device add more used but no notification
3) driver enable the notification then we lost a notification here


>>> +    } while (vhost_svq_more_used(svq));
>>> +}
>>> +
>>> +/**
>>> + * Forward used buffers.
>>> + *
>>> + * @n hdev call event notifier, the one that device set to notify svq.
>>> + *
>>> + * Note that we are not making any buffers available in the loop, there is no
>>> + * way that it runs more than virtqueue size times.
>>> + */
>>>    static void vhost_svq_handle_call(EventNotifier *n)
>>>    {
>>>        VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue,
>>>                                                 hdev_call);
>>>
>>> -    if (unlikely(!event_notifier_test_and_clear(n))) {
>>> -        return;
>>> -    }
>>> +    /* Clear event notifier */
>>> +    event_notifier_test_and_clear(n);
>>
>> Any reason that we remove the above check?
>>
> This comes from the previous versions, where this made sure we missed
> no used buffers in the process of switching to SVQ mode.


I'm not sure I get here. Even if for the switching, it should be more 
safe the handle the flush unconditionally?

Thanks


>
> If we enable SVQ from the beginning I think we can rely on getting all
> the device's used buffer notifications, so let me think a little bit
> and I can move to check the eventfd.
>
>>> -    event_notifier_set(&svq->svq_call);
>>> +    vhost_svq_flush(svq, true);
>>>    }
>>>
>>>    /**
>>> @@ -258,13 +551,38 @@ void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd)
>>>         * need to explicitely check for them.
>>>         */
>>>        event_notifier_init_fd(&svq->svq_kick, svq_kick_fd);
>>> -    event_notifier_set_handler(&svq->svq_kick, vhost_handle_guest_kick);
>>> +    event_notifier_set_handler(&svq->svq_kick,
>>> +                               vhost_handle_guest_kick_notifier);
>>>
>>>        if (!check_old || event_notifier_test_and_clear(&tmp)) {
>>>            event_notifier_set(&svq->hdev_kick);
>>>        }
>>>    }
>>>
>>> +/**
>>> + * Start shadow virtqueue operation.
>>> + *
>>> + * @svq Shadow Virtqueue
>>> + * @vdev        VirtIO device
>>> + * @vq          Virtqueue to shadow
>>> + */
>>> +void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev,
>>> +                     VirtQueue *vq)
>>> +{
>>> +    svq->next_guest_avail_elem = NULL;
>>> +    svq->avail_idx_shadow = 0;
>>> +    svq->shadow_used_idx = 0;
>>> +    svq->last_used_idx = 0;
>>> +    svq->vdev = vdev;
>>> +    svq->vq = vq;
>>> +
>>> +    memset(svq->vring.avail, 0, sizeof(*svq->vring.avail));
>>> +    memset(svq->vring.used, 0, sizeof(*svq->vring.avail));
>>> +    for (unsigned i = 0; i < svq->vring.num - 1; i++) {
>>> +        svq->vring.desc[i].next = cpu_to_le16(i + 1);
>>> +    }
>>> +}
>>> +
>>>    /**
>>>     * Stop shadow virtqueue operation.
>>>     * @svq Shadow Virtqueue
>>> @@ -272,6 +590,28 @@ void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd)
>>>    void vhost_svq_stop(VhostShadowVirtqueue *svq)
>>>    {
>>>        event_notifier_set_handler(&svq->svq_kick, NULL);
>>> +    g_autofree VirtQueueElement *next_avail_elem = NULL;
>>> +
>>> +    if (!svq->vq) {
>>> +        return;
>>> +    }
>>> +
>>> +    /* Send all pending used descriptors to guest */
>>> +    vhost_svq_flush(svq, false);
>>> +
>>> +    for (unsigned i = 0; i < svq->vring.num; ++i) {
>>> +        g_autofree VirtQueueElement *elem = NULL;
>>> +        elem = g_steal_pointer(&svq->ring_id_maps[i]);
>>> +        if (elem) {
>>> +            virtqueue_detach_element(svq->vq, elem, elem->len);
>>> +        }
>>> +    }
>>> +
>>> +    next_avail_elem = g_steal_pointer(&svq->next_guest_avail_elem);
>>> +    if (next_avail_elem) {
>>> +        virtqueue_detach_element(svq->vq, next_avail_elem,
>>> +                                 next_avail_elem->len);
>>> +    }
>>>    }
>>>
>>>    /**
>>> @@ -316,7 +656,7 @@ VhostShadowVirtqueue *vhost_svq_new(uint16_t qsize)
>>>        memset(svq->vring.desc, 0, driver_size);
>>>        svq->vring.used = qemu_memalign(qemu_real_host_page_size, device_size);
>>>        memset(svq->vring.used, 0, device_size);
>>> -
>>> +    svq->ring_id_maps = g_new0(VirtQueueElement *, qsize);
>>>        event_notifier_set_handler(&svq->hdev_call, vhost_svq_handle_call);
>>>        return g_steal_pointer(&svq);
>>>
>>> @@ -335,6 +675,7 @@ void vhost_svq_free(VhostShadowVirtqueue *vq)
>>>        event_notifier_cleanup(&vq->hdev_kick);
>>>        event_notifier_set_handler(&vq->hdev_call, NULL);
>>>        event_notifier_cleanup(&vq->hdev_call);
>>> +    g_free(vq->ring_id_maps);
>>>        qemu_vfree(vq->vring.desc);
>>>        qemu_vfree(vq->vring.used);
>>>        g_free(vq);
>>> diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
>>> index 53e14bafa0..0e5c00ed7e 100644
>>> --- a/hw/virtio/vhost-vdpa.c
>>> +++ b/hw/virtio/vhost-vdpa.c
>>> @@ -752,9 +752,9 @@ static int vhost_vdpa_set_vring_call(struct vhost_dev *dev,
>>>     * Note that this function does not rewind kick file descriptor if cannot set
>>>     * call one.
>>>     */
>>> -static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
>>> -                                VhostShadowVirtqueue *svq,
>>> -                                unsigned idx)
>>> +static int vhost_vdpa_svq_set_fds(struct vhost_dev *dev,
>>> +                                  VhostShadowVirtqueue *svq,
>>> +                                  unsigned idx)
>>>    {
>>>        struct vhost_vring_file file = {
>>>            .index = dev->vq_index + idx,
>>> @@ -767,7 +767,7 @@ static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
>>>        r = vhost_vdpa_set_vring_dev_kick(dev, &file);
>>>        if (unlikely(r != 0)) {
>>>            error_report("Can't set device kick fd (%d)", -r);
>>> -        return false;
>>> +        return r;
>>>        }
>>>
>>>        event_notifier = vhost_svq_get_svq_call_notifier(svq);
>>> @@ -777,6 +777,99 @@ static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
>>>            error_report("Can't set device call fd (%d)", -r);
>>>        }
>>>
>>> +    return r;
>>> +}
>>> +
>>> +/**
>>> + * Unmap SVQ area in the device
>>> + */
>>> +static bool vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v, hwaddr iova,
>>> +                                      hwaddr size)
>>> +{
>>> +    int r;
>>> +
>>> +    size = ROUND_UP(size, qemu_real_host_page_size);
>>> +    r = vhost_vdpa_dma_unmap(v, iova, size);
>>> +    return r == 0;
>>> +}
>>> +
>>> +static bool vhost_vdpa_svq_unmap_rings(struct vhost_dev *dev,
>>> +                                       const VhostShadowVirtqueue *svq)
>>> +{
>>> +    struct vhost_vdpa *v = dev->opaque;
>>> +    struct vhost_vring_addr svq_addr;
>>> +    size_t device_size = vhost_svq_device_area_size(svq);
>>> +    size_t driver_size = vhost_svq_driver_area_size(svq);
>>> +    bool ok;
>>> +
>>> +    vhost_svq_get_vring_addr(svq, &svq_addr);
>>> +
>>> +    ok = vhost_vdpa_svq_unmap_ring(v, svq_addr.desc_user_addr, driver_size);
>>> +    if (unlikely(!ok)) {
>>> +        return false;
>>> +    }
>>> +
>>> +    return vhost_vdpa_svq_unmap_ring(v, svq_addr.used_user_addr, device_size);
>>> +}
>>> +
>>> +/**
>>> + * Map shadow virtqueue rings in device
>>> + *
>>> + * @dev   The vhost device
>>> + * @svq   The shadow virtqueue
>>> + */
>>> +static bool vhost_vdpa_svq_map_rings(struct vhost_dev *dev,
>>> +                                     const VhostShadowVirtqueue *svq)
>>> +{
>>> +    struct vhost_vdpa *v = dev->opaque;
>>> +    struct vhost_vring_addr svq_addr;
>>> +    size_t device_size = vhost_svq_device_area_size(svq);
>>> +    size_t driver_size = vhost_svq_driver_area_size(svq);
>>> +    int r;
>>> +
>>> +    vhost_svq_get_vring_addr(svq, &svq_addr);
>>> +
>>> +    r = vhost_vdpa_dma_map(v, svq_addr.desc_user_addr, driver_size,
>>> +                           (void *)svq_addr.desc_user_addr, true);
>>> +    if (unlikely(r != 0)) {
>>> +        return false;
>>> +    }
>>> +
>>> +    r = vhost_vdpa_dma_map(v, svq_addr.used_user_addr, device_size,
>>> +                           (void *)svq_addr.used_user_addr, false);
>>
>> Do we need unmap the driver area if we fail here?
>>
> Yes, this used to trust in unmap them at the disabling of SVQ. Now I
> think we need to unmap as you say.
>
> Thanks!
>
> [1] https://lists.linuxfoundation.org/pipermail/virtualization/2021-March/053322.html
>
>> Thanks
>>
>>
>>> +    return r == 0;
>>> +}
>>> +
>>> +static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
>>> +                                VhostShadowVirtqueue *svq,
>>> +                                unsigned idx)
>>> +{
>>> +    uint16_t vq_index = dev->vq_index + idx;
>>> +    struct vhost_vring_state s = {
>>> +        .index = vq_index,
>>> +    };
>>> +    int r;
>>> +    bool ok;
>>> +
>>> +    r = vhost_vdpa_set_dev_vring_base(dev, &s);
>>> +    if (unlikely(r)) {
>>> +        error_report("Can't set vring base (%d)", r);
>>> +        return false;
>>> +    }
>>> +
>>> +    s.num = vhost_svq_get_num(svq);
>>> +    r = vhost_vdpa_set_dev_vring_num(dev, &s);
>>> +    if (unlikely(r)) {
>>> +        error_report("Can't set vring num (%d)", r);
>>> +        return false;
>>> +    }
>>> +
>>> +    ok = vhost_vdpa_svq_map_rings(dev, svq);
>>> +    if (unlikely(!ok)) {
>>> +        return false;
>>> +    }
>>> +
>>> +    r = vhost_vdpa_svq_set_fds(dev, svq, idx);
>>>        return r == 0;
>>>    }
>>>
>>> @@ -788,14 +881,24 @@ static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started)
>>>        if (started) {
>>>            vhost_vdpa_host_notifiers_init(dev);
>>>            for (unsigned i = 0; i < v->shadow_vqs->len; ++i) {
>>> +            VirtQueue *vq = virtio_get_queue(dev->vdev, dev->vq_index + i);
>>>                VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
>>>                bool ok = vhost_vdpa_svq_setup(dev, svq, i);
>>>                if (unlikely(!ok)) {
>>>                    return -1;
>>>                }
>>> +            vhost_svq_start(svq, dev->vdev, vq);
>>>            }
>>>            vhost_vdpa_set_vring_ready(dev);
>>>        } else {
>>> +        for (unsigned i = 0; i < v->shadow_vqs->len; ++i) {
>>> +            VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs,
>>> +                                                          i);
>>> +            bool ok = vhost_vdpa_svq_unmap_rings(dev, svq);
>>> +            if (unlikely(!ok)) {
>>> +                return -1;
>>> +            }
>>> +        }
>>>            vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs);
>>>        }
>>>

Re: [PATCH 18/31] vhost: Shadow virtqueue buffers forwarding

Posted by Eugenio Perez Martin 3 years, 11 months ago

On Tue, Feb 8, 2022 at 9:11 AM Jason Wang <jasowang@redhat.com> wrote:
>
>
> 在 2022/2/2 上午1:08, Eugenio Perez Martin 写道:
> > On Sun, Jan 30, 2022 at 5:43 AM Jason Wang <jasowang@redhat.com> wrote:
> >>
> >> 在 2022/1/22 上午4:27, Eugenio Pérez 写道:
> >>> Initial version of shadow virtqueue that actually forward buffers. There
> >>> is no iommu support at the moment, and that will be addressed in future
> >>> patches of this series. Since all vhost-vdpa devices use forced IOMMU,
> >>> this means that SVQ is not usable at this point of the series on any
> >>> device.
> >>>
> >>> For simplicity it only supports modern devices, that expects vring
> >>> in little endian, with split ring and no event idx or indirect
> >>> descriptors. Support for them will not be added in this series.
> >>>
> >>> It reuses the VirtQueue code for the device part. The driver part is
> >>> based on Linux's virtio_ring driver, but with stripped functionality
> >>> and optimizations so it's easier to review.
> >>>
> >>> However, forwarding buffers have some particular pieces: One of the most
> >>> unexpected ones is that a guest's buffer can expand through more than
> >>> one descriptor in SVQ. While this is handled gracefully by qemu's
> >>> emulated virtio devices, it may cause unexpected SVQ queue full. This
> >>> patch also solves it by checking for this condition at both guest's
> >>> kicks and device's calls. The code may be more elegant in the future if
> >>> SVQ code runs in its own iocontext.
> >>>
> >>> Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
> >>> ---
> >>>    hw/virtio/vhost-shadow-virtqueue.h |   2 +
> >>>    hw/virtio/vhost-shadow-virtqueue.c | 365 ++++++++++++++++++++++++++++-
> >>>    hw/virtio/vhost-vdpa.c             | 111 ++++++++-
> >>>    3 files changed, 462 insertions(+), 16 deletions(-)
> >>>
> >>> diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
> >>> index 39aef5ffdf..19c934af49 100644
> >>> --- a/hw/virtio/vhost-shadow-virtqueue.h
> >>> +++ b/hw/virtio/vhost-shadow-virtqueue.h
> >>> @@ -33,6 +33,8 @@ uint16_t vhost_svq_get_num(const VhostShadowVirtqueue *svq);
> >>>    size_t vhost_svq_driver_area_size(const VhostShadowVirtqueue *svq);
> >>>    size_t vhost_svq_device_area_size(const VhostShadowVirtqueue *svq);
> >>>
> >>> +void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev,
> >>> +                     VirtQueue *vq);
> >>>    void vhost_svq_stop(VhostShadowVirtqueue *svq);
> >>>
> >>>    VhostShadowVirtqueue *vhost_svq_new(uint16_t qsize);
> >>> diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
> >>> index 7c168075d7..a1a404f68f 100644
> >>> --- a/hw/virtio/vhost-shadow-virtqueue.c
> >>> +++ b/hw/virtio/vhost-shadow-virtqueue.c
> >>> @@ -9,6 +9,8 @@
> >>>
> >>>    #include "qemu/osdep.h"
> >>>    #include "hw/virtio/vhost-shadow-virtqueue.h"
> >>> +#include "hw/virtio/vhost.h"
> >>> +#include "hw/virtio/virtio-access.h"
> >>>    #include "standard-headers/linux/vhost_types.h"
> >>>
> >>>    #include "qemu/error-report.h"
> >>> @@ -36,6 +38,33 @@ typedef struct VhostShadowVirtqueue {
> >>>
> >>>        /* Guest's call notifier, where SVQ calls guest. */
> >>>        EventNotifier svq_call;
> >>> +
> >>> +    /* Virtio queue shadowing */
> >>> +    VirtQueue *vq;
> >>> +
> >>> +    /* Virtio device */
> >>> +    VirtIODevice *vdev;
> >>> +
> >>> +    /* Map for returning guest's descriptors */
> >>> +    VirtQueueElement **ring_id_maps;
> >>> +
> >>> +    /* Next VirtQueue element that guest made available */
> >>> +    VirtQueueElement *next_guest_avail_elem;
> >>> +
> >>> +    /* Next head to expose to device */
> >>> +    uint16_t avail_idx_shadow;
> >>> +
> >>> +    /* Next free descriptor */
> >>> +    uint16_t free_head;
> >>> +
> >>> +    /* Last seen used idx */
> >>> +    uint16_t shadow_used_idx;
> >>> +
> >>> +    /* Next head to consume from device */
> >>> +    uint16_t last_used_idx;
> >>> +
> >>> +    /* Cache for the exposed notification flag */
> >>> +    bool notification;
> >>>    } VhostShadowVirtqueue;
> >>>
> >>>    #define INVALID_SVQ_KICK_FD -1
> >>> @@ -148,30 +177,294 @@ bool vhost_svq_ack_guest_features(uint64_t dev_features,
> >>>        return true;
> >>>    }
> >>>
> >>> -/* Forward guest notifications */
> >>> -static void vhost_handle_guest_kick(EventNotifier *n)
> >>> +/**
> >>> + * Number of descriptors that SVQ can make available from the guest.
> >>> + *
> >>> + * @svq   The svq
> >>> + */
> >>> +static uint16_t vhost_svq_available_slots(const VhostShadowVirtqueue *svq)
> >>>    {
> >>> -    VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue,
> >>> -                                             svq_kick);
> >>> +    return svq->vring.num - (svq->avail_idx_shadow - svq->shadow_used_idx);
> >>> +}
> >>> +
> >>> +static void vhost_svq_set_notification(VhostShadowVirtqueue *svq, bool enable)
> >>> +{
> >>> +    uint16_t notification_flag;
> >>>
> >>> -    if (unlikely(!event_notifier_test_and_clear(n))) {
> >>> +    if (svq->notification == enable) {
> >>> +        return;
> >>> +    }
> >>> +
> >>> +    notification_flag = cpu_to_le16(VRING_AVAIL_F_NO_INTERRUPT);
> >>> +
> >>> +    svq->notification = enable;
> >>> +    if (enable) {
> >>> +        svq->vring.avail->flags &= ~notification_flag;
> >>> +    } else {
> >>> +        svq->vring.avail->flags |= notification_flag;
> >>> +    }
> >>> +}
> >>> +
> >>> +static void vhost_vring_write_descs(VhostShadowVirtqueue *svq,
> >>> +                                    const struct iovec *iovec,
> >>> +                                    size_t num, bool more_descs, bool write)
> >>> +{
> >>> +    uint16_t i = svq->free_head, last = svq->free_head;
> >>> +    unsigned n;
> >>> +    uint16_t flags = write ? cpu_to_le16(VRING_DESC_F_WRITE) : 0;
> >>> +    vring_desc_t *descs = svq->vring.desc;
> >>> +
> >>> +    if (num == 0) {
> >>> +        return;
> >>> +    }
> >>> +
> >>> +    for (n = 0; n < num; n++) {
> >>> +        if (more_descs || (n + 1 < num)) {
> >>> +            descs[i].flags = flags | cpu_to_le16(VRING_DESC_F_NEXT);
> >>> +        } else {
> >>> +            descs[i].flags = flags;
> >>> +        }
> >>> +        descs[i].addr = cpu_to_le64((hwaddr)iovec[n].iov_base);
> >>> +        descs[i].len = cpu_to_le32(iovec[n].iov_len);
> >>> +
> >>> +        last = i;
> >>> +        i = cpu_to_le16(descs[i].next);
> >>> +    }
> >>> +
> >>> +    svq->free_head = le16_to_cpu(descs[last].next);
> >>> +}
> >>> +
> >>> +static unsigned vhost_svq_add_split(VhostShadowVirtqueue *svq,
> >>> +                                    VirtQueueElement *elem)
> >>> +{
> >>> +    int head;
> >>> +    unsigned avail_idx;
> >>> +    vring_avail_t *avail = svq->vring.avail;
> >>> +
> >>> +    head = svq->free_head;
> >>> +
> >>> +    /* We need some descriptors here */
> >>> +    assert(elem->out_num || elem->in_num);
> >>
> >> Looks like this could be triggered by guest, we need fail instead assert
> >> here.
> >>
> > My understanding was that virtqueue_pop already sanitized that case,
> > but I'm not able to find where now. I will recheck and, in case it's
> > not, I will move to a failure.
> >
> >>> +
> >>> +    vhost_vring_write_descs(svq, elem->out_sg, elem->out_num,
> >>> +                            elem->in_num > 0, false);
> >>> +    vhost_vring_write_descs(svq, elem->in_sg, elem->in_num, false, true);
> >>> +
> >>> +    /*
> >>> +     * Put entry in available array (but don't update avail->idx until they
> >>> +     * do sync).
> >>> +     */
> >>> +    avail_idx = svq->avail_idx_shadow & (svq->vring.num - 1);
> >>> +    avail->ring[avail_idx] = cpu_to_le16(head);
> >>> +    svq->avail_idx_shadow++;
> >>> +
> >>> +    /* Update avail index after the descriptor is wrote */
> >>> +    smp_wmb();
> >>> +    avail->idx = cpu_to_le16(svq->avail_idx_shadow);
> >>> +
> >>> +    return head;
> >>> +}
> >>> +
> >>> +static void vhost_svq_add(VhostShadowVirtqueue *svq, VirtQueueElement *elem)
> >>> +{
> >>> +    unsigned qemu_head = vhost_svq_add_split(svq, elem);
> >>> +
> >>> +    svq->ring_id_maps[qemu_head] = elem;
> >>> +}
> >>> +
> >>> +static void vhost_svq_kick(VhostShadowVirtqueue *svq)
> >>> +{
> >>> +    /* We need to expose available array entries before checking used flags */
> >>> +    smp_mb();
> >>> +    if (svq->vring.used->flags & VRING_USED_F_NO_NOTIFY) {
> >>>            return;
> >>>        }
> >>>
> >>>        event_notifier_set(&svq->hdev_kick);
> >>>    }
> >>>
> >>> -/* Forward vhost notifications */
> >>> +/**
> >>> + * Forward available buffers.
> >>> + *
> >>> + * @svq Shadow VirtQueue
> >>> + *
> >>> + * Note that this function does not guarantee that all guest's available
> >>> + * buffers are available to the device in SVQ avail ring. The guest may have
> >>> + * exposed a GPA / GIOVA congiuous buffer, but it may not be contiguous in qemu
> >>> + * vaddr.
> >>> + *
> >>> + * If that happens, guest's kick notifications will be disabled until device
> >>> + * makes some buffers used.
> >>> + */
> >>> +static void vhost_handle_guest_kick(VhostShadowVirtqueue *svq)
> >>> +{
> >>> +    /* Clear event notifier */
> >>> +    event_notifier_test_and_clear(&svq->svq_kick);
> >>> +
> >>> +    /* Make available as many buffers as possible */
> >>> +    do {
> >>> +        if (virtio_queue_get_notification(svq->vq)) {
> >>> +            virtio_queue_set_notification(svq->vq, false);
> >>
> >> This looks like an optimization the should belong to
> >> virtio_queue_set_notification() itself.
> >>
> > Sure we can move.
> >
> >>> +        }
> >>> +
> >>> +        while (true) {
> >>> +            VirtQueueElement *elem;
> >>> +
> >>> +            if (svq->next_guest_avail_elem) {
> >>> +                elem = g_steal_pointer(&svq->next_guest_avail_elem);
> >>> +            } else {
> >>> +                elem = virtqueue_pop(svq->vq, sizeof(*elem));
> >>> +            }
> >>> +
> >>> +            if (!elem) {
> >>> +                break;
> >>> +            }
> >>> +
> >>> +            if (elem->out_num + elem->in_num >
> >>> +                vhost_svq_available_slots(svq)) {
> >>> +                /*
> >>> +                 * This condition is possible since a contiguous buffer in GPA
> >>> +                 * does not imply a contiguous buffer in qemu's VA
> >>> +                 * scatter-gather segments. If that happen, the buffer exposed
> >>> +                 * to the device needs to be a chain of descriptors at this
> >>> +                 * moment.
> >>> +                 *
> >>> +                 * SVQ cannot hold more available buffers if we are here:
> >>> +                 * queue the current guest descriptor and ignore further kicks
> >>> +                 * until some elements are used.
> >>> +                 */
> >>> +                svq->next_guest_avail_elem = elem;
> >>> +                return;
> >>> +            }
> >>> +
> >>> +            vhost_svq_add(svq, elem);
> >>> +            vhost_svq_kick(svq);
> >>> +        }
> >>> +
> >>> +        virtio_queue_set_notification(svq->vq, true);
> >>> +    } while (!virtio_queue_empty(svq->vq));
> >>> +}
> >>> +
> >>> +/**
> >>> + * Handle guest's kick.
> >>> + *
> >>> + * @n guest kick event notifier, the one that guest set to notify svq.
> >>> + */
> >>> +static void vhost_handle_guest_kick_notifier(EventNotifier *n)
> >>> +{
> >>> +    VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue,
> >>> +                                             svq_kick);
> >>> +    vhost_handle_guest_kick(svq);
> >>> +}
> >>> +
> >>> +static bool vhost_svq_more_used(VhostShadowVirtqueue *svq)
> >>> +{
> >>> +    if (svq->last_used_idx != svq->shadow_used_idx) {
> >>> +        return true;
> >>> +    }
> >>> +
> >>> +    svq->shadow_used_idx = cpu_to_le16(svq->vring.used->idx);
> >>> +
> >>> +    return svq->last_used_idx != svq->shadow_used_idx;
> >>> +}
> >>> +
> >>> +static VirtQueueElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq)
> >>> +{
> >>> +    vring_desc_t *descs = svq->vring.desc;
> >>> +    const vring_used_t *used = svq->vring.used;
> >>> +    vring_used_elem_t used_elem;
> >>> +    uint16_t last_used;
> >>> +
> >>> +    if (!vhost_svq_more_used(svq)) {
> >>> +        return NULL;
> >>> +    }
> >>> +
> >>> +    /* Only get used array entries after they have been exposed by dev */
> >>> +    smp_rmb();
> >>> +    last_used = svq->last_used_idx & (svq->vring.num - 1);
> >>> +    used_elem.id = le32_to_cpu(used->ring[last_used].id);
> >>> +    used_elem.len = le32_to_cpu(used->ring[last_used].len);
> >>> +
> >>> +    svq->last_used_idx++;
> >>> +    if (unlikely(used_elem.id >= svq->vring.num)) {
> >>> +        error_report("Device %s says index %u is used", svq->vdev->name,
> >>> +                     used_elem.id);
> >>> +        return NULL;
> >>> +    }
> >>> +
> >>> +    if (unlikely(!svq->ring_id_maps[used_elem.id])) {
> >>> +        error_report(
> >>> +            "Device %s says index %u is used, but it was not available",
> >>> +            svq->vdev->name, used_elem.id);
> >>> +        return NULL;
> >>> +    }
> >>> +
> >>> +    descs[used_elem.id].next = svq->free_head;
> >>> +    svq->free_head = used_elem.id;
> >>> +
> >>> +    svq->ring_id_maps[used_elem.id]->len = used_elem.len;
> >>> +    return g_steal_pointer(&svq->ring_id_maps[used_elem.id]);
> >>> +}
> >>> +
> >>> +static void vhost_svq_flush(VhostShadowVirtqueue *svq,
> >>> +                            bool check_for_avail_queue)
> >>> +{
> >>> +    VirtQueue *vq = svq->vq;
> >>> +
> >>> +    /* Make as many buffers as possible used. */
> >>> +    do {
> >>> +        unsigned i = 0;
> >>> +
> >>> +        vhost_svq_set_notification(svq, false);
> >>> +        while (true) {
> >>> +            g_autofree VirtQueueElement *elem = vhost_svq_get_buf(svq);
> >>> +            if (!elem) {
> >>> +                break;
> >>> +            }
> >>> +
> >>> +            if (unlikely(i >= svq->vring.num)) {
> >>> +                virtio_error(svq->vdev,
> >>> +                         "More than %u used buffers obtained in a %u size SVQ",
> >>> +                         i, svq->vring.num);
> >>> +                virtqueue_fill(vq, elem, elem->len, i);
> >>> +                virtqueue_flush(vq, i);
> >>
> >> Let's simply use virtqueue_push() here?
> >>
> > virtqueue_push support to fill and flush only one element, instead of
> > batch. I'm fine with either but I think the less updates to the used
> > idx, the better.
>
>
> Fine.
>
>
> >
> >>> +                i = 0;
> >>
> >> Do we need to bail out here?
> >>
> > Yes I guess we can simply return.
> >
> >>> +            }
> >>> +            virtqueue_fill(vq, elem, elem->len, i++);
> >>> +        }
> >>> +
> >>> +        virtqueue_flush(vq, i);
> >>> +        event_notifier_set(&svq->svq_call);
> >>> +
> >>> +        if (check_for_avail_queue && svq->next_guest_avail_elem) {
> >>> +            /*
> >>> +             * Avail ring was full when vhost_svq_flush was called, so it's a
> >>> +             * good moment to make more descriptors available if possible
> >>> +             */
> >>> +            vhost_handle_guest_kick(svq);
> >>
> >> Is there better to have a similar check as vhost_handle_guest_kick() did?
> >>
> >>               if (elem->out_num + elem->in_num >
> >>                   vhost_svq_available_slots(svq)) {
> >>
> > It will be duplicated when we call vhost_handle_guest_kick, won't it?
>
>
> Right, I mis-read the code.
>
>
> >
> >>> +        }
> >>> +
> >>> +        vhost_svq_set_notification(svq, true);
> >>
> >> A mb() is needed here? Otherwise we may lost a call here (where
> >> vhost_svq_more_used() is run before vhost_svq_set_notification()).
> >>
> > I'm confused here then, I thought you said this is just a hint so
> > there was no need? [1]. I think the memory barrier is needed too.
>
>
> Yes, it's a hint but:
>
> 1) When we disable the notification, consider the notification disable
> is just a hint, device can still raise an interrupt, so the ordering is
> meaningless and a memory barrier is not necessary (the
> vhost_svq_set_notification(svq, false))
>
> 2) When we enable the notification, though it's a hint, the device can
> choose to implement it by enabling the interrupt, in this case, the
> notification enable should be done before checking the used. Otherwise,
> the checking of more used might be done before enable the notification:
>
> 1) driver check more used
> 2) device add more used but no notification
> 3) driver enable the notification then we lost a notification here
>

That was my understanding too. So the right way is to only add the
memory barrier in case 2), when setting the flag, right?

>
> >>> +    } while (vhost_svq_more_used(svq));
> >>> +}
> >>> +
> >>> +/**
> >>> + * Forward used buffers.
> >>> + *
> >>> + * @n hdev call event notifier, the one that device set to notify svq.
> >>> + *
> >>> + * Note that we are not making any buffers available in the loop, there is no
> >>> + * way that it runs more than virtqueue size times.
> >>> + */
> >>>    static void vhost_svq_handle_call(EventNotifier *n)
> >>>    {
> >>>        VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue,
> >>>                                                 hdev_call);
> >>>
> >>> -    if (unlikely(!event_notifier_test_and_clear(n))) {
> >>> -        return;
> >>> -    }
> >>> +    /* Clear event notifier */
> >>> +    event_notifier_test_and_clear(n);
> >>
> >> Any reason that we remove the above check?
> >>
> > This comes from the previous versions, where this made sure we missed
> > no used buffers in the process of switching to SVQ mode.
>
>
> I'm not sure I get here. Even if for the switching, it should be more
> safe the handle the flush unconditionally?
>

Yes, I also think it's better to forward and kick/call unconditionally.

Thanks!

> Thanks
>
>
> >
> > If we enable SVQ from the beginning I think we can rely on getting all
> > the device's used buffer notifications, so let me think a little bit
> > and I can move to check the eventfd.
> >
> >>> -    event_notifier_set(&svq->svq_call);
> >>> +    vhost_svq_flush(svq, true);
> >>>    }
> >>>
> >>>    /**
> >>> @@ -258,13 +551,38 @@ void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd)
> >>>         * need to explicitely check for them.
> >>>         */
> >>>        event_notifier_init_fd(&svq->svq_kick, svq_kick_fd);
> >>> -    event_notifier_set_handler(&svq->svq_kick, vhost_handle_guest_kick);
> >>> +    event_notifier_set_handler(&svq->svq_kick,
> >>> +                               vhost_handle_guest_kick_notifier);
> >>>
> >>>        if (!check_old || event_notifier_test_and_clear(&tmp)) {
> >>>            event_notifier_set(&svq->hdev_kick);
> >>>        }
> >>>    }
> >>>
> >>> +/**
> >>> + * Start shadow virtqueue operation.
> >>> + *
> >>> + * @svq Shadow Virtqueue
> >>> + * @vdev        VirtIO device
> >>> + * @vq          Virtqueue to shadow
> >>> + */
> >>> +void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev,
> >>> +                     VirtQueue *vq)
> >>> +{
> >>> +    svq->next_guest_avail_elem = NULL;
> >>> +    svq->avail_idx_shadow = 0;
> >>> +    svq->shadow_used_idx = 0;
> >>> +    svq->last_used_idx = 0;
> >>> +    svq->vdev = vdev;
> >>> +    svq->vq = vq;
> >>> +
> >>> +    memset(svq->vring.avail, 0, sizeof(*svq->vring.avail));
> >>> +    memset(svq->vring.used, 0, sizeof(*svq->vring.avail));
> >>> +    for (unsigned i = 0; i < svq->vring.num - 1; i++) {
> >>> +        svq->vring.desc[i].next = cpu_to_le16(i + 1);
> >>> +    }
> >>> +}
> >>> +
> >>>    /**
> >>>     * Stop shadow virtqueue operation.
> >>>     * @svq Shadow Virtqueue
> >>> @@ -272,6 +590,28 @@ void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd)
> >>>    void vhost_svq_stop(VhostShadowVirtqueue *svq)
> >>>    {
> >>>        event_notifier_set_handler(&svq->svq_kick, NULL);
> >>> +    g_autofree VirtQueueElement *next_avail_elem = NULL;
> >>> +
> >>> +    if (!svq->vq) {
> >>> +        return;
> >>> +    }
> >>> +
> >>> +    /* Send all pending used descriptors to guest */
> >>> +    vhost_svq_flush(svq, false);
> >>> +
> >>> +    for (unsigned i = 0; i < svq->vring.num; ++i) {
> >>> +        g_autofree VirtQueueElement *elem = NULL;
> >>> +        elem = g_steal_pointer(&svq->ring_id_maps[i]);
> >>> +        if (elem) {
> >>> +            virtqueue_detach_element(svq->vq, elem, elem->len);
> >>> +        }
> >>> +    }
> >>> +
> >>> +    next_avail_elem = g_steal_pointer(&svq->next_guest_avail_elem);
> >>> +    if (next_avail_elem) {
> >>> +        virtqueue_detach_element(svq->vq, next_avail_elem,
> >>> +                                 next_avail_elem->len);
> >>> +    }
> >>>    }
> >>>
> >>>    /**
> >>> @@ -316,7 +656,7 @@ VhostShadowVirtqueue *vhost_svq_new(uint16_t qsize)
> >>>        memset(svq->vring.desc, 0, driver_size);
> >>>        svq->vring.used = qemu_memalign(qemu_real_host_page_size, device_size);
> >>>        memset(svq->vring.used, 0, device_size);
> >>> -
> >>> +    svq->ring_id_maps = g_new0(VirtQueueElement *, qsize);
> >>>        event_notifier_set_handler(&svq->hdev_call, vhost_svq_handle_call);
> >>>        return g_steal_pointer(&svq);
> >>>
> >>> @@ -335,6 +675,7 @@ void vhost_svq_free(VhostShadowVirtqueue *vq)
> >>>        event_notifier_cleanup(&vq->hdev_kick);
> >>>        event_notifier_set_handler(&vq->hdev_call, NULL);
> >>>        event_notifier_cleanup(&vq->hdev_call);
> >>> +    g_free(vq->ring_id_maps);
> >>>        qemu_vfree(vq->vring.desc);
> >>>        qemu_vfree(vq->vring.used);
> >>>        g_free(vq);
> >>> diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
> >>> index 53e14bafa0..0e5c00ed7e 100644
> >>> --- a/hw/virtio/vhost-vdpa.c
> >>> +++ b/hw/virtio/vhost-vdpa.c
> >>> @@ -752,9 +752,9 @@ static int vhost_vdpa_set_vring_call(struct vhost_dev *dev,
> >>>     * Note that this function does not rewind kick file descriptor if cannot set
> >>>     * call one.
> >>>     */
> >>> -static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
> >>> -                                VhostShadowVirtqueue *svq,
> >>> -                                unsigned idx)
> >>> +static int vhost_vdpa_svq_set_fds(struct vhost_dev *dev,
> >>> +                                  VhostShadowVirtqueue *svq,
> >>> +                                  unsigned idx)
> >>>    {
> >>>        struct vhost_vring_file file = {
> >>>            .index = dev->vq_index + idx,
> >>> @@ -767,7 +767,7 @@ static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
> >>>        r = vhost_vdpa_set_vring_dev_kick(dev, &file);
> >>>        if (unlikely(r != 0)) {
> >>>            error_report("Can't set device kick fd (%d)", -r);
> >>> -        return false;
> >>> +        return r;
> >>>        }
> >>>
> >>>        event_notifier = vhost_svq_get_svq_call_notifier(svq);
> >>> @@ -777,6 +777,99 @@ static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
> >>>            error_report("Can't set device call fd (%d)", -r);
> >>>        }
> >>>
> >>> +    return r;
> >>> +}
> >>> +
> >>> +/**
> >>> + * Unmap SVQ area in the device
> >>> + */
> >>> +static bool vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v, hwaddr iova,
> >>> +                                      hwaddr size)
> >>> +{
> >>> +    int r;
> >>> +
> >>> +    size = ROUND_UP(size, qemu_real_host_page_size);
> >>> +    r = vhost_vdpa_dma_unmap(v, iova, size);
> >>> +    return r == 0;
> >>> +}
> >>> +
> >>> +static bool vhost_vdpa_svq_unmap_rings(struct vhost_dev *dev,
> >>> +                                       const VhostShadowVirtqueue *svq)
> >>> +{
> >>> +    struct vhost_vdpa *v = dev->opaque;
> >>> +    struct vhost_vring_addr svq_addr;
> >>> +    size_t device_size = vhost_svq_device_area_size(svq);
> >>> +    size_t driver_size = vhost_svq_driver_area_size(svq);
> >>> +    bool ok;
> >>> +
> >>> +    vhost_svq_get_vring_addr(svq, &svq_addr);
> >>> +
> >>> +    ok = vhost_vdpa_svq_unmap_ring(v, svq_addr.desc_user_addr, driver_size);
> >>> +    if (unlikely(!ok)) {
> >>> +        return false;
> >>> +    }
> >>> +
> >>> +    return vhost_vdpa_svq_unmap_ring(v, svq_addr.used_user_addr, device_size);
> >>> +}
> >>> +
> >>> +/**
> >>> + * Map shadow virtqueue rings in device
> >>> + *
> >>> + * @dev   The vhost device
> >>> + * @svq   The shadow virtqueue
> >>> + */
> >>> +static bool vhost_vdpa_svq_map_rings(struct vhost_dev *dev,
> >>> +                                     const VhostShadowVirtqueue *svq)
> >>> +{
> >>> +    struct vhost_vdpa *v = dev->opaque;
> >>> +    struct vhost_vring_addr svq_addr;
> >>> +    size_t device_size = vhost_svq_device_area_size(svq);
> >>> +    size_t driver_size = vhost_svq_driver_area_size(svq);
> >>> +    int r;
> >>> +
> >>> +    vhost_svq_get_vring_addr(svq, &svq_addr);
> >>> +
> >>> +    r = vhost_vdpa_dma_map(v, svq_addr.desc_user_addr, driver_size,
> >>> +                           (void *)svq_addr.desc_user_addr, true);
> >>> +    if (unlikely(r != 0)) {
> >>> +        return false;
> >>> +    }
> >>> +
> >>> +    r = vhost_vdpa_dma_map(v, svq_addr.used_user_addr, device_size,
> >>> +                           (void *)svq_addr.used_user_addr, false);
> >>
> >> Do we need unmap the driver area if we fail here?
> >>
> > Yes, this used to trust in unmap them at the disabling of SVQ. Now I
> > think we need to unmap as you say.
> >
> > Thanks!
> >
> > [1] https://lists.linuxfoundation.org/pipermail/virtualization/2021-March/053322.html
> >
> >> Thanks
> >>
> >>
> >>> +    return r == 0;
> >>> +}
> >>> +
> >>> +static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
> >>> +                                VhostShadowVirtqueue *svq,
> >>> +                                unsigned idx)
> >>> +{
> >>> +    uint16_t vq_index = dev->vq_index + idx;
> >>> +    struct vhost_vring_state s = {
> >>> +        .index = vq_index,
> >>> +    };
> >>> +    int r;
> >>> +    bool ok;
> >>> +
> >>> +    r = vhost_vdpa_set_dev_vring_base(dev, &s);
> >>> +    if (unlikely(r)) {
> >>> +        error_report("Can't set vring base (%d)", r);
> >>> +        return false;
> >>> +    }
> >>> +
> >>> +    s.num = vhost_svq_get_num(svq);
> >>> +    r = vhost_vdpa_set_dev_vring_num(dev, &s);
> >>> +    if (unlikely(r)) {
> >>> +        error_report("Can't set vring num (%d)", r);
> >>> +        return false;
> >>> +    }
> >>> +
> >>> +    ok = vhost_vdpa_svq_map_rings(dev, svq);
> >>> +    if (unlikely(!ok)) {
> >>> +        return false;
> >>> +    }
> >>> +
> >>> +    r = vhost_vdpa_svq_set_fds(dev, svq, idx);
> >>>        return r == 0;
> >>>    }
> >>>
> >>> @@ -788,14 +881,24 @@ static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started)
> >>>        if (started) {
> >>>            vhost_vdpa_host_notifiers_init(dev);
> >>>            for (unsigned i = 0; i < v->shadow_vqs->len; ++i) {
> >>> +            VirtQueue *vq = virtio_get_queue(dev->vdev, dev->vq_index + i);
> >>>                VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
> >>>                bool ok = vhost_vdpa_svq_setup(dev, svq, i);
> >>>                if (unlikely(!ok)) {
> >>>                    return -1;
> >>>                }
> >>> +            vhost_svq_start(svq, dev->vdev, vq);
> >>>            }
> >>>            vhost_vdpa_set_vring_ready(dev);
> >>>        } else {
> >>> +        for (unsigned i = 0; i < v->shadow_vqs->len; ++i) {
> >>> +            VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs,
> >>> +                                                          i);
> >>> +            bool ok = vhost_vdpa_svq_unmap_rings(dev, svq);
> >>> +            if (unlikely(!ok)) {
> >>> +                return -1;
> >>> +            }
> >>> +        }
> >>>            vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs);
> >>>        }
> >>>
>

Re: [PATCH 18/31] vhost: Shadow virtqueue buffers forwarding

Posted by Jason Wang 3 years, 11 months ago

On Wed, Feb 23, 2022 at 3:01 AM Eugenio Perez Martin
<eperezma@redhat.com> wrote:
>
> On Tue, Feb 8, 2022 at 9:11 AM Jason Wang <jasowang@redhat.com> wrote:
> >
> >
> > 在 2022/2/2 上午1:08, Eugenio Perez Martin 写道:
> > > On Sun, Jan 30, 2022 at 5:43 AM Jason Wang <jasowang@redhat.com> wrote:
> > >>
> > >> 在 2022/1/22 上午4:27, Eugenio Pérez 写道:
> > >>> Initial version of shadow virtqueue that actually forward buffers. There
> > >>> is no iommu support at the moment, and that will be addressed in future
> > >>> patches of this series. Since all vhost-vdpa devices use forced IOMMU,
> > >>> this means that SVQ is not usable at this point of the series on any
> > >>> device.
> > >>>
> > >>> For simplicity it only supports modern devices, that expects vring
> > >>> in little endian, with split ring and no event idx or indirect
> > >>> descriptors. Support for them will not be added in this series.
> > >>>
> > >>> It reuses the VirtQueue code for the device part. The driver part is
> > >>> based on Linux's virtio_ring driver, but with stripped functionality
> > >>> and optimizations so it's easier to review.
> > >>>
> > >>> However, forwarding buffers have some particular pieces: One of the most
> > >>> unexpected ones is that a guest's buffer can expand through more than
> > >>> one descriptor in SVQ. While this is handled gracefully by qemu's
> > >>> emulated virtio devices, it may cause unexpected SVQ queue full. This
> > >>> patch also solves it by checking for this condition at both guest's
> > >>> kicks and device's calls. The code may be more elegant in the future if
> > >>> SVQ code runs in its own iocontext.
> > >>>
> > >>> Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
> > >>> ---
> > >>>    hw/virtio/vhost-shadow-virtqueue.h |   2 +
> > >>>    hw/virtio/vhost-shadow-virtqueue.c | 365 ++++++++++++++++++++++++++++-
> > >>>    hw/virtio/vhost-vdpa.c             | 111 ++++++++-
> > >>>    3 files changed, 462 insertions(+), 16 deletions(-)
> > >>>
> > >>> diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
> > >>> index 39aef5ffdf..19c934af49 100644
> > >>> --- a/hw/virtio/vhost-shadow-virtqueue.h
> > >>> +++ b/hw/virtio/vhost-shadow-virtqueue.h
> > >>> @@ -33,6 +33,8 @@ uint16_t vhost_svq_get_num(const VhostShadowVirtqueue *svq);
> > >>>    size_t vhost_svq_driver_area_size(const VhostShadowVirtqueue *svq);
> > >>>    size_t vhost_svq_device_area_size(const VhostShadowVirtqueue *svq);
> > >>>
> > >>> +void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev,
> > >>> +                     VirtQueue *vq);
> > >>>    void vhost_svq_stop(VhostShadowVirtqueue *svq);
> > >>>
> > >>>    VhostShadowVirtqueue *vhost_svq_new(uint16_t qsize);
> > >>> diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
> > >>> index 7c168075d7..a1a404f68f 100644
> > >>> --- a/hw/virtio/vhost-shadow-virtqueue.c
> > >>> +++ b/hw/virtio/vhost-shadow-virtqueue.c
> > >>> @@ -9,6 +9,8 @@
> > >>>
> > >>>    #include "qemu/osdep.h"
> > >>>    #include "hw/virtio/vhost-shadow-virtqueue.h"
> > >>> +#include "hw/virtio/vhost.h"
> > >>> +#include "hw/virtio/virtio-access.h"
> > >>>    #include "standard-headers/linux/vhost_types.h"
> > >>>
> > >>>    #include "qemu/error-report.h"
> > >>> @@ -36,6 +38,33 @@ typedef struct VhostShadowVirtqueue {
> > >>>
> > >>>        /* Guest's call notifier, where SVQ calls guest. */
> > >>>        EventNotifier svq_call;
> > >>> +
> > >>> +    /* Virtio queue shadowing */
> > >>> +    VirtQueue *vq;
> > >>> +
> > >>> +    /* Virtio device */
> > >>> +    VirtIODevice *vdev;
> > >>> +
> > >>> +    /* Map for returning guest's descriptors */
> > >>> +    VirtQueueElement **ring_id_maps;
> > >>> +
> > >>> +    /* Next VirtQueue element that guest made available */
> > >>> +    VirtQueueElement *next_guest_avail_elem;
> > >>> +
> > >>> +    /* Next head to expose to device */
> > >>> +    uint16_t avail_idx_shadow;
> > >>> +
> > >>> +    /* Next free descriptor */
> > >>> +    uint16_t free_head;
> > >>> +
> > >>> +    /* Last seen used idx */
> > >>> +    uint16_t shadow_used_idx;
> > >>> +
> > >>> +    /* Next head to consume from device */
> > >>> +    uint16_t last_used_idx;
> > >>> +
> > >>> +    /* Cache for the exposed notification flag */
> > >>> +    bool notification;
> > >>>    } VhostShadowVirtqueue;
> > >>>
> > >>>    #define INVALID_SVQ_KICK_FD -1
> > >>> @@ -148,30 +177,294 @@ bool vhost_svq_ack_guest_features(uint64_t dev_features,
> > >>>        return true;
> > >>>    }
> > >>>
> > >>> -/* Forward guest notifications */
> > >>> -static void vhost_handle_guest_kick(EventNotifier *n)
> > >>> +/**
> > >>> + * Number of descriptors that SVQ can make available from the guest.
> > >>> + *
> > >>> + * @svq   The svq
> > >>> + */
> > >>> +static uint16_t vhost_svq_available_slots(const VhostShadowVirtqueue *svq)
> > >>>    {
> > >>> -    VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue,
> > >>> -                                             svq_kick);
> > >>> +    return svq->vring.num - (svq->avail_idx_shadow - svq->shadow_used_idx);
> > >>> +}
> > >>> +
> > >>> +static void vhost_svq_set_notification(VhostShadowVirtqueue *svq, bool enable)
> > >>> +{
> > >>> +    uint16_t notification_flag;
> > >>>
> > >>> -    if (unlikely(!event_notifier_test_and_clear(n))) {
> > >>> +    if (svq->notification == enable) {
> > >>> +        return;
> > >>> +    }
> > >>> +
> > >>> +    notification_flag = cpu_to_le16(VRING_AVAIL_F_NO_INTERRUPT);
> > >>> +
> > >>> +    svq->notification = enable;
> > >>> +    if (enable) {
> > >>> +        svq->vring.avail->flags &= ~notification_flag;
> > >>> +    } else {
> > >>> +        svq->vring.avail->flags |= notification_flag;
> > >>> +    }
> > >>> +}
> > >>> +
> > >>> +static void vhost_vring_write_descs(VhostShadowVirtqueue *svq,
> > >>> +                                    const struct iovec *iovec,
> > >>> +                                    size_t num, bool more_descs, bool write)
> > >>> +{
> > >>> +    uint16_t i = svq->free_head, last = svq->free_head;
> > >>> +    unsigned n;
> > >>> +    uint16_t flags = write ? cpu_to_le16(VRING_DESC_F_WRITE) : 0;
> > >>> +    vring_desc_t *descs = svq->vring.desc;
> > >>> +
> > >>> +    if (num == 0) {
> > >>> +        return;
> > >>> +    }
> > >>> +
> > >>> +    for (n = 0; n < num; n++) {
> > >>> +        if (more_descs || (n + 1 < num)) {
> > >>> +            descs[i].flags = flags | cpu_to_le16(VRING_DESC_F_NEXT);
> > >>> +        } else {
> > >>> +            descs[i].flags = flags;
> > >>> +        }
> > >>> +        descs[i].addr = cpu_to_le64((hwaddr)iovec[n].iov_base);
> > >>> +        descs[i].len = cpu_to_le32(iovec[n].iov_len);
> > >>> +
> > >>> +        last = i;
> > >>> +        i = cpu_to_le16(descs[i].next);
> > >>> +    }
> > >>> +
> > >>> +    svq->free_head = le16_to_cpu(descs[last].next);
> > >>> +}
> > >>> +
> > >>> +static unsigned vhost_svq_add_split(VhostShadowVirtqueue *svq,
> > >>> +                                    VirtQueueElement *elem)
> > >>> +{
> > >>> +    int head;
> > >>> +    unsigned avail_idx;
> > >>> +    vring_avail_t *avail = svq->vring.avail;
> > >>> +
> > >>> +    head = svq->free_head;
> > >>> +
> > >>> +    /* We need some descriptors here */
> > >>> +    assert(elem->out_num || elem->in_num);
> > >>
> > >> Looks like this could be triggered by guest, we need fail instead assert
> > >> here.
> > >>
> > > My understanding was that virtqueue_pop already sanitized that case,
> > > but I'm not able to find where now. I will recheck and, in case it's
> > > not, I will move to a failure.
> > >
> > >>> +
> > >>> +    vhost_vring_write_descs(svq, elem->out_sg, elem->out_num,
> > >>> +                            elem->in_num > 0, false);
> > >>> +    vhost_vring_write_descs(svq, elem->in_sg, elem->in_num, false, true);
> > >>> +
> > >>> +    /*
> > >>> +     * Put entry in available array (but don't update avail->idx until they
> > >>> +     * do sync).
> > >>> +     */
> > >>> +    avail_idx = svq->avail_idx_shadow & (svq->vring.num - 1);
> > >>> +    avail->ring[avail_idx] = cpu_to_le16(head);
> > >>> +    svq->avail_idx_shadow++;
> > >>> +
> > >>> +    /* Update avail index after the descriptor is wrote */
> > >>> +    smp_wmb();
> > >>> +    avail->idx = cpu_to_le16(svq->avail_idx_shadow);
> > >>> +
> > >>> +    return head;
> > >>> +}
> > >>> +
> > >>> +static void vhost_svq_add(VhostShadowVirtqueue *svq, VirtQueueElement *elem)
> > >>> +{
> > >>> +    unsigned qemu_head = vhost_svq_add_split(svq, elem);
> > >>> +
> > >>> +    svq->ring_id_maps[qemu_head] = elem;
> > >>> +}
> > >>> +
> > >>> +static void vhost_svq_kick(VhostShadowVirtqueue *svq)
> > >>> +{
> > >>> +    /* We need to expose available array entries before checking used flags */
> > >>> +    smp_mb();
> > >>> +    if (svq->vring.used->flags & VRING_USED_F_NO_NOTIFY) {
> > >>>            return;
> > >>>        }
> > >>>
> > >>>        event_notifier_set(&svq->hdev_kick);
> > >>>    }
> > >>>
> > >>> -/* Forward vhost notifications */
> > >>> +/**
> > >>> + * Forward available buffers.
> > >>> + *
> > >>> + * @svq Shadow VirtQueue
> > >>> + *
> > >>> + * Note that this function does not guarantee that all guest's available
> > >>> + * buffers are available to the device in SVQ avail ring. The guest may have
> > >>> + * exposed a GPA / GIOVA congiuous buffer, but it may not be contiguous in qemu
> > >>> + * vaddr.
> > >>> + *
> > >>> + * If that happens, guest's kick notifications will be disabled until device
> > >>> + * makes some buffers used.
> > >>> + */
> > >>> +static void vhost_handle_guest_kick(VhostShadowVirtqueue *svq)
> > >>> +{
> > >>> +    /* Clear event notifier */
> > >>> +    event_notifier_test_and_clear(&svq->svq_kick);
> > >>> +
> > >>> +    /* Make available as many buffers as possible */
> > >>> +    do {
> > >>> +        if (virtio_queue_get_notification(svq->vq)) {
> > >>> +            virtio_queue_set_notification(svq->vq, false);
> > >>
> > >> This looks like an optimization the should belong to
> > >> virtio_queue_set_notification() itself.
> > >>
> > > Sure we can move.
> > >
> > >>> +        }
> > >>> +
> > >>> +        while (true) {
> > >>> +            VirtQueueElement *elem;
> > >>> +
> > >>> +            if (svq->next_guest_avail_elem) {
> > >>> +                elem = g_steal_pointer(&svq->next_guest_avail_elem);
> > >>> +            } else {
> > >>> +                elem = virtqueue_pop(svq->vq, sizeof(*elem));
> > >>> +            }
> > >>> +
> > >>> +            if (!elem) {
> > >>> +                break;
> > >>> +            }
> > >>> +
> > >>> +            if (elem->out_num + elem->in_num >
> > >>> +                vhost_svq_available_slots(svq)) {
> > >>> +                /*
> > >>> +                 * This condition is possible since a contiguous buffer in GPA
> > >>> +                 * does not imply a contiguous buffer in qemu's VA
> > >>> +                 * scatter-gather segments. If that happen, the buffer exposed
> > >>> +                 * to the device needs to be a chain of descriptors at this
> > >>> +                 * moment.
> > >>> +                 *
> > >>> +                 * SVQ cannot hold more available buffers if we are here:
> > >>> +                 * queue the current guest descriptor and ignore further kicks
> > >>> +                 * until some elements are used.
> > >>> +                 */
> > >>> +                svq->next_guest_avail_elem = elem;
> > >>> +                return;
> > >>> +            }
> > >>> +
> > >>> +            vhost_svq_add(svq, elem);
> > >>> +            vhost_svq_kick(svq);
> > >>> +        }
> > >>> +
> > >>> +        virtio_queue_set_notification(svq->vq, true);
> > >>> +    } while (!virtio_queue_empty(svq->vq));
> > >>> +}
> > >>> +
> > >>> +/**
> > >>> + * Handle guest's kick.
> > >>> + *
> > >>> + * @n guest kick event notifier, the one that guest set to notify svq.
> > >>> + */
> > >>> +static void vhost_handle_guest_kick_notifier(EventNotifier *n)
> > >>> +{
> > >>> +    VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue,
> > >>> +                                             svq_kick);
> > >>> +    vhost_handle_guest_kick(svq);
> > >>> +}
> > >>> +
> > >>> +static bool vhost_svq_more_used(VhostShadowVirtqueue *svq)
> > >>> +{
> > >>> +    if (svq->last_used_idx != svq->shadow_used_idx) {
> > >>> +        return true;
> > >>> +    }
> > >>> +
> > >>> +    svq->shadow_used_idx = cpu_to_le16(svq->vring.used->idx);
> > >>> +
> > >>> +    return svq->last_used_idx != svq->shadow_used_idx;
> > >>> +}
> > >>> +
> > >>> +static VirtQueueElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq)
> > >>> +{
> > >>> +    vring_desc_t *descs = svq->vring.desc;
> > >>> +    const vring_used_t *used = svq->vring.used;
> > >>> +    vring_used_elem_t used_elem;
> > >>> +    uint16_t last_used;
> > >>> +
> > >>> +    if (!vhost_svq_more_used(svq)) {
> > >>> +        return NULL;
> > >>> +    }
> > >>> +
> > >>> +    /* Only get used array entries after they have been exposed by dev */
> > >>> +    smp_rmb();
> > >>> +    last_used = svq->last_used_idx & (svq->vring.num - 1);
> > >>> +    used_elem.id = le32_to_cpu(used->ring[last_used].id);
> > >>> +    used_elem.len = le32_to_cpu(used->ring[last_used].len);
> > >>> +
> > >>> +    svq->last_used_idx++;
> > >>> +    if (unlikely(used_elem.id >= svq->vring.num)) {
> > >>> +        error_report("Device %s says index %u is used", svq->vdev->name,
> > >>> +                     used_elem.id);
> > >>> +        return NULL;
> > >>> +    }
> > >>> +
> > >>> +    if (unlikely(!svq->ring_id_maps[used_elem.id])) {
> > >>> +        error_report(
> > >>> +            "Device %s says index %u is used, but it was not available",
> > >>> +            svq->vdev->name, used_elem.id);
> > >>> +        return NULL;
> > >>> +    }
> > >>> +
> > >>> +    descs[used_elem.id].next = svq->free_head;
> > >>> +    svq->free_head = used_elem.id;
> > >>> +
> > >>> +    svq->ring_id_maps[used_elem.id]->len = used_elem.len;
> > >>> +    return g_steal_pointer(&svq->ring_id_maps[used_elem.id]);
> > >>> +}
> > >>> +
> > >>> +static void vhost_svq_flush(VhostShadowVirtqueue *svq,
> > >>> +                            bool check_for_avail_queue)
> > >>> +{
> > >>> +    VirtQueue *vq = svq->vq;
> > >>> +
> > >>> +    /* Make as many buffers as possible used. */
> > >>> +    do {
> > >>> +        unsigned i = 0;
> > >>> +
> > >>> +        vhost_svq_set_notification(svq, false);
> > >>> +        while (true) {
> > >>> +            g_autofree VirtQueueElement *elem = vhost_svq_get_buf(svq);
> > >>> +            if (!elem) {
> > >>> +                break;
> > >>> +            }
> > >>> +
> > >>> +            if (unlikely(i >= svq->vring.num)) {
> > >>> +                virtio_error(svq->vdev,
> > >>> +                         "More than %u used buffers obtained in a %u size SVQ",
> > >>> +                         i, svq->vring.num);
> > >>> +                virtqueue_fill(vq, elem, elem->len, i);
> > >>> +                virtqueue_flush(vq, i);
> > >>
> > >> Let's simply use virtqueue_push() here?
> > >>
> > > virtqueue_push support to fill and flush only one element, instead of
> > > batch. I'm fine with either but I think the less updates to the used
> > > idx, the better.
> >
> >
> > Fine.
> >
> >
> > >
> > >>> +                i = 0;
> > >>
> > >> Do we need to bail out here?
> > >>
> > > Yes I guess we can simply return.
> > >
> > >>> +            }
> > >>> +            virtqueue_fill(vq, elem, elem->len, i++);
> > >>> +        }
> > >>> +
> > >>> +        virtqueue_flush(vq, i);
> > >>> +        event_notifier_set(&svq->svq_call);
> > >>> +
> > >>> +        if (check_for_avail_queue && svq->next_guest_avail_elem) {
> > >>> +            /*
> > >>> +             * Avail ring was full when vhost_svq_flush was called, so it's a
> > >>> +             * good moment to make more descriptors available if possible
> > >>> +             */
> > >>> +            vhost_handle_guest_kick(svq);
> > >>
> > >> Is there better to have a similar check as vhost_handle_guest_kick() did?
> > >>
> > >>               if (elem->out_num + elem->in_num >
> > >>                   vhost_svq_available_slots(svq)) {
> > >>
> > > It will be duplicated when we call vhost_handle_guest_kick, won't it?
> >
> >
> > Right, I mis-read the code.
> >
> >
> > >
> > >>> +        }
> > >>> +
> > >>> +        vhost_svq_set_notification(svq, true);
> > >>
> > >> A mb() is needed here? Otherwise we may lost a call here (where
> > >> vhost_svq_more_used() is run before vhost_svq_set_notification()).
> > >>
> > > I'm confused here then, I thought you said this is just a hint so
> > > there was no need? [1]. I think the memory barrier is needed too.
> >
> >
> > Yes, it's a hint but:
> >
> > 1) When we disable the notification, consider the notification disable
> > is just a hint, device can still raise an interrupt, so the ordering is
> > meaningless and a memory barrier is not necessary (the
> > vhost_svq_set_notification(svq, false))
> >
> > 2) When we enable the notification, though it's a hint, the device can
> > choose to implement it by enabling the interrupt, in this case, the
> > notification enable should be done before checking the used. Otherwise,
> > the checking of more used might be done before enable the notification:
> >
> > 1) driver check more used
> > 2) device add more used but no notification
> > 3) driver enable the notification then we lost a notification here
> >
>
> That was my understanding too. So the right way is to only add the
> memory barrier in case 2), when setting the flag, right?

Yes.

>
> >
> > >>> +    } while (vhost_svq_more_used(svq));
> > >>> +}
> > >>> +
> > >>> +/**
> > >>> + * Forward used buffers.
> > >>> + *
> > >>> + * @n hdev call event notifier, the one that device set to notify svq.
> > >>> + *
> > >>> + * Note that we are not making any buffers available in the loop, there is no
> > >>> + * way that it runs more than virtqueue size times.
> > >>> + */
> > >>>    static void vhost_svq_handle_call(EventNotifier *n)
> > >>>    {
> > >>>        VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue,
> > >>>                                                 hdev_call);
> > >>>
> > >>> -    if (unlikely(!event_notifier_test_and_clear(n))) {
> > >>> -        return;
> > >>> -    }
> > >>> +    /* Clear event notifier */
> > >>> +    event_notifier_test_and_clear(n);
> > >>
> > >> Any reason that we remove the above check?
> > >>
> > > This comes from the previous versions, where this made sure we missed
> > > no used buffers in the process of switching to SVQ mode.
> >
> >
> > I'm not sure I get here. Even if for the switching, it should be more
> > safe the handle the flush unconditionally?
> >
>
> Yes, I also think it's better to forward and kick/call unconditionally.
>
> Thanks!

Ok.

Thanks

>
> > Thanks
> >
> >
> > >
> > > If we enable SVQ from the beginning I think we can rely on getting all
> > > the device's used buffer notifications, so let me think a little bit
> > > and I can move to check the eventfd.
> > >
> > >>> -    event_notifier_set(&svq->svq_call);
> > >>> +    vhost_svq_flush(svq, true);
> > >>>    }
> > >>>
> > >>>    /**
> > >>> @@ -258,13 +551,38 @@ void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd)
> > >>>         * need to explicitely check for them.
> > >>>         */
> > >>>        event_notifier_init_fd(&svq->svq_kick, svq_kick_fd);
> > >>> -    event_notifier_set_handler(&svq->svq_kick, vhost_handle_guest_kick);
> > >>> +    event_notifier_set_handler(&svq->svq_kick,
> > >>> +                               vhost_handle_guest_kick_notifier);
> > >>>
> > >>>        if (!check_old || event_notifier_test_and_clear(&tmp)) {
> > >>>            event_notifier_set(&svq->hdev_kick);
> > >>>        }
> > >>>    }
> > >>>
> > >>> +/**
> > >>> + * Start shadow virtqueue operation.
> > >>> + *
> > >>> + * @svq Shadow Virtqueue
> > >>> + * @vdev        VirtIO device
> > >>> + * @vq          Virtqueue to shadow
> > >>> + */
> > >>> +void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev,
> > >>> +                     VirtQueue *vq)
> > >>> +{
> > >>> +    svq->next_guest_avail_elem = NULL;
> > >>> +    svq->avail_idx_shadow = 0;
> > >>> +    svq->shadow_used_idx = 0;
> > >>> +    svq->last_used_idx = 0;
> > >>> +    svq->vdev = vdev;
> > >>> +    svq->vq = vq;
> > >>> +
> > >>> +    memset(svq->vring.avail, 0, sizeof(*svq->vring.avail));
> > >>> +    memset(svq->vring.used, 0, sizeof(*svq->vring.avail));
> > >>> +    for (unsigned i = 0; i < svq->vring.num - 1; i++) {
> > >>> +        svq->vring.desc[i].next = cpu_to_le16(i + 1);
> > >>> +    }
> > >>> +}
> > >>> +
> > >>>    /**
> > >>>     * Stop shadow virtqueue operation.
> > >>>     * @svq Shadow Virtqueue
> > >>> @@ -272,6 +590,28 @@ void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd)
> > >>>    void vhost_svq_stop(VhostShadowVirtqueue *svq)
> > >>>    {
> > >>>        event_notifier_set_handler(&svq->svq_kick, NULL);
> > >>> +    g_autofree VirtQueueElement *next_avail_elem = NULL;
> > >>> +
> > >>> +    if (!svq->vq) {
> > >>> +        return;
> > >>> +    }
> > >>> +
> > >>> +    /* Send all pending used descriptors to guest */
> > >>> +    vhost_svq_flush(svq, false);
> > >>> +
> > >>> +    for (unsigned i = 0; i < svq->vring.num; ++i) {
> > >>> +        g_autofree VirtQueueElement *elem = NULL;
> > >>> +        elem = g_steal_pointer(&svq->ring_id_maps[i]);
> > >>> +        if (elem) {
> > >>> +            virtqueue_detach_element(svq->vq, elem, elem->len);
> > >>> +        }
> > >>> +    }
> > >>> +
> > >>> +    next_avail_elem = g_steal_pointer(&svq->next_guest_avail_elem);
> > >>> +    if (next_avail_elem) {
> > >>> +        virtqueue_detach_element(svq->vq, next_avail_elem,
> > >>> +                                 next_avail_elem->len);
> > >>> +    }
> > >>>    }
> > >>>
> > >>>    /**
> > >>> @@ -316,7 +656,7 @@ VhostShadowVirtqueue *vhost_svq_new(uint16_t qsize)
> > >>>        memset(svq->vring.desc, 0, driver_size);
> > >>>        svq->vring.used = qemu_memalign(qemu_real_host_page_size, device_size);
> > >>>        memset(svq->vring.used, 0, device_size);
> > >>> -
> > >>> +    svq->ring_id_maps = g_new0(VirtQueueElement *, qsize);
> > >>>        event_notifier_set_handler(&svq->hdev_call, vhost_svq_handle_call);
> > >>>        return g_steal_pointer(&svq);
> > >>>
> > >>> @@ -335,6 +675,7 @@ void vhost_svq_free(VhostShadowVirtqueue *vq)
> > >>>        event_notifier_cleanup(&vq->hdev_kick);
> > >>>        event_notifier_set_handler(&vq->hdev_call, NULL);
> > >>>        event_notifier_cleanup(&vq->hdev_call);
> > >>> +    g_free(vq->ring_id_maps);
> > >>>        qemu_vfree(vq->vring.desc);
> > >>>        qemu_vfree(vq->vring.used);
> > >>>        g_free(vq);
> > >>> diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
> > >>> index 53e14bafa0..0e5c00ed7e 100644
> > >>> --- a/hw/virtio/vhost-vdpa.c
> > >>> +++ b/hw/virtio/vhost-vdpa.c
> > >>> @@ -752,9 +752,9 @@ static int vhost_vdpa_set_vring_call(struct vhost_dev *dev,
> > >>>     * Note that this function does not rewind kick file descriptor if cannot set
> > >>>     * call one.
> > >>>     */
> > >>> -static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
> > >>> -                                VhostShadowVirtqueue *svq,
> > >>> -                                unsigned idx)
> > >>> +static int vhost_vdpa_svq_set_fds(struct vhost_dev *dev,
> > >>> +                                  VhostShadowVirtqueue *svq,
> > >>> +                                  unsigned idx)
> > >>>    {
> > >>>        struct vhost_vring_file file = {
> > >>>            .index = dev->vq_index + idx,
> > >>> @@ -767,7 +767,7 @@ static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
> > >>>        r = vhost_vdpa_set_vring_dev_kick(dev, &file);
> > >>>        if (unlikely(r != 0)) {
> > >>>            error_report("Can't set device kick fd (%d)", -r);
> > >>> -        return false;
> > >>> +        return r;
> > >>>        }
> > >>>
> > >>>        event_notifier = vhost_svq_get_svq_call_notifier(svq);
> > >>> @@ -777,6 +777,99 @@ static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
> > >>>            error_report("Can't set device call fd (%d)", -r);
> > >>>        }
> > >>>
> > >>> +    return r;
> > >>> +}
> > >>> +
> > >>> +/**
> > >>> + * Unmap SVQ area in the device
> > >>> + */
> > >>> +static bool vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v, hwaddr iova,
> > >>> +                                      hwaddr size)
> > >>> +{
> > >>> +    int r;
> > >>> +
> > >>> +    size = ROUND_UP(size, qemu_real_host_page_size);
> > >>> +    r = vhost_vdpa_dma_unmap(v, iova, size);
> > >>> +    return r == 0;
> > >>> +}
> > >>> +
> > >>> +static bool vhost_vdpa_svq_unmap_rings(struct vhost_dev *dev,
> > >>> +                                       const VhostShadowVirtqueue *svq)
> > >>> +{
> > >>> +    struct vhost_vdpa *v = dev->opaque;
> > >>> +    struct vhost_vring_addr svq_addr;
> > >>> +    size_t device_size = vhost_svq_device_area_size(svq);
> > >>> +    size_t driver_size = vhost_svq_driver_area_size(svq);
> > >>> +    bool ok;
> > >>> +
> > >>> +    vhost_svq_get_vring_addr(svq, &svq_addr);
> > >>> +
> > >>> +    ok = vhost_vdpa_svq_unmap_ring(v, svq_addr.desc_user_addr, driver_size);
> > >>> +    if (unlikely(!ok)) {
> > >>> +        return false;
> > >>> +    }
> > >>> +
> > >>> +    return vhost_vdpa_svq_unmap_ring(v, svq_addr.used_user_addr, device_size);
> > >>> +}
> > >>> +
> > >>> +/**
> > >>> + * Map shadow virtqueue rings in device
> > >>> + *
> > >>> + * @dev   The vhost device
> > >>> + * @svq   The shadow virtqueue
> > >>> + */
> > >>> +static bool vhost_vdpa_svq_map_rings(struct vhost_dev *dev,
> > >>> +                                     const VhostShadowVirtqueue *svq)
> > >>> +{
> > >>> +    struct vhost_vdpa *v = dev->opaque;
> > >>> +    struct vhost_vring_addr svq_addr;
> > >>> +    size_t device_size = vhost_svq_device_area_size(svq);
> > >>> +    size_t driver_size = vhost_svq_driver_area_size(svq);
> > >>> +    int r;
> > >>> +
> > >>> +    vhost_svq_get_vring_addr(svq, &svq_addr);
> > >>> +
> > >>> +    r = vhost_vdpa_dma_map(v, svq_addr.desc_user_addr, driver_size,
> > >>> +                           (void *)svq_addr.desc_user_addr, true);
> > >>> +    if (unlikely(r != 0)) {
> > >>> +        return false;
> > >>> +    }
> > >>> +
> > >>> +    r = vhost_vdpa_dma_map(v, svq_addr.used_user_addr, device_size,
> > >>> +                           (void *)svq_addr.used_user_addr, false);
> > >>
> > >> Do we need unmap the driver area if we fail here?
> > >>
> > > Yes, this used to trust in unmap them at the disabling of SVQ. Now I
> > > think we need to unmap as you say.
> > >
> > > Thanks!
> > >
> > > [1] https://lists.linuxfoundation.org/pipermail/virtualization/2021-March/053322.html
> > >
> > >> Thanks
> > >>
> > >>
> > >>> +    return r == 0;
> > >>> +}
> > >>> +
> > >>> +static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
> > >>> +                                VhostShadowVirtqueue *svq,
> > >>> +                                unsigned idx)
> > >>> +{
> > >>> +    uint16_t vq_index = dev->vq_index + idx;
> > >>> +    struct vhost_vring_state s = {
> > >>> +        .index = vq_index,
> > >>> +    };
> > >>> +    int r;
> > >>> +    bool ok;
> > >>> +
> > >>> +    r = vhost_vdpa_set_dev_vring_base(dev, &s);
> > >>> +    if (unlikely(r)) {
> > >>> +        error_report("Can't set vring base (%d)", r);
> > >>> +        return false;
> > >>> +    }
> > >>> +
> > >>> +    s.num = vhost_svq_get_num(svq);
> > >>> +    r = vhost_vdpa_set_dev_vring_num(dev, &s);
> > >>> +    if (unlikely(r)) {
> > >>> +        error_report("Can't set vring num (%d)", r);
> > >>> +        return false;
> > >>> +    }
> > >>> +
> > >>> +    ok = vhost_vdpa_svq_map_rings(dev, svq);
> > >>> +    if (unlikely(!ok)) {
> > >>> +        return false;
> > >>> +    }
> > >>> +
> > >>> +    r = vhost_vdpa_svq_set_fds(dev, svq, idx);
> > >>>        return r == 0;
> > >>>    }
> > >>>
> > >>> @@ -788,14 +881,24 @@ static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started)
> > >>>        if (started) {
> > >>>            vhost_vdpa_host_notifiers_init(dev);
> > >>>            for (unsigned i = 0; i < v->shadow_vqs->len; ++i) {
> > >>> +            VirtQueue *vq = virtio_get_queue(dev->vdev, dev->vq_index + i);
> > >>>                VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
> > >>>                bool ok = vhost_vdpa_svq_setup(dev, svq, i);
> > >>>                if (unlikely(!ok)) {
> > >>>                    return -1;
> > >>>                }
> > >>> +            vhost_svq_start(svq, dev->vdev, vq);
> > >>>            }
> > >>>            vhost_vdpa_set_vring_ready(dev);
> > >>>        } else {
> > >>> +        for (unsigned i = 0; i < v->shadow_vqs->len; ++i) {
> > >>> +            VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs,
> > >>> +                                                          i);
> > >>> +            bool ok = vhost_vdpa_svq_unmap_rings(dev, svq);
> > >>> +            if (unlikely(!ok)) {
> > >>> +                return -1;
> > >>> +            }
> > >>> +        }
> > >>>            vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs);
> > >>>        }
> > >>>
> >
>

Re: [PATCH 18/31] vhost: Shadow virtqueue buffers forwarding

Posted by Jason Wang 4 years ago

在 2022/1/22 上午4:27, Eugenio Pérez 写道:
> @@ -272,6 +590,28 @@ void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd)
>   void vhost_svq_stop(VhostShadowVirtqueue *svq)
>   {
>       event_notifier_set_handler(&svq->svq_kick, NULL);
> +    g_autofree VirtQueueElement *next_avail_elem = NULL;
> +
> +    if (!svq->vq) {
> +        return;
> +    }
> +
> +    /* Send all pending used descriptors to guest */
> +    vhost_svq_flush(svq, false);


Do we need to wait for all the pending descriptors to be completed here?

Thanks


> +
> +    for (unsigned i = 0; i < svq->vring.num; ++i) {
> +        g_autofree VirtQueueElement *elem = NULL;
> +        elem = g_steal_pointer(&svq->ring_id_maps[i]);
> +        if (elem) {
> +            virtqueue_detach_element(svq->vq, elem, elem->len);
> +        }
> +    }
> +
> +    next_avail_elem = g_steal_pointer(&svq->next_guest_avail_elem);
> +    if (next_avail_elem) {
> +        virtqueue_detach_element(svq->vq, next_avail_elem,
> +                                 next_avail_elem->len);
> +    }
>   }

Re: [PATCH 18/31] vhost: Shadow virtqueue buffers forwarding

Posted by Eugenio Perez Martin 4 years ago

On Sun, Jan 30, 2022 at 7:47 AM Jason Wang <jasowang@redhat.com> wrote:
>
>
> 在 2022/1/22 上午4:27, Eugenio Pérez 写道:
> > @@ -272,6 +590,28 @@ void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd)
> >   void vhost_svq_stop(VhostShadowVirtqueue *svq)
> >   {
> >       event_notifier_set_handler(&svq->svq_kick, NULL);
> > +    g_autofree VirtQueueElement *next_avail_elem = NULL;
> > +
> > +    if (!svq->vq) {
> > +        return;
> > +    }
> > +
> > +    /* Send all pending used descriptors to guest */
> > +    vhost_svq_flush(svq, false);
>
>
> Do we need to wait for all the pending descriptors to be completed here?
>

No, this function does not wait, it only completes the forwarding of
the *used* descriptors.

The best example is the net rx queue in my opinion. This call will
check SVQ's vring used_idx and will forward the last used descriptors
if any, but all available descriptors will remain as available for
qemu's VQ code.

To skip it would miss those last rx descriptors in migration.

Thanks!

> Thanks
>
>
> > +
> > +    for (unsigned i = 0; i < svq->vring.num; ++i) {
> > +        g_autofree VirtQueueElement *elem = NULL;
> > +        elem = g_steal_pointer(&svq->ring_id_maps[i]);
> > +        if (elem) {
> > +            virtqueue_detach_element(svq->vq, elem, elem->len);
> > +        }
> > +    }
> > +
> > +    next_avail_elem = g_steal_pointer(&svq->next_guest_avail_elem);
> > +    if (next_avail_elem) {
> > +        virtqueue_detach_element(svq->vq, next_avail_elem,
> > +                                 next_avail_elem->len);
> > +    }
> >   }
>

Re: [PATCH 18/31] vhost: Shadow virtqueue buffers forwarding

Posted by Jason Wang 4 years ago

在 2022/2/1 下午7:25, Eugenio Perez Martin 写道:
> On Sun, Jan 30, 2022 at 7:47 AM Jason Wang <jasowang@redhat.com> wrote:
>>
>> 在 2022/1/22 上午4:27, Eugenio Pérez 写道:
>>> @@ -272,6 +590,28 @@ void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd)
>>>    void vhost_svq_stop(VhostShadowVirtqueue *svq)
>>>    {
>>>        event_notifier_set_handler(&svq->svq_kick, NULL);
>>> +    g_autofree VirtQueueElement *next_avail_elem = NULL;
>>> +
>>> +    if (!svq->vq) {
>>> +        return;
>>> +    }
>>> +
>>> +    /* Send all pending used descriptors to guest */
>>> +    vhost_svq_flush(svq, false);
>>
>> Do we need to wait for all the pending descriptors to be completed here?
>>
> No, this function does not wait, it only completes the forwarding of
> the *used* descriptors.
>
> The best example is the net rx queue in my opinion. This call will
> check SVQ's vring used_idx and will forward the last used descriptors
> if any, but all available descriptors will remain as available for
> qemu's VQ code.
>
> To skip it would miss those last rx descriptors in migration.
>
> Thanks!


So it's probably to not the best place to ask. It's more about the 
inflight descriptors so it should be TX instead of RX.

I can imagine the migration last phase, we should stop the vhost-vDPA 
before calling vhost_svq_stop(). Then we should be fine regardless of 
inflight descriptors.

Thanks


>
>> Thanks
>>
>>
>>> +
>>> +    for (unsigned i = 0; i < svq->vring.num; ++i) {
>>> +        g_autofree VirtQueueElement *elem = NULL;
>>> +        elem = g_steal_pointer(&svq->ring_id_maps[i]);
>>> +        if (elem) {
>>> +            virtqueue_detach_element(svq->vq, elem, elem->len);
>>> +        }
>>> +    }
>>> +
>>> +    next_avail_elem = g_steal_pointer(&svq->next_guest_avail_elem);
>>> +    if (next_avail_elem) {
>>> +        virtqueue_detach_element(svq->vq, next_avail_elem,
>>> +                                 next_avail_elem->len);
>>> +    }
>>>    }

Re: [PATCH 18/31] vhost: Shadow virtqueue buffers forwarding

Posted by Eugenio Perez Martin 3 years, 11 months ago

On Tue, Feb 8, 2022 at 9:16 AM Jason Wang <jasowang@redhat.com> wrote:
>
>
> 在 2022/2/1 下午7:25, Eugenio Perez Martin 写道:
> > On Sun, Jan 30, 2022 at 7:47 AM Jason Wang <jasowang@redhat.com> wrote:
> >>
> >> 在 2022/1/22 上午4:27, Eugenio Pérez 写道:
> >>> @@ -272,6 +590,28 @@ void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd)
> >>>    void vhost_svq_stop(VhostShadowVirtqueue *svq)
> >>>    {
> >>>        event_notifier_set_handler(&svq->svq_kick, NULL);
> >>> +    g_autofree VirtQueueElement *next_avail_elem = NULL;
> >>> +
> >>> +    if (!svq->vq) {
> >>> +        return;
> >>> +    }
> >>> +
> >>> +    /* Send all pending used descriptors to guest */
> >>> +    vhost_svq_flush(svq, false);
> >>
> >> Do we need to wait for all the pending descriptors to be completed here?
> >>
> > No, this function does not wait, it only completes the forwarding of
> > the *used* descriptors.
> >
> > The best example is the net rx queue in my opinion. This call will
> > check SVQ's vring used_idx and will forward the last used descriptors
> > if any, but all available descriptors will remain as available for
> > qemu's VQ code.
> >
> > To skip it would miss those last rx descriptors in migration.
> >
> > Thanks!
>
>
> So it's probably to not the best place to ask. It's more about the
> inflight descriptors so it should be TX instead of RX.
>
> I can imagine the migration last phase, we should stop the vhost-vDPA
> before calling vhost_svq_stop(). Then we should be fine regardless of
> inflight descriptors.
>

I think I'm still missing something here.

To be on the same page. Regarding tx this could cause repeated tx
frames (one at source and other at destination), but never a missed
buffer not transmitted. The "stop before" could be interpreted as "SVQ
is not forwarding available buffers anymore". Would that work?

Thanks!

> Thanks
>
>
> >
> >> Thanks
> >>
> >>
> >>> +
> >>> +    for (unsigned i = 0; i < svq->vring.num; ++i) {
> >>> +        g_autofree VirtQueueElement *elem = NULL;
> >>> +        elem = g_steal_pointer(&svq->ring_id_maps[i]);
> >>> +        if (elem) {
> >>> +            virtqueue_detach_element(svq->vq, elem, elem->len);
> >>> +        }
> >>> +    }
> >>> +
> >>> +    next_avail_elem = g_steal_pointer(&svq->next_guest_avail_elem);
> >>> +    if (next_avail_elem) {
> >>> +        virtqueue_detach_element(svq->vq, next_avail_elem,
> >>> +                                 next_avail_elem->len);
> >>> +    }
> >>>    }
>

Re: [PATCH 18/31] vhost: Shadow virtqueue buffers forwarding

Posted by Jason Wang 3 years, 11 months ago

在 2022/2/17 下午8:48, Eugenio Perez Martin 写道:
> On Tue, Feb 8, 2022 at 9:16 AM Jason Wang <jasowang@redhat.com> wrote:
>>
>> 在 2022/2/1 下午7:25, Eugenio Perez Martin 写道:
>>> On Sun, Jan 30, 2022 at 7:47 AM Jason Wang <jasowang@redhat.com> wrote:
>>>> 在 2022/1/22 上午4:27, Eugenio Pérez 写道:
>>>>> @@ -272,6 +590,28 @@ void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd)
>>>>>     void vhost_svq_stop(VhostShadowVirtqueue *svq)
>>>>>     {
>>>>>         event_notifier_set_handler(&svq->svq_kick, NULL);
>>>>> +    g_autofree VirtQueueElement *next_avail_elem = NULL;
>>>>> +
>>>>> +    if (!svq->vq) {
>>>>> +        return;
>>>>> +    }
>>>>> +
>>>>> +    /* Send all pending used descriptors to guest */
>>>>> +    vhost_svq_flush(svq, false);
>>>> Do we need to wait for all the pending descriptors to be completed here?
>>>>
>>> No, this function does not wait, it only completes the forwarding of
>>> the *used* descriptors.
>>>
>>> The best example is the net rx queue in my opinion. This call will
>>> check SVQ's vring used_idx and will forward the last used descriptors
>>> if any, but all available descriptors will remain as available for
>>> qemu's VQ code.
>>>
>>> To skip it would miss those last rx descriptors in migration.
>>>
>>> Thanks!
>>
>> So it's probably to not the best place to ask. It's more about the
>> inflight descriptors so it should be TX instead of RX.
>>
>> I can imagine the migration last phase, we should stop the vhost-vDPA
>> before calling vhost_svq_stop(). Then we should be fine regardless of
>> inflight descriptors.
>>
> I think I'm still missing something here.
>
> To be on the same page. Regarding tx this could cause repeated tx
> frames (one at source and other at destination), but never a missed
> buffer not transmitted. The "stop before" could be interpreted as "SVQ
> is not forwarding available buffers anymore". Would that work?


Right, but this only work if

1) a flush to make sure TX DMA for inflight descriptors are all completed

2) just mark all inflight descriptor used

Otherwise there could be buffers that is inflight forever.

Thanks


>
> Thanks!
>
>> Thanks
>>
>>
>>>> Thanks
>>>>
>>>>
>>>>> +
>>>>> +    for (unsigned i = 0; i < svq->vring.num; ++i) {
>>>>> +        g_autofree VirtQueueElement *elem = NULL;
>>>>> +        elem = g_steal_pointer(&svq->ring_id_maps[i]);
>>>>> +        if (elem) {
>>>>> +            virtqueue_detach_element(svq->vq, elem, elem->len);
>>>>> +        }
>>>>> +    }
>>>>> +
>>>>> +    next_avail_elem = g_steal_pointer(&svq->next_guest_avail_elem);
>>>>> +    if (next_avail_elem) {
>>>>> +        virtqueue_detach_element(svq->vq, next_avail_elem,
>>>>> +                                 next_avail_elem->len);
>>>>> +    }
>>>>>     }

Re: [PATCH 18/31] vhost: Shadow virtqueue buffers forwarding

Posted by Eugenio Perez Martin 3 years, 11 months ago

On Mon, Feb 21, 2022 at 8:44 AM Jason Wang <jasowang@redhat.com> wrote:
>
>
> 在 2022/2/17 下午8:48, Eugenio Perez Martin 写道:
> > On Tue, Feb 8, 2022 at 9:16 AM Jason Wang <jasowang@redhat.com> wrote:
> >>
> >> 在 2022/2/1 下午7:25, Eugenio Perez Martin 写道:
> >>> On Sun, Jan 30, 2022 at 7:47 AM Jason Wang <jasowang@redhat.com> wrote:
> >>>> 在 2022/1/22 上午4:27, Eugenio Pérez 写道:
> >>>>> @@ -272,6 +590,28 @@ void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd)
> >>>>>     void vhost_svq_stop(VhostShadowVirtqueue *svq)
> >>>>>     {
> >>>>>         event_notifier_set_handler(&svq->svq_kick, NULL);
> >>>>> +    g_autofree VirtQueueElement *next_avail_elem = NULL;
> >>>>> +
> >>>>> +    if (!svq->vq) {
> >>>>> +        return;
> >>>>> +    }
> >>>>> +
> >>>>> +    /* Send all pending used descriptors to guest */
> >>>>> +    vhost_svq_flush(svq, false);
> >>>> Do we need to wait for all the pending descriptors to be completed here?
> >>>>
> >>> No, this function does not wait, it only completes the forwarding of
> >>> the *used* descriptors.
> >>>
> >>> The best example is the net rx queue in my opinion. This call will
> >>> check SVQ's vring used_idx and will forward the last used descriptors
> >>> if any, but all available descriptors will remain as available for
> >>> qemu's VQ code.
> >>>
> >>> To skip it would miss those last rx descriptors in migration.
> >>>
> >>> Thanks!
> >>
> >> So it's probably to not the best place to ask. It's more about the
> >> inflight descriptors so it should be TX instead of RX.
> >>
> >> I can imagine the migration last phase, we should stop the vhost-vDPA
> >> before calling vhost_svq_stop(). Then we should be fine regardless of
> >> inflight descriptors.
> >>
> > I think I'm still missing something here.
> >
> > To be on the same page. Regarding tx this could cause repeated tx
> > frames (one at source and other at destination), but never a missed
> > buffer not transmitted. The "stop before" could be interpreted as "SVQ
> > is not forwarding available buffers anymore". Would that work?
>
>
> Right, but this only work if
>
> 1) a flush to make sure TX DMA for inflight descriptors are all completed
>
> 2) just mark all inflight descriptor used
>

It currently trusts on the reverse: Buffers not marked as used (by the
device) will be available in the destination, so expect
retransmissions.

Thanks!

> Otherwise there could be buffers that is inflight forever.
>
> Thanks
>
>
> >
> > Thanks!
> >
> >> Thanks
> >>
> >>
> >>>> Thanks
> >>>>
> >>>>
> >>>>> +
> >>>>> +    for (unsigned i = 0; i < svq->vring.num; ++i) {
> >>>>> +        g_autofree VirtQueueElement *elem = NULL;
> >>>>> +        elem = g_steal_pointer(&svq->ring_id_maps[i]);
> >>>>> +        if (elem) {
> >>>>> +            virtqueue_detach_element(svq->vq, elem, elem->len);
> >>>>> +        }
> >>>>> +    }
> >>>>> +
> >>>>> +    next_avail_elem = g_steal_pointer(&svq->next_guest_avail_elem);
> >>>>> +    if (next_avail_elem) {
> >>>>> +        virtqueue_detach_element(svq->vq, next_avail_elem,
> >>>>> +                                 next_avail_elem->len);
> >>>>> +    }
> >>>>>     }
>

Re: [PATCH 18/31] vhost: Shadow virtqueue buffers forwarding

Posted by Jason Wang 3 years, 11 months ago

在 2022/2/21 下午4:15, Eugenio Perez Martin 写道:
> On Mon, Feb 21, 2022 at 8:44 AM Jason Wang <jasowang@redhat.com> wrote:
>>
>> 在 2022/2/17 下午8:48, Eugenio Perez Martin 写道:
>>> On Tue, Feb 8, 2022 at 9:16 AM Jason Wang <jasowang@redhat.com> wrote:
>>>> 在 2022/2/1 下午7:25, Eugenio Perez Martin 写道:
>>>>> On Sun, Jan 30, 2022 at 7:47 AM Jason Wang <jasowang@redhat.com> wrote:
>>>>>> 在 2022/1/22 上午4:27, Eugenio Pérez 写道:
>>>>>>> @@ -272,6 +590,28 @@ void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd)
>>>>>>>      void vhost_svq_stop(VhostShadowVirtqueue *svq)
>>>>>>>      {
>>>>>>>          event_notifier_set_handler(&svq->svq_kick, NULL);
>>>>>>> +    g_autofree VirtQueueElement *next_avail_elem = NULL;
>>>>>>> +
>>>>>>> +    if (!svq->vq) {
>>>>>>> +        return;
>>>>>>> +    }
>>>>>>> +
>>>>>>> +    /* Send all pending used descriptors to guest */
>>>>>>> +    vhost_svq_flush(svq, false);
>>>>>> Do we need to wait for all the pending descriptors to be completed here?
>>>>>>
>>>>> No, this function does not wait, it only completes the forwarding of
>>>>> the *used* descriptors.
>>>>>
>>>>> The best example is the net rx queue in my opinion. This call will
>>>>> check SVQ's vring used_idx and will forward the last used descriptors
>>>>> if any, but all available descriptors will remain as available for
>>>>> qemu's VQ code.
>>>>>
>>>>> To skip it would miss those last rx descriptors in migration.
>>>>>
>>>>> Thanks!
>>>> So it's probably to not the best place to ask. It's more about the
>>>> inflight descriptors so it should be TX instead of RX.
>>>>
>>>> I can imagine the migration last phase, we should stop the vhost-vDPA
>>>> before calling vhost_svq_stop(). Then we should be fine regardless of
>>>> inflight descriptors.
>>>>
>>> I think I'm still missing something here.
>>>
>>> To be on the same page. Regarding tx this could cause repeated tx
>>> frames (one at source and other at destination), but never a missed
>>> buffer not transmitted. The "stop before" could be interpreted as "SVQ
>>> is not forwarding available buffers anymore". Would that work?
>>
>> Right, but this only work if
>>
>> 1) a flush to make sure TX DMA for inflight descriptors are all completed
>>
>> 2) just mark all inflight descriptor used
>>
> It currently trusts on the reverse: Buffers not marked as used (by the
> device) will be available in the destination, so expect
> retransmissions.


I may miss something but I think we do migrate last_avail_idx. So there 
won't be a re-transmission, since we depend on qemu virtqueue code to 
deal with vring base?

Thanks


>
> Thanks!
>
>> Otherwise there could be buffers that is inflight forever.
>>
>> Thanks
>>
>>
>>> Thanks!
>>>
>>>> Thanks
>>>>
>>>>
>>>>>> Thanks
>>>>>>
>>>>>>
>>>>>>> +
>>>>>>> +    for (unsigned i = 0; i < svq->vring.num; ++i) {
>>>>>>> +        g_autofree VirtQueueElement *elem = NULL;
>>>>>>> +        elem = g_steal_pointer(&svq->ring_id_maps[i]);
>>>>>>> +        if (elem) {
>>>>>>> +            virtqueue_detach_element(svq->vq, elem, elem->len);
>>>>>>> +        }
>>>>>>> +    }
>>>>>>> +
>>>>>>> +    next_avail_elem = g_steal_pointer(&svq->next_guest_avail_elem);
>>>>>>> +    if (next_avail_elem) {
>>>>>>> +        virtqueue_detach_element(svq->vq, next_avail_elem,
>>>>>>> +                                 next_avail_elem->len);
>>>>>>> +    }
>>>>>>>      }

Re: [PATCH 18/31] vhost: Shadow virtqueue buffers forwarding

Posted by Eugenio Perez Martin 3 years, 11 months ago

On Tue, Feb 22, 2022 at 8:26 AM Jason Wang <jasowang@redhat.com> wrote:
>
>
> 在 2022/2/21 下午4:15, Eugenio Perez Martin 写道:
> > On Mon, Feb 21, 2022 at 8:44 AM Jason Wang <jasowang@redhat.com> wrote:
> >>
> >> 在 2022/2/17 下午8:48, Eugenio Perez Martin 写道:
> >>> On Tue, Feb 8, 2022 at 9:16 AM Jason Wang <jasowang@redhat.com> wrote:
> >>>> 在 2022/2/1 下午7:25, Eugenio Perez Martin 写道:
> >>>>> On Sun, Jan 30, 2022 at 7:47 AM Jason Wang <jasowang@redhat.com> wrote:
> >>>>>> 在 2022/1/22 上午4:27, Eugenio Pérez 写道:
> >>>>>>> @@ -272,6 +590,28 @@ void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd)
> >>>>>>>      void vhost_svq_stop(VhostShadowVirtqueue *svq)
> >>>>>>>      {
> >>>>>>>          event_notifier_set_handler(&svq->svq_kick, NULL);
> >>>>>>> +    g_autofree VirtQueueElement *next_avail_elem = NULL;
> >>>>>>> +
> >>>>>>> +    if (!svq->vq) {
> >>>>>>> +        return;
> >>>>>>> +    }
> >>>>>>> +
> >>>>>>> +    /* Send all pending used descriptors to guest */
> >>>>>>> +    vhost_svq_flush(svq, false);
> >>>>>> Do we need to wait for all the pending descriptors to be completed here?
> >>>>>>
> >>>>> No, this function does not wait, it only completes the forwarding of
> >>>>> the *used* descriptors.
> >>>>>
> >>>>> The best example is the net rx queue in my opinion. This call will
> >>>>> check SVQ's vring used_idx and will forward the last used descriptors
> >>>>> if any, but all available descriptors will remain as available for
> >>>>> qemu's VQ code.
> >>>>>
> >>>>> To skip it would miss those last rx descriptors in migration.
> >>>>>
> >>>>> Thanks!
> >>>> So it's probably to not the best place to ask. It's more about the
> >>>> inflight descriptors so it should be TX instead of RX.
> >>>>
> >>>> I can imagine the migration last phase, we should stop the vhost-vDPA
> >>>> before calling vhost_svq_stop(). Then we should be fine regardless of
> >>>> inflight descriptors.
> >>>>
> >>> I think I'm still missing something here.
> >>>
> >>> To be on the same page. Regarding tx this could cause repeated tx
> >>> frames (one at source and other at destination), but never a missed
> >>> buffer not transmitted. The "stop before" could be interpreted as "SVQ
> >>> is not forwarding available buffers anymore". Would that work?
> >>
> >> Right, but this only work if
> >>
> >> 1) a flush to make sure TX DMA for inflight descriptors are all completed
> >>
> >> 2) just mark all inflight descriptor used
> >>
> > It currently trusts on the reverse: Buffers not marked as used (by the
> > device) will be available in the destination, so expect
> > retransmissions.
>
>
> I may miss something but I think we do migrate last_avail_idx. So there
> won't be a re-transmission, since we depend on qemu virtqueue code to
> deal with vring base?
>

On stop, vhost_virtqueue_stop calls vhost_vdpa_get_vring_base. In SVQ
mode, it returns last_used_idx. After that, vhost.c code set VirtQueue
last_avail_idx == last_used_idx, and it's migrated after that if I'm
not wrong.

vhost kernel migrates last_avail_idx, but it makes rx buffers
available on-demand, unlike SVQ. So it does not need to unwind buffers
or anything like that. Because of how SVQ works with the rx queue,
this is not possible, since the destination will find no available
buffers for rx. And for tx you already have described the scenario.

In other words, we cannot see SVQ as a vhost device in that regard:
SVQ looks for total drain (as "make all guest's buffers available for
the device ASAP") vs the vhost device which can live with a lot of
available ones and it will use them on demand. Same problem as
masking. So the difference in behavior is justified in my opinion, and
it can be improved in the future with the vdpa in-flight descriptors.

If we restore the state that way in a virtio-net device, it will see
the available ones as expected, not as in-flight.

Another possibility is to transform all of these into in-flight ones,
but I feel it would create problems. Can we migrate all rx queues as
in-flight, with 0 bytes written? Is it worth it? I didn't investigate
that path too much, but I think the virtio-net emulated device does
not support that at the moment. If I'm not wrong, we should copy
something like the body of virtio_blk_load_device if we want to go
that route.

The current approach might be too net-centric, so let me know if this
behavior is unexpected or we can do better otherwise.

Thanks!

> Thanks
>
>
> >
> > Thanks!
> >
> >> Otherwise there could be buffers that is inflight forever.
> >>
> >> Thanks
> >>
> >>
> >>> Thanks!
> >>>
> >>>> Thanks
> >>>>
> >>>>
> >>>>>> Thanks
> >>>>>>
> >>>>>>
> >>>>>>> +
> >>>>>>> +    for (unsigned i = 0; i < svq->vring.num; ++i) {
> >>>>>>> +        g_autofree VirtQueueElement *elem = NULL;
> >>>>>>> +        elem = g_steal_pointer(&svq->ring_id_maps[i]);
> >>>>>>> +        if (elem) {
> >>>>>>> +            virtqueue_detach_element(svq->vq, elem, elem->len);
> >>>>>>> +        }
> >>>>>>> +    }
> >>>>>>> +
> >>>>>>> +    next_avail_elem = g_steal_pointer(&svq->next_guest_avail_elem);
> >>>>>>> +    if (next_avail_elem) {
> >>>>>>> +        virtqueue_detach_element(svq->vq, next_avail_elem,
> >>>>>>> +                                 next_avail_elem->len);
> >>>>>>> +    }
> >>>>>>>      }
>

Re: [PATCH 18/31] vhost: Shadow virtqueue buffers forwarding

Posted by Jason Wang 3 years, 11 months ago

On Tue, Feb 22, 2022 at 4:56 PM Eugenio Perez Martin
<eperezma@redhat.com> wrote:
>
> On Tue, Feb 22, 2022 at 8:26 AM Jason Wang <jasowang@redhat.com> wrote:
> >
> >
> > 在 2022/2/21 下午4:15, Eugenio Perez Martin 写道:
> > > On Mon, Feb 21, 2022 at 8:44 AM Jason Wang <jasowang@redhat.com> wrote:
> > >>
> > >> 在 2022/2/17 下午8:48, Eugenio Perez Martin 写道:
> > >>> On Tue, Feb 8, 2022 at 9:16 AM Jason Wang <jasowang@redhat.com> wrote:
> > >>>> 在 2022/2/1 下午7:25, Eugenio Perez Martin 写道:
> > >>>>> On Sun, Jan 30, 2022 at 7:47 AM Jason Wang <jasowang@redhat.com> wrote:
> > >>>>>> 在 2022/1/22 上午4:27, Eugenio Pérez 写道:
> > >>>>>>> @@ -272,6 +590,28 @@ void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd)
> > >>>>>>>      void vhost_svq_stop(VhostShadowVirtqueue *svq)
> > >>>>>>>      {
> > >>>>>>>          event_notifier_set_handler(&svq->svq_kick, NULL);
> > >>>>>>> +    g_autofree VirtQueueElement *next_avail_elem = NULL;
> > >>>>>>> +
> > >>>>>>> +    if (!svq->vq) {
> > >>>>>>> +        return;
> > >>>>>>> +    }
> > >>>>>>> +
> > >>>>>>> +    /* Send all pending used descriptors to guest */
> > >>>>>>> +    vhost_svq_flush(svq, false);
> > >>>>>> Do we need to wait for all the pending descriptors to be completed here?
> > >>>>>>
> > >>>>> No, this function does not wait, it only completes the forwarding of
> > >>>>> the *used* descriptors.
> > >>>>>
> > >>>>> The best example is the net rx queue in my opinion. This call will
> > >>>>> check SVQ's vring used_idx and will forward the last used descriptors
> > >>>>> if any, but all available descriptors will remain as available for
> > >>>>> qemu's VQ code.
> > >>>>>
> > >>>>> To skip it would miss those last rx descriptors in migration.
> > >>>>>
> > >>>>> Thanks!
> > >>>> So it's probably to not the best place to ask. It's more about the
> > >>>> inflight descriptors so it should be TX instead of RX.
> > >>>>
> > >>>> I can imagine the migration last phase, we should stop the vhost-vDPA
> > >>>> before calling vhost_svq_stop(). Then we should be fine regardless of
> > >>>> inflight descriptors.
> > >>>>
> > >>> I think I'm still missing something here.
> > >>>
> > >>> To be on the same page. Regarding tx this could cause repeated tx
> > >>> frames (one at source and other at destination), but never a missed
> > >>> buffer not transmitted. The "stop before" could be interpreted as "SVQ
> > >>> is not forwarding available buffers anymore". Would that work?
> > >>
> > >> Right, but this only work if
> > >>
> > >> 1) a flush to make sure TX DMA for inflight descriptors are all completed
> > >>
> > >> 2) just mark all inflight descriptor used
> > >>
> > > It currently trusts on the reverse: Buffers not marked as used (by the
> > > device) will be available in the destination, so expect
> > > retransmissions.
> >
> >
> > I may miss something but I think we do migrate last_avail_idx. So there
> > won't be a re-transmission, since we depend on qemu virtqueue code to
> > deal with vring base?
> >
>
> On stop, vhost_virtqueue_stop calls vhost_vdpa_get_vring_base. In SVQ
> mode, it returns last_used_idx. After that, vhost.c code set VirtQueue
> last_avail_idx == last_used_idx, and it's migrated after that if I'm
> not wrong.

Ok, I miss these details in the review. I suggest mentioning this in
the change log and add a comment in vhost_vdpa_get_vring_base().

>
> vhost kernel migrates last_avail_idx, but it makes rx buffers
> available on-demand, unlike SVQ. So it does not need to unwind buffers
> or anything like that. Because of how SVQ works with the rx queue,
> this is not possible, since the destination will find no available
> buffers for rx. And for tx you already have described the scenario.
>
> In other words, we cannot see SVQ as a vhost device in that regard:
> SVQ looks for total drain (as "make all guest's buffers available for
> the device ASAP") vs the vhost device which can live with a lot of
> available ones and it will use them on demand. Same problem as
> masking. So the difference in behavior is justified in my opinion, and
> it can be improved in the future with the vdpa in-flight descriptors.
>
> If we restore the state that way in a virtio-net device, it will see
> the available ones as expected, not as in-flight.
>
> Another possibility is to transform all of these into in-flight ones,
> but I feel it would create problems. Can we migrate all rx queues as
> in-flight, with 0 bytes written? Is it worth it?

To clarify, for inflight I meant from the device point of view, that
is [last_used_idx, last_avail_idx).

So for RX and SVQ, it should be as simple as stop forwarding buffers
since last_used_idx should be the same as last_avail_idx in this case.
(Though technically the rx buffer might be modified by the NIC).

> I didn't investigate
> that path too much, but I think the virtio-net emulated device does
> not support that at the moment. If I'm not wrong, we should copy
> something like the body of virtio_blk_load_device if we want to go
> that route.
>
> The current approach might be too net-centric, so let me know if this
> behavior is unexpected or we can do better otherwise.

It should be fine to start from a networking device. We can add more
in the future if it is needed.

Thanks

>
> Thanks!
>
> > Thanks
> >
> >
> > >
> > > Thanks!
> > >
> > >> Otherwise there could be buffers that is inflight forever.
> > >>
> > >> Thanks
> > >>
> > >>
> > >>> Thanks!
> > >>>
> > >>>> Thanks
> > >>>>
> > >>>>
> > >>>>>> Thanks
> > >>>>>>
> > >>>>>>
> > >>>>>>> +
> > >>>>>>> +    for (unsigned i = 0; i < svq->vring.num; ++i) {
> > >>>>>>> +        g_autofree VirtQueueElement *elem = NULL;
> > >>>>>>> +        elem = g_steal_pointer(&svq->ring_id_maps[i]);
> > >>>>>>> +        if (elem) {
> > >>>>>>> +            virtqueue_detach_element(svq->vq, elem, elem->len);
> > >>>>>>> +        }
> > >>>>>>> +    }
> > >>>>>>> +
> > >>>>>>> +    next_avail_elem = g_steal_pointer(&svq->next_guest_avail_elem);
> > >>>>>>> +    if (next_avail_elem) {
> > >>>>>>> +        virtqueue_detach_element(svq->vq, next_avail_elem,
> > >>>>>>> +                                 next_avail_elem->len);
> > >>>>>>> +    }
> > >>>>>>>      }
> >
>