[RFC PATCH v4 20/20] vdpa: Add custom IOTLB translations to SVQ

Eugenio Pérez posted 20 patches 4 years, 4 months ago
[RFC PATCH v4 20/20] vdpa: Add custom IOTLB translations to SVQ
Posted by Eugenio Pérez 4 years, 4 months ago
Use translations added in VhostIOVATree in SVQ.

Now every element needs to store the previous address also, so VirtQueue
can consume the elements properly. This adds a little overhead per VQ
element, having to allocate more memory to stash them. As a possible
optimization, this allocation could be avoided if the descriptor is not
a chain but a single one, but this is left undone.

TODO: The iova range should be queried beforehand, and logic should be added
to fail when the memory listener or SVQ tries to add a GPA outside of it.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
---
 hw/virtio/vhost-shadow-virtqueue.h |   4 +-
 hw/virtio/vhost-shadow-virtqueue.c | 130 ++++++++++++++++++++++++-----
 hw/virtio/vhost-vdpa.c             |  40 ++++++++-
 hw/virtio/trace-events             |   1 +
 4 files changed, 152 insertions(+), 23 deletions(-)

diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
index b7baa424a7..a0e6b5267a 100644
--- a/hw/virtio/vhost-shadow-virtqueue.h
+++ b/hw/virtio/vhost-shadow-virtqueue.h
@@ -11,6 +11,7 @@
 #define VHOST_SHADOW_VIRTQUEUE_H
 
 #include "hw/virtio/vhost.h"
+#include "hw/virtio/vhost-iova-tree.h"
 
 typedef struct VhostShadowVirtqueue VhostShadowVirtqueue;
 
@@ -28,7 +29,8 @@ bool vhost_svq_start(struct vhost_dev *dev, unsigned idx,
 void vhost_svq_stop(struct vhost_dev *dev, unsigned idx,
                     VhostShadowVirtqueue *svq);
 
-VhostShadowVirtqueue *vhost_svq_new(struct vhost_dev *dev, int idx);
+VhostShadowVirtqueue *vhost_svq_new(struct vhost_dev *dev, int idx,
+                                    VhostIOVATree *iova_map);
 
 void vhost_svq_free(VhostShadowVirtqueue *vq);
 
diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
index 2fd0bab75d..9db538547e 100644
--- a/hw/virtio/vhost-shadow-virtqueue.c
+++ b/hw/virtio/vhost-shadow-virtqueue.c
@@ -11,12 +11,19 @@
 #include "hw/virtio/vhost-shadow-virtqueue.h"
 #include "hw/virtio/vhost.h"
 #include "hw/virtio/virtio-access.h"
+#include "hw/virtio/vhost-iova-tree.h"
 
 #include "standard-headers/linux/vhost_types.h"
 
 #include "qemu/error-report.h"
 #include "qemu/main-loop.h"
 
+typedef struct SVQElement {
+    VirtQueueElement elem;
+    void **in_sg_stash;
+    void **out_sg_stash;
+} SVQElement;
+
 /* Shadow virtqueue to relay notifications */
 typedef struct VhostShadowVirtqueue {
     /* Shadow vring */
@@ -46,8 +53,11 @@ typedef struct VhostShadowVirtqueue {
     /* Virtio device */
     VirtIODevice *vdev;
 
+    /* IOVA mapping if used */
+    VhostIOVATree *iova_map;
+
     /* Map for returning guest's descriptors */
-    VirtQueueElement **ring_id_maps;
+    SVQElement **ring_id_maps;
 
     /* Next head to expose to device */
     uint16_t avail_idx_shadow;
@@ -79,13 +89,6 @@ bool vhost_svq_valid_device_features(uint64_t *dev_features)
             continue;
 
         case VIRTIO_F_ACCESS_PLATFORM:
-            /* SVQ needs this feature disabled. Can't continue */
-            if (*dev_features & BIT_ULL(b)) {
-                clear_bit(b, dev_features);
-                r = false;
-            }
-            break;
-
         case VIRTIO_F_VERSION_1:
             /* SVQ needs this feature, so can't continue */
             if (!(*dev_features & BIT_ULL(b))) {
@@ -126,6 +129,64 @@ static void vhost_svq_set_notification(VhostShadowVirtqueue *svq, bool enable)
     }
 }
 
+static void vhost_svq_stash_addr(void ***stash, const struct iovec *iov,
+                                 size_t num)
+{
+    size_t i;
+
+    if (num == 0) {
+        return;
+    }
+
+    *stash = g_new(void *, num);
+    for (i = 0; i < num; ++i) {
+        (*stash)[i] = iov[i].iov_base;
+    }
+}
+
+static void vhost_svq_unstash_addr(void **stash, struct iovec *iov, size_t num)
+{
+    size_t i;
+
+    if (num == 0) {
+        return;
+    }
+
+    for (i = 0; i < num; ++i) {
+        iov[i].iov_base = stash[i];
+    }
+    g_free(stash);
+}
+
+static void vhost_svq_translate_addr(const VhostShadowVirtqueue *svq,
+                                     struct iovec *iovec, size_t num)
+{
+    size_t i;
+
+    for (i = 0; i < num; ++i) {
+        VhostDMAMap needle = {
+            .translated_addr = iovec[i].iov_base,
+            .size = iovec[i].iov_len,
+        };
+        size_t off;
+
+        const VhostDMAMap *map = vhost_iova_tree_find_iova(svq->iova_map,
+                                                           &needle);
+        /*
+         * Map cannot be NULL since iova map contains all guest space and
+         * qemu already has a physical address mapped
+         */
+        assert(map);
+
+        /*
+         * Map->iova chunk size is ignored. What to do if descriptor
+         * (addr, size) does not fit is delegated to the device.
+         */
+        off = needle.translated_addr - map->translated_addr;
+        iovec[i].iov_base = (void *)(map->iova + off);
+    }
+}
+
 static void vhost_vring_write_descs(VhostShadowVirtqueue *svq,
                                     const struct iovec *iovec,
                                     size_t num, bool more_descs, bool write)
@@ -156,8 +217,9 @@ static void vhost_vring_write_descs(VhostShadowVirtqueue *svq,
 }
 
 static unsigned vhost_svq_add_split(VhostShadowVirtqueue *svq,
-                                    VirtQueueElement *elem)
+                                    SVQElement *svq_elem)
 {
+    VirtQueueElement *elem = &svq_elem->elem;
     int head;
     unsigned avail_idx;
     vring_avail_t *avail = svq->vring.avail;
@@ -167,6 +229,12 @@ static unsigned vhost_svq_add_split(VhostShadowVirtqueue *svq,
     /* We need some descriptors here */
     assert(elem->out_num || elem->in_num);
 
+    vhost_svq_stash_addr(&svq_elem->in_sg_stash, elem->in_sg, elem->in_num);
+    vhost_svq_stash_addr(&svq_elem->out_sg_stash, elem->out_sg, elem->out_num);
+
+    vhost_svq_translate_addr(svq, elem->in_sg, elem->in_num);
+    vhost_svq_translate_addr(svq, elem->out_sg, elem->out_num);
+
     vhost_vring_write_descs(svq, elem->out_sg, elem->out_num,
                             elem->in_num > 0, false);
     vhost_vring_write_descs(svq, elem->in_sg, elem->in_num, false, true);
@@ -187,7 +255,7 @@ static unsigned vhost_svq_add_split(VhostShadowVirtqueue *svq,
 
 }
 
-static void vhost_svq_add(VhostShadowVirtqueue *svq, VirtQueueElement *elem)
+static void vhost_svq_add(VhostShadowVirtqueue *svq, SVQElement *elem)
 {
     unsigned qemu_head = vhost_svq_add_split(svq, elem);
 
@@ -221,7 +289,7 @@ static void vhost_handle_guest_kick(EventNotifier *n)
         }
 
         while (true) {
-            VirtQueueElement *elem = virtqueue_pop(svq->vq, sizeof(*elem));
+            SVQElement *elem = virtqueue_pop(svq->vq, sizeof(*elem));
             if (!elem) {
                 break;
             }
@@ -247,7 +315,7 @@ static bool vhost_svq_more_used(VhostShadowVirtqueue *svq)
     return svq->used_idx != svq->shadow_used_idx;
 }
 
-static VirtQueueElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq)
+static SVQElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq)
 {
     vring_desc_t *descs = svq->vring.desc;
     const vring_used_t *used = svq->vring.used;
@@ -279,7 +347,7 @@ static VirtQueueElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq)
     descs[used_elem.id].next = svq->free_head;
     svq->free_head = used_elem.id;
 
-    svq->ring_id_maps[used_elem.id]->len = used_elem.len;
+    svq->ring_id_maps[used_elem.id]->elem.len = used_elem.len;
     return g_steal_pointer(&svq->ring_id_maps[used_elem.id]);
 }
 
@@ -296,12 +364,19 @@ static void vhost_svq_handle_call_no_test(EventNotifier *n)
 
         vhost_svq_set_notification(svq, false);
         while (true) {
-            g_autofree VirtQueueElement *elem = vhost_svq_get_buf(svq);
-            if (!elem) {
+            g_autofree SVQElement *svq_elem = vhost_svq_get_buf(svq);
+            VirtQueueElement *elem;
+            if (!svq_elem) {
                 break;
             }
 
             assert(i < svq->vring.num);
+            elem = &svq_elem->elem;
+
+            vhost_svq_unstash_addr(svq_elem->in_sg_stash, elem->in_sg,
+                                   elem->in_num);
+            vhost_svq_unstash_addr(svq_elem->out_sg_stash, elem->out_sg,
+                                   elem->out_num);
             virtqueue_fill(vq, elem, elem->len, i++);
         }
 
@@ -451,14 +526,24 @@ void vhost_svq_stop(struct vhost_dev *dev, unsigned idx,
     event_notifier_set_handler(&svq->host_notifier, NULL);
 
     for (i = 0; i < svq->vring.num; ++i) {
-        g_autofree VirtQueueElement *elem = svq->ring_id_maps[i];
+        g_autofree SVQElement *svq_elem = svq->ring_id_maps[i];
+        VirtQueueElement *elem;
+
+        if (!svq_elem) {
+            continue;
+        }
+
+        elem = &svq_elem->elem;
+        vhost_svq_unstash_addr(svq_elem->in_sg_stash, elem->in_sg,
+                               elem->in_num);
+        vhost_svq_unstash_addr(svq_elem->out_sg_stash, elem->out_sg,
+                               elem->out_num);
+
         /*
          * Although the doc says we must unpop in order, it's ok to unpop
          * everything.
          */
-        if (elem) {
-            virtqueue_unpop(svq->vq, elem, elem->len);
-        }
+        virtqueue_unpop(svq->vq, elem, elem->len);
     }
 }
 
@@ -466,7 +551,8 @@ void vhost_svq_stop(struct vhost_dev *dev, unsigned idx,
  * Creates vhost shadow virtqueue, and instruct vhost device to use the shadow
  * methods and file descriptors.
  */
-VhostShadowVirtqueue *vhost_svq_new(struct vhost_dev *dev, int idx)
+VhostShadowVirtqueue *vhost_svq_new(struct vhost_dev *dev, int idx,
+                                    VhostIOVATree *iova_map)
 {
     int vq_idx = dev->vq_index + idx;
     unsigned num = virtio_queue_get_num(dev->vdev, vq_idx);
@@ -500,11 +586,13 @@ VhostShadowVirtqueue *vhost_svq_new(struct vhost_dev *dev, int idx)
     memset(svq->vring.desc, 0, driver_size);
     svq->vring.used = qemu_memalign(qemu_real_host_page_size, device_size);
     memset(svq->vring.used, 0, device_size);
+    svq->iova_map = iova_map;
+
     for (i = 0; i < num - 1; i++) {
         svq->vring.desc[i].next = cpu_to_le16(i + 1);
     }
 
-    svq->ring_id_maps = g_new0(VirtQueueElement *, num);
+    svq->ring_id_maps = g_new0(SVQElement *, num);
     event_notifier_set_handler(&svq->call_notifier,
                                vhost_svq_handle_call);
     return g_steal_pointer(&svq);
diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index a9c680b487..f5a12fee9d 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -176,6 +176,18 @@ static void vhost_vdpa_listener_region_add(MemoryListener *listener,
                                          vaddr, section->readonly);
 
     llsize = int128_sub(llend, int128_make64(iova));
+    if (v->shadow_vqs_enabled) {
+        VhostDMAMap mem_region = {
+            .translated_addr = vaddr,
+            .size = int128_get64(llsize) - 1,
+            .perm = IOMMU_ACCESS_FLAG(true, section->readonly),
+        };
+
+        int r = vhost_iova_tree_alloc(v->iova_map, &mem_region);
+        assert(r == VHOST_DMA_MAP_OK);
+
+        iova = mem_region.iova;
+    }
 
     ret = vhost_vdpa_dma_map(v, iova, int128_get64(llsize),
                              vaddr, section->readonly);
@@ -754,6 +766,23 @@ static bool  vhost_vdpa_force_iommu(struct vhost_dev *dev)
     return true;
 }
 
+static int vhost_vdpa_get_iova_range(struct vhost_dev *dev,
+                                     hwaddr *first, hwaddr *last)
+{
+    int ret;
+    struct vhost_vdpa_iova_range range;
+
+    ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_IOVA_RANGE, &range);
+    if (ret != 0) {
+        return ret;
+    }
+
+    *first = range.first;
+    *last = range.last;
+    trace_vhost_vdpa_get_iova_range(dev, *first, *last);
+    return ret;
+}
+
 /**
  * Maps QEMU vaddr memory to device in a suitable way for shadow virtqueue:
  * - It always reference qemu memory address, not guest's memory.
@@ -881,6 +910,7 @@ static bool vhost_vdpa_svq_start_vq(struct vhost_dev *dev, unsigned idx)
 static unsigned vhost_vdpa_enable_svq(struct vhost_vdpa *v, bool enable)
 {
     struct vhost_dev *hdev = v->dev;
+    hwaddr iova_first, iova_last;
     unsigned n;
     int r;
 
@@ -894,7 +924,7 @@ static unsigned vhost_vdpa_enable_svq(struct vhost_vdpa *v, bool enable)
         /* Allocate resources */
         assert(v->shadow_vqs->len == 0);
         for (n = 0; n < hdev->nvqs; ++n) {
-            VhostShadowVirtqueue *svq = vhost_svq_new(hdev, n);
+            VhostShadowVirtqueue *svq = vhost_svq_new(hdev, n, v->iova_map);
             if (unlikely(!svq)) {
                 g_ptr_array_set_size(v->shadow_vqs, 0);
                 return 0;
@@ -903,6 +933,8 @@ static unsigned vhost_vdpa_enable_svq(struct vhost_vdpa *v, bool enable)
         }
     }
 
+    r = vhost_vdpa_get_iova_range(hdev, &iova_first, &iova_last);
+    assert(r == 0);
     r = vhost_vdpa_vring_pause(hdev);
     assert(r == 0);
 
@@ -913,6 +945,12 @@ static unsigned vhost_vdpa_enable_svq(struct vhost_vdpa *v, bool enable)
         }
     }
 
+    memory_listener_unregister(&v->listener);
+    if (vhost_vdpa_dma_unmap(v, iova_first,
+                             (iova_last - iova_first) & TARGET_PAGE_MASK)) {
+        error_report("Fail to invalidate device iotlb");
+    }
+
     /* Reset device so it can be configured */
     r = vhost_vdpa_dev_start(hdev, false);
     assert(r == 0);
diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events
index 8ed19e9d0c..650e521e35 100644
--- a/hw/virtio/trace-events
+++ b/hw/virtio/trace-events
@@ -52,6 +52,7 @@ vhost_vdpa_set_vring_call(void *dev, unsigned int index, int fd) "dev: %p index:
 vhost_vdpa_get_features(void *dev, uint64_t features) "dev: %p features: 0x%"PRIx64
 vhost_vdpa_set_owner(void *dev) "dev: %p"
 vhost_vdpa_vq_get_addr(void *dev, void *vq, uint64_t desc_user_addr, uint64_t avail_user_addr, uint64_t used_user_addr) "dev: %p vq: %p desc_user_addr: 0x%"PRIx64" avail_user_addr: 0x%"PRIx64" used_user_addr: 0x%"PRIx64
+vhost_vdpa_get_iova_range(void *dev, uint64_t first, uint64_t last) "dev: %p first: 0x%"PRIx64" last: 0x%"PRIx64
 
 # virtio.c
 virtqueue_alloc_element(void *elem, size_t sz, unsigned in_num, unsigned out_num) "elem %p size %zd in_num %u out_num %u"
-- 
2.27.0


Re: [RFC PATCH v4 20/20] vdpa: Add custom IOTLB translations to SVQ
Posted by Jason Wang 4 years, 3 months ago
On 2021/10/1 at 3:06 PM, Eugenio Pérez wrote:
> Use translations added in VhostIOVATree in SVQ.
>
> Now every element needs to store the previous address also, so VirtQueue
> can consume the elements properly. This adds a little overhead per VQ
> element, having to allocate more memory to stash them. As a possible
> optimization, this allocation could be avoided if the descriptor is not
> a chain but a single one, but this is left undone.
>
> TODO: iova range should be queried before, and add logic to fail when
> GPA is outside of its range and memory listener or svq add it.
>
> Signed-off-by: Eugenio Pérez<eperezma@redhat.com>
> ---
>   hw/virtio/vhost-shadow-virtqueue.h |   4 +-
>   hw/virtio/vhost-shadow-virtqueue.c | 130 ++++++++++++++++++++++++-----
>   hw/virtio/vhost-vdpa.c             |  40 ++++++++-
>   hw/virtio/trace-events             |   1 +
>   4 files changed, 152 insertions(+), 23 deletions(-)


Thinking hard about the whole logic: this is safe since the qemu memory map 
will fail if the guest submits an invalid IOVA.

Then I wonder if we could do something much simpler:

1) Use qemu VA as IOVA, but only map the VA that belongs to the guest
2) Then we don't need any IOVA tree here; what we need is to just map the
vring and use qemu VA without any translation

Thanks


Re: [RFC PATCH v4 20/20] vdpa: Add custom IOTLB translations to SVQ
Posted by Eugenio Perez Martin 4 years, 3 months ago
On Wed, Oct 13, 2021 at 7:34 AM Jason Wang <jasowang@redhat.com> wrote:
>
>
> On 2021/10/1 at 3:06 PM, Eugenio Pérez wrote:
> > Use translations added in VhostIOVATree in SVQ.
> >
> > Now every element needs to store the previous address also, so VirtQueue
> > can consume the elements properly. This adds a little overhead per VQ
> > element, having to allocate more memory to stash them. As a possible
> > optimization, this allocation could be avoided if the descriptor is not
> > a chain but a single one, but this is left undone.
> >
> > TODO: iova range should be queried before, and add logic to fail when
> > GPA is outside of its range and memory listener or svq add it.
> >
> > Signed-off-by: Eugenio Pérez<eperezma@redhat.com>
> > ---
> >   hw/virtio/vhost-shadow-virtqueue.h |   4 +-
> >   hw/virtio/vhost-shadow-virtqueue.c | 130 ++++++++++++++++++++++++-----
> >   hw/virtio/vhost-vdpa.c             |  40 ++++++++-
> >   hw/virtio/trace-events             |   1 +
> >   4 files changed, 152 insertions(+), 23 deletions(-)
>
>
> Think hard about the whole logic. This is safe since qemu memory map
> will fail if guest submits a invalidate IOVA.
>

Can you expand on this? What you mean is that VirtQueue already
protects SVQ code if the guest sets an invalid buffer address (GPA),
isn't it?

> Then I wonder if we do something much more simpler:
>
> 1) Using qemu VA as IOVA but only maps the VA that belongs to guest
> 2) Then we don't need any IOVA tree here, what we need is to just map
> vring and use qemu VA without any translation
>

That would be great, but in my test either qemu's SVQ vring or the guest
buffers' translated addresses (in qemu VA form) were already at high
addresses, outside of the device's iova range.

I didn't try remapping tricks to make them fit in the range, but I
think it does complicate the solution relatively fast if there was
already memory in that range owned by qemu before enabling SVQ:

* Guest memory must be contiguous in VA address space, but it "must"
support hotplug/unplug (although vDPA currently pins it). Hotplug
memory could always overlap with SVQ vring, so we would need to move
it.
* Duplicating mapped memory for writing? (Not sure if guest memory is
actually movable in qemu).
* Indirect descriptors will need to allocate and free memory more or
less frequently, increasing the possibility of overlapping.

If we can move guest memory, however, I can see how we can track it in
a tree *but* mark when the tree is 1:1 with qemu's VA, so buffer
forwarding does not take the translation penalty. When guest memory
cannot be mapped 1:1, we can resort to the tree, and come back to 1:1
translation if the offending tree node(s) get deleted.
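
Something like the following rough sketch is what I have in mind (the
flag and the helper are made up; the helper would be the tree lookup
loop already present in this patch):

/*
 * Rough sketch of the two translation modes. iova_map_is_identity and
 * vhost_svq_translate_addr_tree() are hypothetical: the flag would be
 * kept up to date by the memory listener, and the helper would be the
 * lookup loop this patch already adds.
 */
static void vhost_svq_translate_addr(const VhostShadowVirtqueue *svq,
                                     struct iovec *iovec, size_t num)
{
    if (svq->iova_map_is_identity) {
        /* qemu VA == device IOVA: no per-buffer translation penalty */
        return;
    }

    /* Some guest memory could not be mapped 1:1: resort to the tree */
    vhost_svq_translate_addr_tree(svq, iovec, num);
}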

However I think this puts the solution a little bit farther than
"starting simple" :).

Does it make sense?

Thanks!

> Thanks
>


Re: [RFC PATCH v4 20/20] vdpa: Add custom IOTLB translations to SVQ
Posted by Jason Wang 4 years, 3 months ago
On Fri, Oct 15, 2021 at 3:28 PM Eugenio Perez Martin
<eperezma@redhat.com> wrote:
>
> On Wed, Oct 13, 2021 at 7:34 AM Jason Wang <jasowang@redhat.com> wrote:
> >
> >
> > On 2021/10/1 at 3:06 PM, Eugenio Pérez wrote:
> > > Use translations added in VhostIOVATree in SVQ.
> > >
> > > Now every element needs to store the previous address also, so VirtQueue
> > > can consume the elements properly. This adds a little overhead per VQ
> > > element, having to allocate more memory to stash them. As a possible
> > > optimization, this allocation could be avoided if the descriptor is not
> > > a chain but a single one, but this is left undone.
> > >
> > > TODO: iova range should be queried before, and add logic to fail when
> > > GPA is outside of its range and memory listener or svq add it.
> > >
> > > Signed-off-by: Eugenio Pérez<eperezma@redhat.com>
> > > ---
> > >   hw/virtio/vhost-shadow-virtqueue.h |   4 +-
> > >   hw/virtio/vhost-shadow-virtqueue.c | 130 ++++++++++++++++++++++++-----
> > >   hw/virtio/vhost-vdpa.c             |  40 ++++++++-
> > >   hw/virtio/trace-events             |   1 +
> > >   4 files changed, 152 insertions(+), 23 deletions(-)
> >
> >
> > Think hard about the whole logic. This is safe since qemu memory map
> > will fail if guest submits a invalidate IOVA.
> >
>
> Can you expand on this? What you mean is that VirtQueue already
> protects SVQ code if the guest sets an invalid buffer address (GPA),
> isn't it?

Yes.

>
> > Then I wonder if we do something much more simpler:
> >
> > 1) Using qemu VA as IOVA but only maps the VA that belongs to guest
> > 2) Then we don't need any IOVA tree here, what we need is to just map
> > vring and use qemu VA without any translation
> >
>
> That would be great, but either qemu's SVQ vring or guest translated
> buffers address (in qemu VA form) were already in high addresses,
> outside of the device's iova range (in my test).

You're right. I missed that, and that's why we need e.g. the iova tree and allocator.

What I proposed only makes sense when shared virtual memory (SVA) is
implemented. In the case of SVA, the valid iova range should be the
full VA range.

>
> I didn't try remapping tricks to make them fit in the range, but I
> think it does complicate the solution relatively fast if there was
> already memory in that range owned by qemu before enabling SVQ:
>
> * Guest memory must be contiguous in VA address space, but it "must"
> support hotplug/unplug (although vDPA currently pins it). Hotplug
> memory could always overlap with SVQ vring, so we would need to move
> it.
> * Duplicating mapped memory for writing? (Not sure if guest memory is
> actually movable in qemu).
> * Indirect descriptors will need to allocate and free memory more or
> less frequently, increasing the possibility of overlapping.

I'm not sure I get the problem, but overlapping is not an issue since
we're using VA.

>
> If we can move guest memory,

I'm not sure we can do this, or at least it looks very tricky.

> however, I can see how we can track it in
> a tree *but* mark when the tree is 1:1 with qemu's VA, so buffers
> forwarding does not take the translation penalty. When guest memory
> cannot be map 1:1, we can resort to tree, and come back to 1:1
> translation if the offending tree node(s) get deleted.
>
> However I think this puts the solution a little bit farther than
> "starting simple" :).
>
> Does it make sense?

Yes. So I think I will review the IOVA tree code and get back to you.

Thanks

>
> Thanks!
>
> > Thanks
> >
>


Re: [RFC PATCH v4 20/20] vdpa: Add custom IOTLB translations to SVQ
Posted by Eugenio Perez Martin 4 years, 3 months ago
On Fri, Oct 15, 2021 at 9:37 AM Jason Wang <jasowang@redhat.com> wrote:
>
> On Fri, Oct 15, 2021 at 3:28 PM Eugenio Perez Martin
> <eperezma@redhat.com> wrote:
> >
> > On Wed, Oct 13, 2021 at 7:34 AM Jason Wang <jasowang@redhat.com> wrote:
> > >
> > >
> > > On 2021/10/1 at 3:06 PM, Eugenio Pérez wrote:
> > > > Use translations added in VhostIOVATree in SVQ.
> > > >
> > > > Now every element needs to store the previous address also, so VirtQueue
> > > > can consume the elements properly. This adds a little overhead per VQ
> > > > element, having to allocate more memory to stash them. As a possible
> > > > optimization, this allocation could be avoided if the descriptor is not
> > > > a chain but a single one, but this is left undone.
> > > >
> > > > TODO: iova range should be queried before, and add logic to fail when
> > > > GPA is outside of its range and memory listener or svq add it.
> > > >
> > > > Signed-off-by: Eugenio Pérez<eperezma@redhat.com>
> > > > ---
> > > >   hw/virtio/vhost-shadow-virtqueue.h |   4 +-
> > > >   hw/virtio/vhost-shadow-virtqueue.c | 130 ++++++++++++++++++++++++-----
> > > >   hw/virtio/vhost-vdpa.c             |  40 ++++++++-
> > > >   hw/virtio/trace-events             |   1 +
> > > >   4 files changed, 152 insertions(+), 23 deletions(-)
> > >
> > >
> > > Think hard about the whole logic. This is safe since qemu memory map
> > > will fail if guest submits a invalidate IOVA.
> > >
> >
> > Can you expand on this? What you mean is that VirtQueue already
> > protects SVQ code if the guest sets an invalid buffer address (GPA),
> > isn't it?
>
> Yes.
>
> >
> > > Then I wonder if we do something much more simpler:
> > >
> > > 1) Using qemu VA as IOVA but only maps the VA that belongs to guest
> > > 2) Then we don't need any IOVA tree here, what we need is to just map
> > > vring and use qemu VA without any translation
> > >
> >
> > That would be great, but either qemu's SVQ vring or guest translated
> > buffers address (in qemu VA form) were already in high addresses,
> > outside of the device's iova range (in my test).
>
> You're right. I miss that and that's why we need e.g iova tree and allocator.
>
> What I proposed only makes sense when shared virtual memory (SVA) is
> implemented. In the case of SVA, the valid iova range should be the
> full VA range.
>
> >
> > I didn't try remapping tricks to make them fit in the range, but I
> > think it does complicate the solution relatively fast if there was
> > already memory in that range owned by qemu before enabling SVQ:
> >
> > * Guest memory must be contiguous in VA address space, but it "must"
> > support hotplug/unplug (although vDPA currently pins it). Hotplug
> > memory could always overlap with SVQ vring, so we would need to move
> > it.
> > * Duplicating mapped memory for writing? (Not sure if guest memory is
> > actually movable in qemu).
> > * Indirect descriptors will need to allocate and free memory more or
> > less frequently, increasing the possibility of overlapping.
>
> I'm not sure I get the problem, but overlapping is not an issue since
> we're using VA.
>

It's basically the same (potential) problem as DPDK's SVQ: the IOVA range
goes from 0 to X. That means that both GPA and SVQ vrings must be in the
IOVA range. As an example, we put GPA at the beginning of the range, which
grows upwards when memory is hot plugged, and SVQ vrings at the end, which
grow downwards when devices are added or set in SVQ mode.
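
As a toy sketch of that layout (made-up helpers, with [iova_first,
iova_last] being the device's range):

/*
 * Toy sketch, not part of the series: split the device IOVA range so
 * guest memory (GPA) grows upwards from iova_first and SVQ vrings grow
 * downwards from iova_last. No overlap handling at all: once the two
 * cursors meet we are out of IOVA space.
 */
static hwaddr gpa_cursor;     /* initialized to iova_first */
static hwaddr vring_cursor;   /* initialized to iova_last + 1 */

static hwaddr alloc_gpa_iova(size_t size)
{
    hwaddr iova = gpa_cursor;
    gpa_cursor += size;
    return iova;
}

static hwaddr alloc_vring_iova(size_t size)
{
    vring_cursor -= size;
    return vring_cursor;
}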

Even without both space fragmentation problems, we could reach a point
where both will take the same address, and we would need to go to the
tree.

But since we are able to detect those situations, I can see how we can
work in two modes as an optimization: 1:1 when they don't overlap, and
the fragmented tree when they do. But I don't think it's a good idea to
include it from the beginning, and I'm not sure it is worth it
without measuring the tree translation cost first.

> >
> > If we can move guest memory,
>
> I'm not sure we can do this or it looks very tricky.
>

Just thinking out loud here, but maybe we could map all memory and
play with remap_file_pages [1] a little bit for that.
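
Just to illustrate the syscall I mean (an untested toy; note the
syscall is deprecated and only emulated by recent kernels):

#define _GNU_SOURCE
#include <sys/mman.h>

/*
 * Toy sketch: make the page at 'addr', which must belong to an existing
 * MAP_SHARED mapping, be backed by file page 'new_pgoff' without
 * changing its virtual address. prot must be 0 and flags is unused.
 */
static int move_backing_page(void *addr, size_t page_size, size_t new_pgoff)
{
    return remap_file_pages(addr, page_size, 0, new_pgoff, 0);
}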

> > however, I can see how we can track it in
> > a tree *but* mark when the tree is 1:1 with qemu's VA, so buffers
> > forwarding does not take the translation penalty. When guest memory
> > cannot be map 1:1, we can resort to tree, and come back to 1:1
> > translation if the offending tree node(s) get deleted.
> >
> > However I think this puts the solution a little bit farther than
> > "starting simple" :).
> >
> > Does it make sense?
>
> Yes. So I think I will review the IOVA tree codes and get back to you.
>

Looking forward to it :).

Thanks!

[1] https://linux.die.net/man/2/remap_file_pages

> THanks
>
> >
> > Thanks!
> >
> > > Thanks
> > >
> >
>


Re: [RFC PATCH v4 20/20] vdpa: Add custom IOTLB translations to SVQ
Posted by Jason Wang 4 years, 3 months ago
On Fri, Oct 15, 2021 at 4:21 PM Eugenio Perez Martin
<eperezma@redhat.com> wrote:
>
> On Fri, Oct 15, 2021 at 9:37 AM Jason Wang <jasowang@redhat.com> wrote:
> >
> > On Fri, Oct 15, 2021 at 3:28 PM Eugenio Perez Martin
> > <eperezma@redhat.com> wrote:
> > >
> > > On Wed, Oct 13, 2021 at 7:34 AM Jason Wang <jasowang@redhat.com> wrote:
> > > >
> > > >
> > > > On 2021/10/1 at 3:06 PM, Eugenio Pérez wrote:
> > > > > Use translations added in VhostIOVATree in SVQ.
> > > > >
> > > > > Now every element needs to store the previous address also, so VirtQueue
> > > > > can consume the elements properly. This adds a little overhead per VQ
> > > > > element, having to allocate more memory to stash them. As a possible
> > > > > optimization, this allocation could be avoided if the descriptor is not
> > > > > a chain but a single one, but this is left undone.
> > > > >
> > > > > TODO: iova range should be queried before, and add logic to fail when
> > > > > GPA is outside of its range and memory listener or svq add it.
> > > > >
> > > > > Signed-off-by: Eugenio Pérez<eperezma@redhat.com>
> > > > > ---
> > > > >   hw/virtio/vhost-shadow-virtqueue.h |   4 +-
> > > > >   hw/virtio/vhost-shadow-virtqueue.c | 130 ++++++++++++++++++++++++-----
> > > > >   hw/virtio/vhost-vdpa.c             |  40 ++++++++-
> > > > >   hw/virtio/trace-events             |   1 +
> > > > >   4 files changed, 152 insertions(+), 23 deletions(-)
> > > >
> > > >
> > > > Think hard about the whole logic. This is safe since qemu memory map
> > > > will fail if guest submits a invalidate IOVA.
> > > >
> > >
> > > Can you expand on this? What you mean is that VirtQueue already
> > > protects SVQ code if the guest sets an invalid buffer address (GPA),
> > > isn't it?
> >
> > Yes.
> >
> > >
> > > > Then I wonder if we do something much more simpler:
> > > >
> > > > 1) Using qemu VA as IOVA but only maps the VA that belongs to guest
> > > > 2) Then we don't need any IOVA tree here, what we need is to just map
> > > > vring and use qemu VA without any translation
> > > >
> > >
> > > That would be great, but either qemu's SVQ vring or guest translated
> > > buffers address (in qemu VA form) were already in high addresses,
> > > outside of the device's iova range (in my test).
> >
> > You're right. I miss that and that's why we need e.g iova tree and allocator.
> >
> > What I proposed only makes sense when shared virtual memory (SVA) is
> > implemented. In the case of SVA, the valid iova range should be the
> > full VA range.
> >
> > >
> > > I didn't try remapping tricks to make them fit in the range, but I
> > > think it does complicate the solution relatively fast if there was
> > > already memory in that range owned by qemu before enabling SVQ:
> > >
> > > * Guest memory must be contiguous in VA address space, but it "must"
> > > support hotplug/unplug (although vDPA currently pins it). Hotplug
> > > memory could always overlap with SVQ vring, so we would need to move
> > > it.
> > > * Duplicating mapped memory for writing? (Not sure if guest memory is
> > > actually movable in qemu).
> > > * Indirect descriptors will need to allocate and free memory more or
> > > less frequently, increasing the possibility of overlapping.
> >
> > I'm not sure I get the problem, but overlapping is not an issue since
> > we're using VA.
> >
>
> It's basically the same (potential) problem of DPDK's SVQ: IOVA Range
> goes from 0 to X. That means that both GPA and SVQ must be in IOVA
> range. As an example, we put GPA at the beginning of the range, that
> grows upwards when memory is hot plugged, and SVQ vrings that grows
> downwards when devices are added or set in SVQ mode.

Yes, but this is not the case if we're using VA.

>
> Even without both space fragmentation problems, we could reach a point
> where both will take the same address, and we would need to go to the
> tree.
>
> But since we are able to detect those situations, I can see how we can
> work in two modes as an optimization: 1:1 when they don't overlap, and
> fragmented tree where it does. But I don't think it's a good idea to
> include it from the beginning, and I'm not sure if that is worth it
> without measuring the  tree translation cost first.
>
> > >
> > > If we can move guest memory,
> >
> > I'm not sure we can do this or it looks very tricky.
> >
>
> Just thinking out loud here, but maybe we could map all memory and
> play with remap_file_pages [1] a little bit for that.

The problem is that there's no guarantee that it will always succeed.
So let's start with the current dedicated IOVA address space. We can
do optimization on top anyhow.

>
> > > however, I can see how we can track it in
> > > a tree *but* mark when the tree is 1:1 with qemu's VA, so buffers
> > > forwarding does not take the translation penalty. When guest memory
> > > cannot be map 1:1, we can resort to tree, and come back to 1:1
> > > translation if the offending tree node(s) get deleted.
> > >
> > > However I think this puts the solution a little bit farther than
> > > "starting simple" :).
> > >
> > > Does it make sense?
> >
> > Yes. So I think I will review the IOVA tree codes and get back to you.
> >
>
> Looking forward to it :).

Thanks

>
> Thanks!
>
> [1] https://linux.die.net/man/2/remap_file_pages
>
> > THanks
> >
> > >
> > > Thanks!
> > >
> > > > Thanks
> > > >
> > >
> >
>


Re: [RFC PATCH v4 20/20] vdpa: Add custom IOTLB translations to SVQ
Posted by Eugenio Perez Martin 4 years, 3 months ago
On Fri, Oct 15, 2021 at 10:20 AM Eugenio Perez Martin
<eperezma@redhat.com> wrote:
>
> On Fri, Oct 15, 2021 at 9:37 AM Jason Wang <jasowang@redhat.com> wrote:
> >
> > On Fri, Oct 15, 2021 at 3:28 PM Eugenio Perez Martin
> > <eperezma@redhat.com> wrote:
> > >
> > > On Wed, Oct 13, 2021 at 7:34 AM Jason Wang <jasowang@redhat.com> wrote:
> > > >
> > > >
> > > > On 2021/10/1 at 3:06 PM, Eugenio Pérez wrote:
> > > > > Use translations added in VhostIOVATree in SVQ.
> > > > >
> > > > > Now every element needs to store the previous address also, so VirtQueue
> > > > > can consume the elements properly. This adds a little overhead per VQ
> > > > > element, having to allocate more memory to stash them. As a possible
> > > > > optimization, this allocation could be avoided if the descriptor is not
> > > > > a chain but a single one, but this is left undone.
> > > > >
> > > > > TODO: iova range should be queried before, and add logic to fail when
> > > > > GPA is outside of its range and memory listener or svq add it.
> > > > >
> > > > > Signed-off-by: Eugenio Pérez<eperezma@redhat.com>
> > > > > ---
> > > > >   hw/virtio/vhost-shadow-virtqueue.h |   4 +-
> > > > >   hw/virtio/vhost-shadow-virtqueue.c | 130 ++++++++++++++++++++++++-----
> > > > >   hw/virtio/vhost-vdpa.c             |  40 ++++++++-
> > > > >   hw/virtio/trace-events             |   1 +
> > > > >   4 files changed, 152 insertions(+), 23 deletions(-)
> > > >
> > > >
> > > > Think hard about the whole logic. This is safe since qemu memory map
> > > > will fail if guest submits a invalidate IOVA.
> > > >
> > >
> > > Can you expand on this? What you mean is that VirtQueue already
> > > protects SVQ code if the guest sets an invalid buffer address (GPA),
> > > isn't it?
> >
> > Yes.
> >
> > >
> > > > Then I wonder if we do something much more simpler:
> > > >
> > > > 1) Using qemu VA as IOVA but only maps the VA that belongs to guest
> > > > 2) Then we don't need any IOVA tree here, what we need is to just map
> > > > vring and use qemu VA without any translation
> > > >
> > >
> > > That would be great, but either qemu's SVQ vring or guest translated
> > > buffers address (in qemu VA form) were already in high addresses,
> > > outside of the device's iova range (in my test).
> >
> > You're right. I miss that and that's why we need e.g iova tree and allocator.
> >
> > What I proposed only makes sense when shared virtual memory (SVA) is
> > implemented. In the case of SVA, the valid iova range should be the
> > full VA range.
> >
> > >
> > > I didn't try remapping tricks to make them fit in the range, but I
> > > think it does complicate the solution relatively fast if there was
> > > already memory in that range owned by qemu before enabling SVQ:
> > >
> > > * Guest memory must be contiguous in VA address space, but it "must"
> > > support hotplug/unplug (although vDPA currently pins it). Hotplug
> > > memory could always overlap with SVQ vring, so we would need to move
> > > it.
> > > * Duplicating mapped memory for writing? (Not sure if guest memory is
> > > actually movable in qemu).
> > > * Indirect descriptors will need to allocate and free memory more or
> > > less frequently, increasing the possibility of overlapping.
> >
> > I'm not sure I get the problem, but overlapping is not an issue since
> > we're using VA.
> >
>
> It's basically the same (potential) problem of DPDK's SVQ: IOVA Range
> goes from 0 to X. That means that both GPA and SVQ must be in IOVA
> range. As an example, we put GPA at the beginning of the range, that
> grows upwards when memory is hot plugged, and SVQ vrings that grows
> downwards when devices are added or set in SVQ mode.
>
> Even without both space fragmentation problems, we could reach a point
> where both will take the same address, and we would need to go to the
> tree.
>
> But since we are able to detect those situations, I can see how we can
> work in two modes as an optimization: 1:1 when they don't overlap, and
> fragmented tree where it does. But I don't think it's a good idea to
> include it from the beginning, and I'm not sure if that is worth it
> without measuring the  tree translation cost first.
>
> > >
> > > If we can move guest memory,
> >
> > I'm not sure we can do this or it looks very tricky.
> >
>
> Just thinking out loud here, but maybe we could map all memory and
> play with remap_file_pages [1] a little bit for that.
>
> > > however, I can see how we can track it in
> > > a tree *but* mark when the tree is 1:1 with qemu's VA, so buffers
> > > forwarding does not take the translation penalty. When guest memory
> > > cannot be map 1:1, we can resort to tree, and come back to 1:1
> > > translation if the offending tree node(s) get deleted.
> > >
> > > However I think this puts the solution a little bit farther than
> > > "starting simple" :).
> > >
> > > Does it make sense?
> >
> > Yes. So I think I will review the IOVA tree codes and get back to you.
> >
>
> Looking forward to it :).
>

PS: Actually, it still uses the GArray solution from the previous
series. I'm currently migrating to an actual tree and adding
allocation features to util/iova-tree, so maybe it is not worth
reviewing at this moment. The public interface to it is the same,
but there is little to review there.

Thanks!

> Thanks!
>
> [1] https://linux.die.net/man/2/remap_file_pages
>
> > THanks
> >
> > >
> > > Thanks!
> > >
> > > > Thanks
> > > >
> > >
> >


Re: [RFC PATCH v4 20/20] vdpa: Add custom IOTLB translations to SVQ
Posted by Jason Wang 4 years, 3 months ago
On 2021/10/1 at 3:06 PM, Eugenio Pérez wrote:
> Use translations added in VhostIOVATree in SVQ.
>
> Now every element needs to store the previous address also, so VirtQueue
> can consume the elements properly. This adds a little overhead per VQ
> element, having to allocate more memory to stash them. As a possible
> optimization, this allocation could be avoided if the descriptor is not
> a chain but a single one, but this is left undone.
>
> TODO: iova range should be queried before, and add logic to fail when
> GPA is outside of its range and memory listener or svq add it.
>
> Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
> ---
>   hw/virtio/vhost-shadow-virtqueue.h |   4 +-
>   hw/virtio/vhost-shadow-virtqueue.c | 130 ++++++++++++++++++++++++-----
>   hw/virtio/vhost-vdpa.c             |  40 ++++++++-
>   hw/virtio/trace-events             |   1 +
>   4 files changed, 152 insertions(+), 23 deletions(-)
>
> diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
> index b7baa424a7..a0e6b5267a 100644
> --- a/hw/virtio/vhost-shadow-virtqueue.h
> +++ b/hw/virtio/vhost-shadow-virtqueue.h
> @@ -11,6 +11,7 @@
>   #define VHOST_SHADOW_VIRTQUEUE_H
>   
>   #include "hw/virtio/vhost.h"
> +#include "hw/virtio/vhost-iova-tree.h"
>   
>   typedef struct VhostShadowVirtqueue VhostShadowVirtqueue;
>   
> @@ -28,7 +29,8 @@ bool vhost_svq_start(struct vhost_dev *dev, unsigned idx,
>   void vhost_svq_stop(struct vhost_dev *dev, unsigned idx,
>                       VhostShadowVirtqueue *svq);
>   
> -VhostShadowVirtqueue *vhost_svq_new(struct vhost_dev *dev, int idx);
> +VhostShadowVirtqueue *vhost_svq_new(struct vhost_dev *dev, int idx,
> +                                    VhostIOVATree *iova_map);
>   
>   void vhost_svq_free(VhostShadowVirtqueue *vq);
>   
> diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
> index 2fd0bab75d..9db538547e 100644
> --- a/hw/virtio/vhost-shadow-virtqueue.c
> +++ b/hw/virtio/vhost-shadow-virtqueue.c
> @@ -11,12 +11,19 @@
>   #include "hw/virtio/vhost-shadow-virtqueue.h"
>   #include "hw/virtio/vhost.h"
>   #include "hw/virtio/virtio-access.h"
> +#include "hw/virtio/vhost-iova-tree.h"
>   
>   #include "standard-headers/linux/vhost_types.h"
>   
>   #include "qemu/error-report.h"
>   #include "qemu/main-loop.h"
>   
> +typedef struct SVQElement {
> +    VirtQueueElement elem;
> +    void **in_sg_stash;
> +    void **out_sg_stash;
> +} SVQElement;
> +
>   /* Shadow virtqueue to relay notifications */
>   typedef struct VhostShadowVirtqueue {
>       /* Shadow vring */
> @@ -46,8 +53,11 @@ typedef struct VhostShadowVirtqueue {
>       /* Virtio device */
>       VirtIODevice *vdev;
>   
> +    /* IOVA mapping if used */
> +    VhostIOVATree *iova_map;
> +
>       /* Map for returning guest's descriptors */
> -    VirtQueueElement **ring_id_maps;
> +    SVQElement **ring_id_maps;
>   
>       /* Next head to expose to device */
>       uint16_t avail_idx_shadow;
> @@ -79,13 +89,6 @@ bool vhost_svq_valid_device_features(uint64_t *dev_features)
>               continue;
>   
>           case VIRTIO_F_ACCESS_PLATFORM:
> -            /* SVQ needs this feature disabled. Can't continue */
> -            if (*dev_features & BIT_ULL(b)) {
> -                clear_bit(b, dev_features);
> -                r = false;
> -            }
> -            break;
> -
>           case VIRTIO_F_VERSION_1:
>               /* SVQ needs this feature, so can't continue */
>               if (!(*dev_features & BIT_ULL(b))) {
> @@ -126,6 +129,64 @@ static void vhost_svq_set_notification(VhostShadowVirtqueue *svq, bool enable)
>       }
>   }
>   
> +static void vhost_svq_stash_addr(void ***stash, const struct iovec *iov,
> +                                 size_t num)
> +{
> +    size_t i;
> +
> +    if (num == 0) {
> +        return;
> +    }
> +
> +    *stash = g_new(void *, num);
> +    for (i = 0; i < num; ++i) {
> +        (*stash)[i] = iov[i].iov_base;
> +    }
> +}
> +
> +static void vhost_svq_unstash_addr(void **stash, struct iovec *iov, size_t num)
> +{
> +    size_t i;
> +
> +    if (num == 0) {
> +        return;
> +    }
> +
> +    for (i = 0; i < num; ++i) {
> +        iov[i].iov_base = stash[i];
> +    }
> +    g_free(stash);
> +}
> +
> +static void vhost_svq_translate_addr(const VhostShadowVirtqueue *svq,
> +                                     struct iovec *iovec, size_t num)
> +{
> +    size_t i;
> +
> +    for (i = 0; i < num; ++i) {
> +        VhostDMAMap needle = {
> +            .translated_addr = iovec[i].iov_base,
> +            .size = iovec[i].iov_len,
> +        };
> +        size_t off;
> +
> +        const VhostDMAMap *map = vhost_iova_tree_find_iova(svq->iova_map,
> +                                                           &needle);


Is it possible that we end up with more than one map here?


> +        /*
> +         * Map cannot be NULL since iova map contains all guest space and
> +         * qemu already has a physical address mapped
> +         */
> +        assert(map);
> +
> +        /*
> +         * Map->iova chunk size is ignored. What to do if descriptor
> +         * (addr, size) does not fit is delegated to the device.
> +         */
> +        off = needle.translated_addr - map->translated_addr;
> +        iovec[i].iov_base = (void *)(map->iova + off);
> +    }
> +}
> +
>   static void vhost_vring_write_descs(VhostShadowVirtqueue *svq,
>                                       const struct iovec *iovec,
>                                       size_t num, bool more_descs, bool write)
> @@ -156,8 +217,9 @@ static void vhost_vring_write_descs(VhostShadowVirtqueue *svq,
>   }
>   
>   static unsigned vhost_svq_add_split(VhostShadowVirtqueue *svq,
> -                                    VirtQueueElement *elem)
> +                                    SVQElement *svq_elem)
>   {
> +    VirtQueueElement *elem = &svq_elem->elem;
>       int head;
>       unsigned avail_idx;
>       vring_avail_t *avail = svq->vring.avail;
> @@ -167,6 +229,12 @@ static unsigned vhost_svq_add_split(VhostShadowVirtqueue *svq,
>       /* We need some descriptors here */
>       assert(elem->out_num || elem->in_num);
>   
> +    vhost_svq_stash_addr(&svq_elem->in_sg_stash, elem->in_sg, elem->in_num);
> +    vhost_svq_stash_addr(&svq_elem->out_sg_stash, elem->out_sg, elem->out_num);


I wonder if we can avoid the stash/unstash trick by keeping dedicated 
sgs in svq_elem, instead of reusing the elem.
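
Something like the following rough sketch (field names are made up):

/*
 * Sketch: keep device-facing (translated) iovecs in SVQElement and
 * leave the guest addresses in elem untouched, so nothing needs to be
 * stashed or restored on completion. Field names are hypothetical.
 */
typedef struct SVQElement {
    VirtQueueElement elem;      /* guest addresses, returned as-is */
    struct iovec *dev_in_sg;    /* translated copies written to the vring */
    struct iovec *dev_out_sg;
} SVQElement;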

Thanks


> +
> +    vhost_svq_translate_addr(svq, elem->in_sg, elem->in_num);
> +    vhost_svq_translate_addr(svq, elem->out_sg, elem->out_num);
> +
>       vhost_vring_write_descs(svq, elem->out_sg, elem->out_num,
>                               elem->in_num > 0, false);
>       vhost_vring_write_descs(svq, elem->in_sg, elem->in_num, false, true);
> @@ -187,7 +255,7 @@ static unsigned vhost_svq_add_split(VhostShadowVirtqueue *svq,
>   
>   }
>   
> -static void vhost_svq_add(VhostShadowVirtqueue *svq, VirtQueueElement *elem)
> +static void vhost_svq_add(VhostShadowVirtqueue *svq, SVQElement *elem)
>   {
>       unsigned qemu_head = vhost_svq_add_split(svq, elem);
>   
> @@ -221,7 +289,7 @@ static void vhost_handle_guest_kick(EventNotifier *n)
>           }
>   
>           while (true) {
> -            VirtQueueElement *elem = virtqueue_pop(svq->vq, sizeof(*elem));
> +            SVQElement *elem = virtqueue_pop(svq->vq, sizeof(*elem));
>               if (!elem) {
>                   break;
>               }
> @@ -247,7 +315,7 @@ static bool vhost_svq_more_used(VhostShadowVirtqueue *svq)
>       return svq->used_idx != svq->shadow_used_idx;
>   }
>   
> -static VirtQueueElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq)
> +static SVQElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq)
>   {
>       vring_desc_t *descs = svq->vring.desc;
>       const vring_used_t *used = svq->vring.used;
> @@ -279,7 +347,7 @@ static VirtQueueElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq)
>       descs[used_elem.id].next = svq->free_head;
>       svq->free_head = used_elem.id;
>   
> -    svq->ring_id_maps[used_elem.id]->len = used_elem.len;
> +    svq->ring_id_maps[used_elem.id]->elem.len = used_elem.len;
>       return g_steal_pointer(&svq->ring_id_maps[used_elem.id]);
>   }
>   
> @@ -296,12 +364,19 @@ static void vhost_svq_handle_call_no_test(EventNotifier *n)
>   
>           vhost_svq_set_notification(svq, false);
>           while (true) {
> -            g_autofree VirtQueueElement *elem = vhost_svq_get_buf(svq);
> -            if (!elem) {
> +            g_autofree SVQElement *svq_elem = vhost_svq_get_buf(svq);
> +            VirtQueueElement *elem;
> +            if (!svq_elem) {
>                   break;
>               }
>   
>               assert(i < svq->vring.num);
> +            elem = &svq_elem->elem;
> +
> +            vhost_svq_unstash_addr(svq_elem->in_sg_stash, elem->in_sg,
> +                                   elem->in_num);
> +            vhost_svq_unstash_addr(svq_elem->out_sg_stash, elem->out_sg,
> +                                   elem->out_num);
>               virtqueue_fill(vq, elem, elem->len, i++);
>           }
>   
> @@ -451,14 +526,24 @@ void vhost_svq_stop(struct vhost_dev *dev, unsigned idx,
>       event_notifier_set_handler(&svq->host_notifier, NULL);
>   
>       for (i = 0; i < svq->vring.num; ++i) {
> -        g_autofree VirtQueueElement *elem = svq->ring_id_maps[i];
> +        g_autofree SVQElement *svq_elem = svq->ring_id_maps[i];
> +        VirtQueueElement *elem;
> +
> +        if (!svq_elem) {
> +            continue;
> +        }
> +
> +        elem = &svq_elem->elem;
> +        vhost_svq_unstash_addr(svq_elem->in_sg_stash, elem->in_sg,
> +                               elem->in_num);
> +        vhost_svq_unstash_addr(svq_elem->out_sg_stash, elem->out_sg,
> +                               elem->out_num);
> +
>           /*
>            * Although the doc says we must unpop in order, it's ok to unpop
>            * everything.
>            */
> -        if (elem) {
> -            virtqueue_unpop(svq->vq, elem, elem->len);
> -        }
> +        virtqueue_unpop(svq->vq, elem, elem->len);
>       }
>   }
>   
> @@ -466,7 +551,8 @@ void vhost_svq_stop(struct vhost_dev *dev, unsigned idx,
>    * Creates vhost shadow virtqueue, and instruct vhost device to use the shadow
>    * methods and file descriptors.
>    */
> -VhostShadowVirtqueue *vhost_svq_new(struct vhost_dev *dev, int idx)
> +VhostShadowVirtqueue *vhost_svq_new(struct vhost_dev *dev, int idx,
> +                                    VhostIOVATree *iova_map)
>   {
>       int vq_idx = dev->vq_index + idx;
>       unsigned num = virtio_queue_get_num(dev->vdev, vq_idx);
> @@ -500,11 +586,13 @@ VhostShadowVirtqueue *vhost_svq_new(struct vhost_dev *dev, int idx)
>       memset(svq->vring.desc, 0, driver_size);
>       svq->vring.used = qemu_memalign(qemu_real_host_page_size, device_size);
>       memset(svq->vring.used, 0, device_size);
> +    svq->iova_map = iova_map;
> +
>       for (i = 0; i < num - 1; i++) {
>           svq->vring.desc[i].next = cpu_to_le16(i + 1);
>       }
>   
> -    svq->ring_id_maps = g_new0(VirtQueueElement *, num);
> +    svq->ring_id_maps = g_new0(SVQElement *, num);
>       event_notifier_set_handler(&svq->call_notifier,
>                                  vhost_svq_handle_call);
>       return g_steal_pointer(&svq);
> diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
> index a9c680b487..f5a12fee9d 100644
> --- a/hw/virtio/vhost-vdpa.c
> +++ b/hw/virtio/vhost-vdpa.c
> @@ -176,6 +176,18 @@ static void vhost_vdpa_listener_region_add(MemoryListener *listener,
>                                            vaddr, section->readonly);
>   
>       llsize = int128_sub(llend, int128_make64(iova));
> +    if (v->shadow_vqs_enabled) {
> +        VhostDMAMap mem_region = {
> +            .translated_addr = vaddr,
> +            .size = int128_get64(llsize) - 1,
> +            .perm = IOMMU_ACCESS_FLAG(true, section->readonly),
> +        };
> +
> +        int r = vhost_iova_tree_alloc(v->iova_map, &mem_region);
> +        assert(r == VHOST_DMA_MAP_OK);
> +
> +        iova = mem_region.iova;
> +    }
>   
>       ret = vhost_vdpa_dma_map(v, iova, int128_get64(llsize),
>                                vaddr, section->readonly);
> @@ -754,6 +766,23 @@ static bool  vhost_vdpa_force_iommu(struct vhost_dev *dev)
>       return true;
>   }
>   
> +static int vhost_vdpa_get_iova_range(struct vhost_dev *dev,
> +                                     hwaddr *first, hwaddr *last)
> +{
> +    int ret;
> +    struct vhost_vdpa_iova_range range;
> +
> +    ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_IOVA_RANGE, &range);
> +    if (ret != 0) {
> +        return ret;
> +    }
> +
> +    *first = range.first;
> +    *last = range.last;
> +    trace_vhost_vdpa_get_iova_range(dev, *first, *last);
> +    return ret;
> +}
> +
>   /**
>    * Maps QEMU vaddr memory to device in a suitable way for shadow virtqueue:
>    * - It always reference qemu memory address, not guest's memory.
> @@ -881,6 +910,7 @@ static bool vhost_vdpa_svq_start_vq(struct vhost_dev *dev, unsigned idx)
>   static unsigned vhost_vdpa_enable_svq(struct vhost_vdpa *v, bool enable)
>   {
>       struct vhost_dev *hdev = v->dev;
> +    hwaddr iova_first, iova_last;
>       unsigned n;
>       int r;
>   
> @@ -894,7 +924,7 @@ static unsigned vhost_vdpa_enable_svq(struct vhost_vdpa *v, bool enable)
>           /* Allocate resources */
>           assert(v->shadow_vqs->len == 0);
>           for (n = 0; n < hdev->nvqs; ++n) {
> -            VhostShadowVirtqueue *svq = vhost_svq_new(hdev, n);
> +            VhostShadowVirtqueue *svq = vhost_svq_new(hdev, n, v->iova_map);
>               if (unlikely(!svq)) {
>                   g_ptr_array_set_size(v->shadow_vqs, 0);
>                   return 0;
> @@ -903,6 +933,8 @@ static unsigned vhost_vdpa_enable_svq(struct vhost_vdpa *v, bool enable)
>           }
>       }
>   
> +    r = vhost_vdpa_get_iova_range(hdev, &iova_first, &iova_last);
> +    assert(r == 0);
>       r = vhost_vdpa_vring_pause(hdev);
>       assert(r == 0);
>   
> @@ -913,6 +945,12 @@ static unsigned vhost_vdpa_enable_svq(struct vhost_vdpa *v, bool enable)
>           }
>       }
>   
> +    memory_listener_unregister(&v->listener);
> +    if (vhost_vdpa_dma_unmap(v, iova_first,
> +                             (iova_last - iova_first) & TARGET_PAGE_MASK)) {
> +        error_report("Fail to invalidate device iotlb");
> +    }
> +
>       /* Reset device so it can be configured */
>       r = vhost_vdpa_dev_start(hdev, false);
>       assert(r == 0);
> diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events
> index 8ed19e9d0c..650e521e35 100644
> --- a/hw/virtio/trace-events
> +++ b/hw/virtio/trace-events
> @@ -52,6 +52,7 @@ vhost_vdpa_set_vring_call(void *dev, unsigned int index, int fd) "dev: %p index:
>   vhost_vdpa_get_features(void *dev, uint64_t features) "dev: %p features: 0x%"PRIx64
>   vhost_vdpa_set_owner(void *dev) "dev: %p"
>   vhost_vdpa_vq_get_addr(void *dev, void *vq, uint64_t desc_user_addr, uint64_t avail_user_addr, uint64_t used_user_addr) "dev: %p vq: %p desc_user_addr: 0x%"PRIx64" avail_user_addr: 0x%"PRIx64" used_user_addr: 0x%"PRIx64
> +vhost_vdpa_get_iova_range(void *dev, uint64_t first, uint64_t last) "dev: %p first: 0x%"PRIx64" last: 0x%"PRIx64
>   
>   # virtio.c
>   virtqueue_alloc_element(void *elem, size_t sz, unsigned in_num, unsigned out_num) "elem %p size %zd in_num %u out_num %u"


Re: [RFC PATCH v4 20/20] vdpa: Add custom IOTLB translations to SVQ
Posted by Eugenio Perez Martin 4 years, 3 months ago
On Tue, Oct 19, 2021 at 11:25 AM Jason Wang <jasowang@redhat.com> wrote:
>
>
> On 2021/10/1 at 3:06 PM, Eugenio Pérez wrote:
> > Use translations added in VhostIOVATree in SVQ.
> >
> > Now every element needs to store the previous address also, so VirtQueue
> > can consume the elements properly. This adds a little overhead per VQ
> > element, having to allocate more memory to stash them. As a possible
> > optimization, this allocation could be avoided if the descriptor is not
> > a chain but a single one, but this is left undone.
> >
> > TODO: iova range should be queried before, and add logic to fail when
> > GPA is outside of its range and memory listener or svq add it.
> >
> > Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
> > ---
> >   hw/virtio/vhost-shadow-virtqueue.h |   4 +-
> >   hw/virtio/vhost-shadow-virtqueue.c | 130 ++++++++++++++++++++++++-----
> >   hw/virtio/vhost-vdpa.c             |  40 ++++++++-
> >   hw/virtio/trace-events             |   1 +
> >   4 files changed, 152 insertions(+), 23 deletions(-)
> >
> > diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
> > index b7baa424a7..a0e6b5267a 100644
> > --- a/hw/virtio/vhost-shadow-virtqueue.h
> > +++ b/hw/virtio/vhost-shadow-virtqueue.h
> > @@ -11,6 +11,7 @@
> >   #define VHOST_SHADOW_VIRTQUEUE_H
> >
> >   #include "hw/virtio/vhost.h"
> > +#include "hw/virtio/vhost-iova-tree.h"
> >
> >   typedef struct VhostShadowVirtqueue VhostShadowVirtqueue;
> >
> > @@ -28,7 +29,8 @@ bool vhost_svq_start(struct vhost_dev *dev, unsigned idx,
> >   void vhost_svq_stop(struct vhost_dev *dev, unsigned idx,
> >                       VhostShadowVirtqueue *svq);
> >
> > -VhostShadowVirtqueue *vhost_svq_new(struct vhost_dev *dev, int idx);
> > +VhostShadowVirtqueue *vhost_svq_new(struct vhost_dev *dev, int idx,
> > +                                    VhostIOVATree *iova_map);
> >
> >   void vhost_svq_free(VhostShadowVirtqueue *vq);
> >
> > diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
> > index 2fd0bab75d..9db538547e 100644
> > --- a/hw/virtio/vhost-shadow-virtqueue.c
> > +++ b/hw/virtio/vhost-shadow-virtqueue.c
> > @@ -11,12 +11,19 @@
> >   #include "hw/virtio/vhost-shadow-virtqueue.h"
> >   #include "hw/virtio/vhost.h"
> >   #include "hw/virtio/virtio-access.h"
> > +#include "hw/virtio/vhost-iova-tree.h"
> >
> >   #include "standard-headers/linux/vhost_types.h"
> >
> >   #include "qemu/error-report.h"
> >   #include "qemu/main-loop.h"
> >
> > +typedef struct SVQElement {
> > +    VirtQueueElement elem;
> > +    void **in_sg_stash;
> > +    void **out_sg_stash;
> > +} SVQElement;
> > +
> >   /* Shadow virtqueue to relay notifications */
> >   typedef struct VhostShadowVirtqueue {
> >       /* Shadow vring */
> > @@ -46,8 +53,11 @@ typedef struct VhostShadowVirtqueue {
> >       /* Virtio device */
> >       VirtIODevice *vdev;
> >
> > +    /* IOVA mapping if used */
> > +    VhostIOVATree *iova_map;
> > +
> >       /* Map for returning guest's descriptors */
> > -    VirtQueueElement **ring_id_maps;
> > +    SVQElement **ring_id_maps;
> >
> >       /* Next head to expose to device */
> >       uint16_t avail_idx_shadow;
> > @@ -79,13 +89,6 @@ bool vhost_svq_valid_device_features(uint64_t *dev_features)
> >               continue;
> >
> >           case VIRTIO_F_ACCESS_PLATFORM:
> > -            /* SVQ needs this feature disabled. Can't continue */
> > -            if (*dev_features & BIT_ULL(b)) {
> > -                clear_bit(b, dev_features);
> > -                r = false;
> > -            }
> > -            break;
> > -
> >           case VIRTIO_F_VERSION_1:
> >               /* SVQ needs this feature, so can't continue */
> >               if (!(*dev_features & BIT_ULL(b))) {
> > @@ -126,6 +129,64 @@ static void vhost_svq_set_notification(VhostShadowVirtqueue *svq, bool enable)
> >       }
> >   }
> >
> > +static void vhost_svq_stash_addr(void ***stash, const struct iovec *iov,
> > +                                 size_t num)
> > +{
> > +    size_t i;
> > +
> > +    if (num == 0) {
> > +        return;
> > +    }
> > +
> > +    *stash = g_new(void *, num);
> > +    for (i = 0; i < num; ++i) {
> > +        (*stash)[i] = iov[i].iov_base;
> > +    }
> > +}
> > +
> > +static void vhost_svq_unstash_addr(void **stash, struct iovec *iov, size_t num)
> > +{
> > +    size_t i;
> > +
> > +    if (num == 0) {
> > +        return;
> > +    }
> > +
> > +    for (i = 0; i < num; ++i) {
> > +        iov[i].iov_base = stash[i];
> > +    }
> > +    g_free(stash);
> > +}
> > +
> > +static void vhost_svq_translate_addr(const VhostShadowVirtqueue *svq,
> > +                                     struct iovec *iovec, size_t num)
> > +{
> > +    size_t i;
> > +
> > +    for (i = 0; i < num; ++i) {
> > +        VhostDMAMap needle = {
> > +            .translated_addr = iovec[i].iov_base,
> > +            .size = iovec[i].iov_len,
> > +        };
> > +        size_t off;
> > +
> > +        const VhostDMAMap *map = vhost_iova_tree_find_iova(svq->iova_map,
> > +                                                           &needle);
>
>
> Is it possible that we end up with more than one maps here?
>

Actually it is possible, since there is no guarantee that one
descriptor (or indirect descriptor) maps exactly to one iov. It could
map to many if the guest-physical range is contiguous but the QEMU
vaddr backing it is not. This is something that must be fixed for the
next revision, so thanks for pointing it out!
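
A rough sketch of how the translation could emit more than one
descriptor per guest iov (just an illustration, not the next
revision's code: "vhost_svq_translate_one", "out" and "out_max" are
made up, and it assumes vhost_iova_tree_find_iova can return the map
containing the first byte of the range and that map->size is the
inclusive length, as in the memory listener hunk):

static size_t vhost_svq_translate_one(const VhostShadowVirtqueue *svq,
                                      const struct iovec *in,
                                      struct iovec *out, size_t out_max)
{
    void *addr = in->iov_base;
    size_t left = in->iov_len;
    size_t n = 0;

    while (left > 0) {
        /* Look up the map that contains the first pending byte */
        VhostDMAMap needle = {
            .translated_addr = addr,
            .size = 1,
        };
        const VhostDMAMap *map = vhost_iova_tree_find_iova(svq->iova_map,
                                                           &needle);
        size_t off, chunk;

        assert(map && n < out_max);

        off = (char *)addr - (char *)map->translated_addr;
        /* map->size assumed inclusive (offset of the last valid byte) */
        chunk = MIN(left, map->size + 1 - off);

        out[n].iov_base = (void *)(map->iova + off);
        out[n].iov_len = chunk;
        n++;

        addr = (char *)addr + chunk;
        left -= chunk;
    }

    return n;
}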

Taking that into account, the invariant that the SVQ vring's
avail_idx - used_idx is always less than or equal to the guest vring's
avail_idx - used_idx does not hold anymore. Checking for that before
adding buffers to SVQ is the easy part, but how could we recover in
that case?

I think the easy solution is to check for more available buffers
unconditionally at the end of vhost_svq_handle_call, which handles the
SVQ used ring and is supposed to make room for more available buffers.
vhost_handle_guest_kick would then no longer check whether the eventfd
is set.
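
Something like this at the end of the used-ring handler (sketch only;
the container_of / notifier wiring here is from memory and may not
match the patch exactly):

static void vhost_svq_handle_call_no_test(EventNotifier *n)
{
    VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue,
                                             call_notifier);

    /* ... return used buffers to the guest as today ... */

    /*
     * Completing buffers frees SVQ descriptors, so guest buffers that
     * could not be added before may fit now: poll the guest avail
     * ring unconditionally instead of waiting for a new kick.
     */
    vhost_handle_guest_kick(&svq->host_notifier);
}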

Would that make sense?

Thanks!

>
> > +        /*
> > +         * Map cannot be NULL since iova map contains all guest space and
> > +         * qemu already has a physical address mapped
> > +         */
> > +        assert(map);
> > +
> > +        /*
> > +         * Map->iova chunk size is ignored. What to do if descriptor
> > +         * (addr, size) does not fit is delegated to the device.
> > +         */
> > +        off = needle.translated_addr - map->translated_addr;
> > +        iovec[i].iov_base = (void *)(map->iova + off);
> > +    }
> > +}
> > +
> >   static void vhost_vring_write_descs(VhostShadowVirtqueue *svq,
> >                                       const struct iovec *iovec,
> >                                       size_t num, bool more_descs, bool write)
> > @@ -156,8 +217,9 @@ static void vhost_vring_write_descs(VhostShadowVirtqueue *svq,
> >   }
> >
> >   static unsigned vhost_svq_add_split(VhostShadowVirtqueue *svq,
> > -                                    VirtQueueElement *elem)
> > +                                    SVQElement *svq_elem)
> >   {
> > +    VirtQueueElement *elem = &svq_elem->elem;
> >       int head;
> >       unsigned avail_idx;
> >       vring_avail_t *avail = svq->vring.avail;
> > @@ -167,6 +229,12 @@ static unsigned vhost_svq_add_split(VhostShadowVirtqueue *svq,
> >       /* We need some descriptors here */
> >       assert(elem->out_num || elem->in_num);
> >
> > +    vhost_svq_stash_addr(&svq_elem->in_sg_stash, elem->in_sg, elem->in_num);
> > +    vhost_svq_stash_addr(&svq_elem->out_sg_stash, elem->out_sg, elem->out_num);
>
>
> I wonder if we can solve the trick like stash and unstash with a
> dedicated sgs in svq_elem, instead of reusing the elem.
>

Actually yes, it would be way simpler to use a new sgs array in
svq_elem. I will change that.
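
Something along these lines (rough sketch, field layout illustrative):

typedef struct SVQElement {
    VirtQueueElement elem;
    /*
     * Device-IOVA copies of elem->in_sg / elem->out_sg, so the guest
     * addresses stay untouched in elem and no stash/unstash is needed
     * when returning the element.
     */
    struct iovec *in_sg;
    struct iovec *out_sg;
} SVQElement;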

Thanks!

> Thanks
>
>
> > +
> > +    vhost_svq_translate_addr(svq, elem->in_sg, elem->in_num);
> > +    vhost_svq_translate_addr(svq, elem->out_sg, elem->out_num);
> > +
> >       vhost_vring_write_descs(svq, elem->out_sg, elem->out_num,
> >                               elem->in_num > 0, false);
> >       vhost_vring_write_descs(svq, elem->in_sg, elem->in_num, false, true);
> > @@ -187,7 +255,7 @@ static unsigned vhost_svq_add_split(VhostShadowVirtqueue *svq,
> >
> >   }
> >
> > -static void vhost_svq_add(VhostShadowVirtqueue *svq, VirtQueueElement *elem)
> > +static void vhost_svq_add(VhostShadowVirtqueue *svq, SVQElement *elem)
> >   {
> >       unsigned qemu_head = vhost_svq_add_split(svq, elem);
> >
> > @@ -221,7 +289,7 @@ static void vhost_handle_guest_kick(EventNotifier *n)
> >           }
> >
> >           while (true) {
> > -            VirtQueueElement *elem = virtqueue_pop(svq->vq, sizeof(*elem));
> > +            SVQElement *elem = virtqueue_pop(svq->vq, sizeof(*elem));
> >               if (!elem) {
> >                   break;
> >               }
> > @@ -247,7 +315,7 @@ static bool vhost_svq_more_used(VhostShadowVirtqueue *svq)
> >       return svq->used_idx != svq->shadow_used_idx;
> >   }
> >
> > -static VirtQueueElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq)
> > +static SVQElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq)
> >   {
> >       vring_desc_t *descs = svq->vring.desc;
> >       const vring_used_t *used = svq->vring.used;
> > @@ -279,7 +347,7 @@ static VirtQueueElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq)
> >       descs[used_elem.id].next = svq->free_head;
> >       svq->free_head = used_elem.id;
> >
> > -    svq->ring_id_maps[used_elem.id]->len = used_elem.len;
> > +    svq->ring_id_maps[used_elem.id]->elem.len = used_elem.len;
> >       return g_steal_pointer(&svq->ring_id_maps[used_elem.id]);
> >   }
> >
> > @@ -296,12 +364,19 @@ static void vhost_svq_handle_call_no_test(EventNotifier *n)
> >
> >           vhost_svq_set_notification(svq, false);
> >           while (true) {
> > -            g_autofree VirtQueueElement *elem = vhost_svq_get_buf(svq);
> > -            if (!elem) {
> > +            g_autofree SVQElement *svq_elem = vhost_svq_get_buf(svq);
> > +            VirtQueueElement *elem;
> > +            if (!svq_elem) {
> >                   break;
> >               }
> >
> >               assert(i < svq->vring.num);
> > +            elem = &svq_elem->elem;
> > +
> > +            vhost_svq_unstash_addr(svq_elem->in_sg_stash, elem->in_sg,
> > +                                   elem->in_num);
> > +            vhost_svq_unstash_addr(svq_elem->out_sg_stash, elem->out_sg,
> > +                                   elem->out_num);
> >               virtqueue_fill(vq, elem, elem->len, i++);
> >           }
> >
> > @@ -451,14 +526,24 @@ void vhost_svq_stop(struct vhost_dev *dev, unsigned idx,
> >       event_notifier_set_handler(&svq->host_notifier, NULL);
> >
> >       for (i = 0; i < svq->vring.num; ++i) {
> > -        g_autofree VirtQueueElement *elem = svq->ring_id_maps[i];
> > +        g_autofree SVQElement *svq_elem = svq->ring_id_maps[i];
> > +        VirtQueueElement *elem;
> > +
> > +        if (!svq_elem) {
> > +            continue;
> > +        }
> > +
> > +        elem = &svq_elem->elem;
> > +        vhost_svq_unstash_addr(svq_elem->in_sg_stash, elem->in_sg,
> > +                               elem->in_num);
> > +        vhost_svq_unstash_addr(svq_elem->out_sg_stash, elem->out_sg,
> > +                               elem->out_num);
> > +
> >           /*
> >            * Although the doc says we must unpop in order, it's ok to unpop
> >            * everything.
> >            */
> > -        if (elem) {
> > -            virtqueue_unpop(svq->vq, elem, elem->len);
> > -        }
> > +        virtqueue_unpop(svq->vq, elem, elem->len);
> >       }
> >   }
> >
> > @@ -466,7 +551,8 @@ void vhost_svq_stop(struct vhost_dev *dev, unsigned idx,
> >    * Creates vhost shadow virtqueue, and instruct vhost device to use the shadow
> >    * methods and file descriptors.
> >    */
> > -VhostShadowVirtqueue *vhost_svq_new(struct vhost_dev *dev, int idx)
> > +VhostShadowVirtqueue *vhost_svq_new(struct vhost_dev *dev, int idx,
> > +                                    VhostIOVATree *iova_map)
> >   {
> >       int vq_idx = dev->vq_index + idx;
> >       unsigned num = virtio_queue_get_num(dev->vdev, vq_idx);
> > @@ -500,11 +586,13 @@ VhostShadowVirtqueue *vhost_svq_new(struct vhost_dev *dev, int idx)
> >       memset(svq->vring.desc, 0, driver_size);
> >       svq->vring.used = qemu_memalign(qemu_real_host_page_size, device_size);
> >       memset(svq->vring.used, 0, device_size);
> > +    svq->iova_map = iova_map;
> > +
> >       for (i = 0; i < num - 1; i++) {
> >           svq->vring.desc[i].next = cpu_to_le16(i + 1);
> >       }
> >
> > -    svq->ring_id_maps = g_new0(VirtQueueElement *, num);
> > +    svq->ring_id_maps = g_new0(SVQElement *, num);
> >       event_notifier_set_handler(&svq->call_notifier,
> >                                  vhost_svq_handle_call);
> >       return g_steal_pointer(&svq);
> > diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
> > index a9c680b487..f5a12fee9d 100644
> > --- a/hw/virtio/vhost-vdpa.c
> > +++ b/hw/virtio/vhost-vdpa.c
> > @@ -176,6 +176,18 @@ static void vhost_vdpa_listener_region_add(MemoryListener *listener,
> >                                            vaddr, section->readonly);
> >
> >       llsize = int128_sub(llend, int128_make64(iova));
> > +    if (v->shadow_vqs_enabled) {
> > +        VhostDMAMap mem_region = {
> > +            .translated_addr = vaddr,
> > +            .size = int128_get64(llsize) - 1,
> > +            .perm = IOMMU_ACCESS_FLAG(true, section->readonly),
> > +        };
> > +
> > +        int r = vhost_iova_tree_alloc(v->iova_map, &mem_region);
> > +        assert(r == VHOST_DMA_MAP_OK);
> > +
> > +        iova = mem_region.iova;
> > +    }
> >
> >       ret = vhost_vdpa_dma_map(v, iova, int128_get64(llsize),
> >                                vaddr, section->readonly);
> > @@ -754,6 +766,23 @@ static bool  vhost_vdpa_force_iommu(struct vhost_dev *dev)
> >       return true;
> >   }
> >
> > +static int vhost_vdpa_get_iova_range(struct vhost_dev *dev,
> > +                                     hwaddr *first, hwaddr *last)
> > +{
> > +    int ret;
> > +    struct vhost_vdpa_iova_range range;
> > +
> > +    ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_IOVA_RANGE, &range);
> > +    if (ret != 0) {
> > +        return ret;
> > +    }
> > +
> > +    *first = range.first;
> > +    *last = range.last;
> > +    trace_vhost_vdpa_get_iova_range(dev, *first, *last);
> > +    return ret;
> > +}
> > +
> >   /**
> >    * Maps QEMU vaddr memory to device in a suitable way for shadow virtqueue:
> >    * - It always reference qemu memory address, not guest's memory.
> > @@ -881,6 +910,7 @@ static bool vhost_vdpa_svq_start_vq(struct vhost_dev *dev, unsigned idx)
> >   static unsigned vhost_vdpa_enable_svq(struct vhost_vdpa *v, bool enable)
> >   {
> >       struct vhost_dev *hdev = v->dev;
> > +    hwaddr iova_first, iova_last;
> >       unsigned n;
> >       int r;
> >
> > @@ -894,7 +924,7 @@ static unsigned vhost_vdpa_enable_svq(struct vhost_vdpa *v, bool enable)
> >           /* Allocate resources */
> >           assert(v->shadow_vqs->len == 0);
> >           for (n = 0; n < hdev->nvqs; ++n) {
> > -            VhostShadowVirtqueue *svq = vhost_svq_new(hdev, n);
> > +            VhostShadowVirtqueue *svq = vhost_svq_new(hdev, n, v->iova_map);
> >               if (unlikely(!svq)) {
> >                   g_ptr_array_set_size(v->shadow_vqs, 0);
> >                   return 0;
> > @@ -903,6 +933,8 @@ static unsigned vhost_vdpa_enable_svq(struct vhost_vdpa *v, bool enable)
> >           }
> >       }
> >
> > +    r = vhost_vdpa_get_iova_range(hdev, &iova_first, &iova_last);
> > +    assert(r == 0);
> >       r = vhost_vdpa_vring_pause(hdev);
> >       assert(r == 0);
> >
> > @@ -913,6 +945,12 @@ static unsigned vhost_vdpa_enable_svq(struct vhost_vdpa *v, bool enable)
> >           }
> >       }
> >
> > +    memory_listener_unregister(&v->listener);
> > +    if (vhost_vdpa_dma_unmap(v, iova_first,
> > +                             (iova_last - iova_first) & TARGET_PAGE_MASK)) {
> > +        error_report("Fail to invalidate device iotlb");
> > +    }
> > +
> >       /* Reset device so it can be configured */
> >       r = vhost_vdpa_dev_start(hdev, false);
> >       assert(r == 0);
> > diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events
> > index 8ed19e9d0c..650e521e35 100644
> > --- a/hw/virtio/trace-events
> > +++ b/hw/virtio/trace-events
> > @@ -52,6 +52,7 @@ vhost_vdpa_set_vring_call(void *dev, unsigned int index, int fd) "dev: %p index:
> >   vhost_vdpa_get_features(void *dev, uint64_t features) "dev: %p features: 0x%"PRIx64
> >   vhost_vdpa_set_owner(void *dev) "dev: %p"
> >   vhost_vdpa_vq_get_addr(void *dev, void *vq, uint64_t desc_user_addr, uint64_t avail_user_addr, uint64_t used_user_addr) "dev: %p vq: %p desc_user_addr: 0x%"PRIx64" avail_user_addr: 0x%"PRIx64" used_user_addr: 0x%"PRIx64
> > +vhost_vdpa_get_iova_range(void *dev, uint64_t first, uint64_t last) "dev: %p first: 0x%"PRIx64" last: 0x%"PRIx64
> >
> >   # virtio.c
> >   virtqueue_alloc_element(void *elem, size_t sz, unsigned in_num, unsigned out_num) "elem %p size %zd in_num %u out_num %u"
>


Re: [RFC PATCH v4 20/20] vdpa: Add custom IOTLB translations to SVQ
Posted by Jason Wang 4 years, 3 months ago
On Tue, Oct 19, 2021 at 6:29 PM Eugenio Perez Martin
<eperezma@redhat.com> wrote:
>
> On Tue, Oct 19, 2021 at 11:25 AM Jason Wang <jasowang@redhat.com> wrote:
> >
> >
> > On 2021/10/1 3:06 PM, Eugenio Pérez wrote:
> > > Use translations added in VhostIOVATree in SVQ.
> > >
> > > Now every element needs to store the previous address also, so VirtQueue
> > > can consume the elements properly. This adds a little overhead per VQ
> > > element, having to allocate more memory to stash them. As a possible
> > > optimization, this allocation could be avoided if the descriptor is not
> > > a chain but a single one, but this is left undone.
> > >
> > > TODO: iova range should be queried before, and add logic to fail when
> > > GPA is outside of its range and memory listener or svq add it.
> > >
> > > Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
> > > ---
> > >   hw/virtio/vhost-shadow-virtqueue.h |   4 +-
> > >   hw/virtio/vhost-shadow-virtqueue.c | 130 ++++++++++++++++++++++++-----
> > >   hw/virtio/vhost-vdpa.c             |  40 ++++++++-
> > >   hw/virtio/trace-events             |   1 +
> > >   4 files changed, 152 insertions(+), 23 deletions(-)
> > >
> > > diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
> > > index b7baa424a7..a0e6b5267a 100644
> > > --- a/hw/virtio/vhost-shadow-virtqueue.h
> > > +++ b/hw/virtio/vhost-shadow-virtqueue.h
> > > @@ -11,6 +11,7 @@
> > >   #define VHOST_SHADOW_VIRTQUEUE_H
> > >
> > >   #include "hw/virtio/vhost.h"
> > > +#include "hw/virtio/vhost-iova-tree.h"
> > >
> > >   typedef struct VhostShadowVirtqueue VhostShadowVirtqueue;
> > >
> > > @@ -28,7 +29,8 @@ bool vhost_svq_start(struct vhost_dev *dev, unsigned idx,
> > >   void vhost_svq_stop(struct vhost_dev *dev, unsigned idx,
> > >                       VhostShadowVirtqueue *svq);
> > >
> > > -VhostShadowVirtqueue *vhost_svq_new(struct vhost_dev *dev, int idx);
> > > +VhostShadowVirtqueue *vhost_svq_new(struct vhost_dev *dev, int idx,
> > > +                                    VhostIOVATree *iova_map);
> > >
> > >   void vhost_svq_free(VhostShadowVirtqueue *vq);
> > >
> > > diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
> > > index 2fd0bab75d..9db538547e 100644
> > > --- a/hw/virtio/vhost-shadow-virtqueue.c
> > > +++ b/hw/virtio/vhost-shadow-virtqueue.c
> > > @@ -11,12 +11,19 @@
> > >   #include "hw/virtio/vhost-shadow-virtqueue.h"
> > >   #include "hw/virtio/vhost.h"
> > >   #include "hw/virtio/virtio-access.h"
> > > +#include "hw/virtio/vhost-iova-tree.h"
> > >
> > >   #include "standard-headers/linux/vhost_types.h"
> > >
> > >   #include "qemu/error-report.h"
> > >   #include "qemu/main-loop.h"
> > >
> > > +typedef struct SVQElement {
> > > +    VirtQueueElement elem;
> > > +    void **in_sg_stash;
> > > +    void **out_sg_stash;
> > > +} SVQElement;
> > > +
> > >   /* Shadow virtqueue to relay notifications */
> > >   typedef struct VhostShadowVirtqueue {
> > >       /* Shadow vring */
> > > @@ -46,8 +53,11 @@ typedef struct VhostShadowVirtqueue {
> > >       /* Virtio device */
> > >       VirtIODevice *vdev;
> > >
> > > +    /* IOVA mapping if used */
> > > +    VhostIOVATree *iova_map;
> > > +
> > >       /* Map for returning guest's descriptors */
> > > -    VirtQueueElement **ring_id_maps;
> > > +    SVQElement **ring_id_maps;
> > >
> > >       /* Next head to expose to device */
> > >       uint16_t avail_idx_shadow;
> > > @@ -79,13 +89,6 @@ bool vhost_svq_valid_device_features(uint64_t *dev_features)
> > >               continue;
> > >
> > >           case VIRTIO_F_ACCESS_PLATFORM:
> > > -            /* SVQ needs this feature disabled. Can't continue */
> > > -            if (*dev_features & BIT_ULL(b)) {
> > > -                clear_bit(b, dev_features);
> > > -                r = false;
> > > -            }
> > > -            break;
> > > -
> > >           case VIRTIO_F_VERSION_1:
> > >               /* SVQ needs this feature, so can't continue */
> > >               if (!(*dev_features & BIT_ULL(b))) {
> > > @@ -126,6 +129,64 @@ static void vhost_svq_set_notification(VhostShadowVirtqueue *svq, bool enable)
> > >       }
> > >   }
> > >
> > > +static void vhost_svq_stash_addr(void ***stash, const struct iovec *iov,
> > > +                                 size_t num)
> > > +{
> > > +    size_t i;
> > > +
> > > +    if (num == 0) {
> > > +        return;
> > > +    }
> > > +
> > > +    *stash = g_new(void *, num);
> > > +    for (i = 0; i < num; ++i) {
> > > +        (*stash)[i] = iov[i].iov_base;
> > > +    }
> > > +}
> > > +
> > > +static void vhost_svq_unstash_addr(void **stash, struct iovec *iov, size_t num)
> > > +{
> > > +    size_t i;
> > > +
> > > +    if (num == 0) {
> > > +        return;
> > > +    }
> > > +
> > > +    for (i = 0; i < num; ++i) {
> > > +        iov[i].iov_base = stash[i];
> > > +    }
> > > +    g_free(stash);
> > > +}
> > > +
> > > +static void vhost_svq_translate_addr(const VhostShadowVirtqueue *svq,
> > > +                                     struct iovec *iovec, size_t num)
> > > +{
> > > +    size_t i;
> > > +
> > > +    for (i = 0; i < num; ++i) {
> > > +        VhostDMAMap needle = {
> > > +            .translated_addr = iovec[i].iov_base,
> > > +            .size = iovec[i].iov_len,
> > > +        };
> > > +        size_t off;
> > > +
> > > +        const VhostDMAMap *map = vhost_iova_tree_find_iova(svq->iova_map,
> > > +                                                           &needle);
> >
> >
> > Is it possible that we end up with more than one maps here?
> >
>
> Actually it is possible, since there is no guarantee that one
> descriptor (or indirect descriptor) maps exactly to one iov. It could
> map to many if qemu vaddr is not contiguous but GPA + size is. This is
> something that must be fixed for the next revision, so thanks for
> pointing it out!
>
> Taking that into account, the condition that svq vring avail_idx -
> used_idx was always less or equal than guest's vring avail_idx -
> used_idx is not true anymore. Checking for that before adding buffers
> to SVQ is the easy part, but how could we recover in that case?
>
> I think that the easy solution is to check for more available buffers
> unconditionally at the end of vhost_svq_handle_call, which handles the
> SVQ used and is supposed to make more room for available buffers. So
> vhost_handle_guest_kick would not check if eventfd is set or not
> anymore.
>
> Would that make sense?

Yes, I think it should work.

Thanks

>
> Thanks!
>
> >
> > > +        /*
> > > +         * Map cannot be NULL since iova map contains all guest space and
> > > +         * qemu already has a physical address mapped
> > > +         */
> > > +        assert(map);
> > > +
> > > +        /*
> > > +         * Map->iova chunk size is ignored. What to do if descriptor
> > > +         * (addr, size) does not fit is delegated to the device.
> > > +         */
> > > +        off = needle.translated_addr - map->translated_addr;
> > > +        iovec[i].iov_base = (void *)(map->iova + off);
> > > +    }
> > > +}
> > > +
> > >   static void vhost_vring_write_descs(VhostShadowVirtqueue *svq,
> > >                                       const struct iovec *iovec,
> > >                                       size_t num, bool more_descs, bool write)
> > > @@ -156,8 +217,9 @@ static void vhost_vring_write_descs(VhostShadowVirtqueue *svq,
> > >   }
> > >
> > >   static unsigned vhost_svq_add_split(VhostShadowVirtqueue *svq,
> > > -                                    VirtQueueElement *elem)
> > > +                                    SVQElement *svq_elem)
> > >   {
> > > +    VirtQueueElement *elem = &svq_elem->elem;
> > >       int head;
> > >       unsigned avail_idx;
> > >       vring_avail_t *avail = svq->vring.avail;
> > > @@ -167,6 +229,12 @@ static unsigned vhost_svq_add_split(VhostShadowVirtqueue *svq,
> > >       /* We need some descriptors here */
> > >       assert(elem->out_num || elem->in_num);
> > >
> > > +    vhost_svq_stash_addr(&svq_elem->in_sg_stash, elem->in_sg, elem->in_num);
> > > +    vhost_svq_stash_addr(&svq_elem->out_sg_stash, elem->out_sg, elem->out_num);
> >
> >
> > I wonder if we can solve the trick like stash and unstash with a
> > dedicated sgs in svq_elem, instead of reusing the elem.
> >
>
> Actually yes, it would be way simpler to use a new sgs array in
> svq_elem. I will change that.
>
> Thanks!
>
> > Thanks
> >
> >
> > > +
> > > +    vhost_svq_translate_addr(svq, elem->in_sg, elem->in_num);
> > > +    vhost_svq_translate_addr(svq, elem->out_sg, elem->out_num);
> > > +
> > >       vhost_vring_write_descs(svq, elem->out_sg, elem->out_num,
> > >                               elem->in_num > 0, false);
> > >       vhost_vring_write_descs(svq, elem->in_sg, elem->in_num, false, true);
> > > @@ -187,7 +255,7 @@ static unsigned vhost_svq_add_split(VhostShadowVirtqueue *svq,
> > >
> > >   }
> > >
> > > -static void vhost_svq_add(VhostShadowVirtqueue *svq, VirtQueueElement *elem)
> > > +static void vhost_svq_add(VhostShadowVirtqueue *svq, SVQElement *elem)
> > >   {
> > >       unsigned qemu_head = vhost_svq_add_split(svq, elem);
> > >
> > > @@ -221,7 +289,7 @@ static void vhost_handle_guest_kick(EventNotifier *n)
> > >           }
> > >
> > >           while (true) {
> > > -            VirtQueueElement *elem = virtqueue_pop(svq->vq, sizeof(*elem));
> > > +            SVQElement *elem = virtqueue_pop(svq->vq, sizeof(*elem));
> > >               if (!elem) {
> > >                   break;
> > >               }
> > > @@ -247,7 +315,7 @@ static bool vhost_svq_more_used(VhostShadowVirtqueue *svq)
> > >       return svq->used_idx != svq->shadow_used_idx;
> > >   }
> > >
> > > -static VirtQueueElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq)
> > > +static SVQElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq)
> > >   {
> > >       vring_desc_t *descs = svq->vring.desc;
> > >       const vring_used_t *used = svq->vring.used;
> > > @@ -279,7 +347,7 @@ static VirtQueueElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq)
> > >       descs[used_elem.id].next = svq->free_head;
> > >       svq->free_head = used_elem.id;
> > >
> > > -    svq->ring_id_maps[used_elem.id]->len = used_elem.len;
> > > +    svq->ring_id_maps[used_elem.id]->elem.len = used_elem.len;
> > >       return g_steal_pointer(&svq->ring_id_maps[used_elem.id]);
> > >   }
> > >
> > > @@ -296,12 +364,19 @@ static void vhost_svq_handle_call_no_test(EventNotifier *n)
> > >
> > >           vhost_svq_set_notification(svq, false);
> > >           while (true) {
> > > -            g_autofree VirtQueueElement *elem = vhost_svq_get_buf(svq);
> > > -            if (!elem) {
> > > +            g_autofree SVQElement *svq_elem = vhost_svq_get_buf(svq);
> > > +            VirtQueueElement *elem;
> > > +            if (!svq_elem) {
> > >                   break;
> > >               }
> > >
> > >               assert(i < svq->vring.num);
> > > +            elem = &svq_elem->elem;
> > > +
> > > +            vhost_svq_unstash_addr(svq_elem->in_sg_stash, elem->in_sg,
> > > +                                   elem->in_num);
> > > +            vhost_svq_unstash_addr(svq_elem->out_sg_stash, elem->out_sg,
> > > +                                   elem->out_num);
> > >               virtqueue_fill(vq, elem, elem->len, i++);
> > >           }
> > >
> > > @@ -451,14 +526,24 @@ void vhost_svq_stop(struct vhost_dev *dev, unsigned idx,
> > >       event_notifier_set_handler(&svq->host_notifier, NULL);
> > >
> > >       for (i = 0; i < svq->vring.num; ++i) {
> > > -        g_autofree VirtQueueElement *elem = svq->ring_id_maps[i];
> > > +        g_autofree SVQElement *svq_elem = svq->ring_id_maps[i];
> > > +        VirtQueueElement *elem;
> > > +
> > > +        if (!svq_elem) {
> > > +            continue;
> > > +        }
> > > +
> > > +        elem = &svq_elem->elem;
> > > +        vhost_svq_unstash_addr(svq_elem->in_sg_stash, elem->in_sg,
> > > +                               elem->in_num);
> > > +        vhost_svq_unstash_addr(svq_elem->out_sg_stash, elem->out_sg,
> > > +                               elem->out_num);
> > > +
> > >           /*
> > >            * Although the doc says we must unpop in order, it's ok to unpop
> > >            * everything.
> > >            */
> > > -        if (elem) {
> > > -            virtqueue_unpop(svq->vq, elem, elem->len);
> > > -        }
> > > +        virtqueue_unpop(svq->vq, elem, elem->len);
> > >       }
> > >   }
> > >
> > > @@ -466,7 +551,8 @@ void vhost_svq_stop(struct vhost_dev *dev, unsigned idx,
> > >    * Creates vhost shadow virtqueue, and instruct vhost device to use the shadow
> > >    * methods and file descriptors.
> > >    */
> > > -VhostShadowVirtqueue *vhost_svq_new(struct vhost_dev *dev, int idx)
> > > +VhostShadowVirtqueue *vhost_svq_new(struct vhost_dev *dev, int idx,
> > > +                                    VhostIOVATree *iova_map)
> > >   {
> > >       int vq_idx = dev->vq_index + idx;
> > >       unsigned num = virtio_queue_get_num(dev->vdev, vq_idx);
> > > @@ -500,11 +586,13 @@ VhostShadowVirtqueue *vhost_svq_new(struct vhost_dev *dev, int idx)
> > >       memset(svq->vring.desc, 0, driver_size);
> > >       svq->vring.used = qemu_memalign(qemu_real_host_page_size, device_size);
> > >       memset(svq->vring.used, 0, device_size);
> > > +    svq->iova_map = iova_map;
> > > +
> > >       for (i = 0; i < num - 1; i++) {
> > >           svq->vring.desc[i].next = cpu_to_le16(i + 1);
> > >       }
> > >
> > > -    svq->ring_id_maps = g_new0(VirtQueueElement *, num);
> > > +    svq->ring_id_maps = g_new0(SVQElement *, num);
> > >       event_notifier_set_handler(&svq->call_notifier,
> > >                                  vhost_svq_handle_call);
> > >       return g_steal_pointer(&svq);
> > > diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
> > > index a9c680b487..f5a12fee9d 100644
> > > --- a/hw/virtio/vhost-vdpa.c
> > > +++ b/hw/virtio/vhost-vdpa.c
> > > @@ -176,6 +176,18 @@ static void vhost_vdpa_listener_region_add(MemoryListener *listener,
> > >                                            vaddr, section->readonly);
> > >
> > >       llsize = int128_sub(llend, int128_make64(iova));
> > > +    if (v->shadow_vqs_enabled) {
> > > +        VhostDMAMap mem_region = {
> > > +            .translated_addr = vaddr,
> > > +            .size = int128_get64(llsize) - 1,
> > > +            .perm = IOMMU_ACCESS_FLAG(true, section->readonly),
> > > +        };
> > > +
> > > +        int r = vhost_iova_tree_alloc(v->iova_map, &mem_region);
> > > +        assert(r == VHOST_DMA_MAP_OK);
> > > +
> > > +        iova = mem_region.iova;
> > > +    }
> > >
> > >       ret = vhost_vdpa_dma_map(v, iova, int128_get64(llsize),
> > >                                vaddr, section->readonly);
> > > @@ -754,6 +766,23 @@ static bool  vhost_vdpa_force_iommu(struct vhost_dev *dev)
> > >       return true;
> > >   }
> > >
> > > +static int vhost_vdpa_get_iova_range(struct vhost_dev *dev,
> > > +                                     hwaddr *first, hwaddr *last)
> > > +{
> > > +    int ret;
> > > +    struct vhost_vdpa_iova_range range;
> > > +
> > > +    ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_IOVA_RANGE, &range);
> > > +    if (ret != 0) {
> > > +        return ret;
> > > +    }
> > > +
> > > +    *first = range.first;
> > > +    *last = range.last;
> > > +    trace_vhost_vdpa_get_iova_range(dev, *first, *last);
> > > +    return ret;
> > > +}
> > > +
> > >   /**
> > >    * Maps QEMU vaddr memory to device in a suitable way for shadow virtqueue:
> > >    * - It always reference qemu memory address, not guest's memory.
> > > @@ -881,6 +910,7 @@ static bool vhost_vdpa_svq_start_vq(struct vhost_dev *dev, unsigned idx)
> > >   static unsigned vhost_vdpa_enable_svq(struct vhost_vdpa *v, bool enable)
> > >   {
> > >       struct vhost_dev *hdev = v->dev;
> > > +    hwaddr iova_first, iova_last;
> > >       unsigned n;
> > >       int r;
> > >
> > > @@ -894,7 +924,7 @@ static unsigned vhost_vdpa_enable_svq(struct vhost_vdpa *v, bool enable)
> > >           /* Allocate resources */
> > >           assert(v->shadow_vqs->len == 0);
> > >           for (n = 0; n < hdev->nvqs; ++n) {
> > > -            VhostShadowVirtqueue *svq = vhost_svq_new(hdev, n);
> > > +            VhostShadowVirtqueue *svq = vhost_svq_new(hdev, n, v->iova_map);
> > >               if (unlikely(!svq)) {
> > >                   g_ptr_array_set_size(v->shadow_vqs, 0);
> > >                   return 0;
> > > @@ -903,6 +933,8 @@ static unsigned vhost_vdpa_enable_svq(struct vhost_vdpa *v, bool enable)
> > >           }
> > >       }
> > >
> > > +    r = vhost_vdpa_get_iova_range(hdev, &iova_first, &iova_last);
> > > +    assert(r == 0);
> > >       r = vhost_vdpa_vring_pause(hdev);
> > >       assert(r == 0);
> > >
> > > @@ -913,6 +945,12 @@ static unsigned vhost_vdpa_enable_svq(struct vhost_vdpa *v, bool enable)
> > >           }
> > >       }
> > >
> > > +    memory_listener_unregister(&v->listener);
> > > +    if (vhost_vdpa_dma_unmap(v, iova_first,
> > > +                             (iova_last - iova_first) & TARGET_PAGE_MASK)) {
> > > +        error_report("Fail to invalidate device iotlb");
> > > +    }
> > > +
> > >       /* Reset device so it can be configured */
> > >       r = vhost_vdpa_dev_start(hdev, false);
> > >       assert(r == 0);
> > > diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events
> > > index 8ed19e9d0c..650e521e35 100644
> > > --- a/hw/virtio/trace-events
> > > +++ b/hw/virtio/trace-events
> > > @@ -52,6 +52,7 @@ vhost_vdpa_set_vring_call(void *dev, unsigned int index, int fd) "dev: %p index:
> > >   vhost_vdpa_get_features(void *dev, uint64_t features) "dev: %p features: 0x%"PRIx64
> > >   vhost_vdpa_set_owner(void *dev) "dev: %p"
> > >   vhost_vdpa_vq_get_addr(void *dev, void *vq, uint64_t desc_user_addr, uint64_t avail_user_addr, uint64_t used_user_addr) "dev: %p vq: %p desc_user_addr: 0x%"PRIx64" avail_user_addr: 0x%"PRIx64" used_user_addr: 0x%"PRIx64
> > > +vhost_vdpa_get_iova_range(void *dev, uint64_t first, uint64_t last) "dev: %p first: 0x%"PRIx64" last: 0x%"PRIx64
> > >
> > >   # virtio.c
> > >   virtqueue_alloc_element(void *elem, size_t sz, unsigned in_num, unsigned out_num) "elem %p size %zd in_num %u out_num %u"
> >
>


Re: [RFC PATCH v4 20/20] vdpa: Add custom IOTLB translations to SVQ
Posted by Jason Wang 4 years, 3 months ago
On Wed, Oct 20, 2021 at 10:02 AM Jason Wang <jasowang@redhat.com> wrote:
>
> On Tue, Oct 19, 2021 at 6:29 PM Eugenio Perez Martin
> <eperezma@redhat.com> wrote:
> >
> > On Tue, Oct 19, 2021 at 11:25 AM Jason Wang <jasowang@redhat.com> wrote:
> > >
> > >
> > > On 2021/10/1 3:06 PM, Eugenio Pérez wrote:
> > > > Use translations added in VhostIOVATree in SVQ.
> > > >
> > > > Now every element needs to store the previous address also, so VirtQueue
> > > > can consume the elements properly. This adds a little overhead per VQ
> > > > element, having to allocate more memory to stash them. As a possible
> > > > optimization, this allocation could be avoided if the descriptor is not
> > > > a chain but a single one, but this is left undone.
> > > >
> > > > TODO: iova range should be queried before, and add logic to fail when
> > > > GPA is outside of its range and memory listener or svq add it.
> > > >
> > > > Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
> > > > ---
> > > >   hw/virtio/vhost-shadow-virtqueue.h |   4 +-
> > > >   hw/virtio/vhost-shadow-virtqueue.c | 130 ++++++++++++++++++++++++-----
> > > >   hw/virtio/vhost-vdpa.c             |  40 ++++++++-
> > > >   hw/virtio/trace-events             |   1 +
> > > >   4 files changed, 152 insertions(+), 23 deletions(-)
> > > >
> > > > diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
> > > > index b7baa424a7..a0e6b5267a 100644
> > > > --- a/hw/virtio/vhost-shadow-virtqueue.h
> > > > +++ b/hw/virtio/vhost-shadow-virtqueue.h
> > > > @@ -11,6 +11,7 @@
> > > >   #define VHOST_SHADOW_VIRTQUEUE_H
> > > >
> > > >   #include "hw/virtio/vhost.h"
> > > > +#include "hw/virtio/vhost-iova-tree.h"
> > > >
> > > >   typedef struct VhostShadowVirtqueue VhostShadowVirtqueue;
> > > >
> > > > @@ -28,7 +29,8 @@ bool vhost_svq_start(struct vhost_dev *dev, unsigned idx,
> > > >   void vhost_svq_stop(struct vhost_dev *dev, unsigned idx,
> > > >                       VhostShadowVirtqueue *svq);
> > > >
> > > > -VhostShadowVirtqueue *vhost_svq_new(struct vhost_dev *dev, int idx);
> > > > +VhostShadowVirtqueue *vhost_svq_new(struct vhost_dev *dev, int idx,
> > > > +                                    VhostIOVATree *iova_map);
> > > >
> > > >   void vhost_svq_free(VhostShadowVirtqueue *vq);
> > > >
> > > > diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
> > > > index 2fd0bab75d..9db538547e 100644
> > > > --- a/hw/virtio/vhost-shadow-virtqueue.c
> > > > +++ b/hw/virtio/vhost-shadow-virtqueue.c
> > > > @@ -11,12 +11,19 @@
> > > >   #include "hw/virtio/vhost-shadow-virtqueue.h"
> > > >   #include "hw/virtio/vhost.h"
> > > >   #include "hw/virtio/virtio-access.h"
> > > > +#include "hw/virtio/vhost-iova-tree.h"
> > > >
> > > >   #include "standard-headers/linux/vhost_types.h"
> > > >
> > > >   #include "qemu/error-report.h"
> > > >   #include "qemu/main-loop.h"
> > > >
> > > > +typedef struct SVQElement {
> > > > +    VirtQueueElement elem;
> > > > +    void **in_sg_stash;
> > > > +    void **out_sg_stash;
> > > > +} SVQElement;
> > > > +
> > > >   /* Shadow virtqueue to relay notifications */
> > > >   typedef struct VhostShadowVirtqueue {
> > > >       /* Shadow vring */
> > > > @@ -46,8 +53,11 @@ typedef struct VhostShadowVirtqueue {
> > > >       /* Virtio device */
> > > >       VirtIODevice *vdev;
> > > >
> > > > +    /* IOVA mapping if used */
> > > > +    VhostIOVATree *iova_map;
> > > > +
> > > >       /* Map for returning guest's descriptors */
> > > > -    VirtQueueElement **ring_id_maps;
> > > > +    SVQElement **ring_id_maps;
> > > >
> > > >       /* Next head to expose to device */
> > > >       uint16_t avail_idx_shadow;
> > > > @@ -79,13 +89,6 @@ bool vhost_svq_valid_device_features(uint64_t *dev_features)
> > > >               continue;
> > > >
> > > >           case VIRTIO_F_ACCESS_PLATFORM:
> > > > -            /* SVQ needs this feature disabled. Can't continue */
> > > > -            if (*dev_features & BIT_ULL(b)) {
> > > > -                clear_bit(b, dev_features);
> > > > -                r = false;
> > > > -            }
> > > > -            break;
> > > > -
> > > >           case VIRTIO_F_VERSION_1:
> > > >               /* SVQ needs this feature, so can't continue */
> > > >               if (!(*dev_features & BIT_ULL(b))) {
> > > > @@ -126,6 +129,64 @@ static void vhost_svq_set_notification(VhostShadowVirtqueue *svq, bool enable)
> > > >       }
> > > >   }
> > > >
> > > > +static void vhost_svq_stash_addr(void ***stash, const struct iovec *iov,
> > > > +                                 size_t num)
> > > > +{
> > > > +    size_t i;
> > > > +
> > > > +    if (num == 0) {
> > > > +        return;
> > > > +    }
> > > > +
> > > > +    *stash = g_new(void *, num);
> > > > +    for (i = 0; i < num; ++i) {
> > > > +        (*stash)[i] = iov[i].iov_base;
> > > > +    }
> > > > +}
> > > > +
> > > > +static void vhost_svq_unstash_addr(void **stash, struct iovec *iov, size_t num)
> > > > +{
> > > > +    size_t i;
> > > > +
> > > > +    if (num == 0) {
> > > > +        return;
> > > > +    }
> > > > +
> > > > +    for (i = 0; i < num; ++i) {
> > > > +        iov[i].iov_base = stash[i];
> > > > +    }
> > > > +    g_free(stash);
> > > > +}
> > > > +
> > > > +static void vhost_svq_translate_addr(const VhostShadowVirtqueue *svq,
> > > > +                                     struct iovec *iovec, size_t num)
> > > > +{
> > > > +    size_t i;
> > > > +
> > > > +    for (i = 0; i < num; ++i) {
> > > > +        VhostDMAMap needle = {
> > > > +            .translated_addr = iovec[i].iov_base,
> > > > +            .size = iovec[i].iov_len,
> > > > +        };
> > > > +        size_t off;
> > > > +
> > > > +        const VhostDMAMap *map = vhost_iova_tree_find_iova(svq->iova_map,
> > > > +                                                           &needle);
> > >
> > >
> > > Is it possible that we end up with more than one maps here?
> > >
> >
> > Actually it is possible, since there is no guarantee that one
> > descriptor (or indirect descriptor) maps exactly to one iov. It could
> > map to many if qemu vaddr is not contiguous but GPA + size is. This is
> > something that must be fixed for the next revision, so thanks for
> > pointing it out!
> >
> > Taking that into account, the condition that svq vring avail_idx -
> > used_idx was always less or equal than guest's vring avail_idx -
> > used_idx is not true anymore. Checking for that before adding buffers
> > to SVQ is the easy part, but how could we recover in that case?
> >
> > I think that the easy solution is to check for more available buffers
> > unconditionally at the end of vhost_svq_handle_call, which handles the
> > SVQ used and is supposed to make more room for available buffers. So
> > vhost_handle_guest_kick would not check if eventfd is set or not
> > anymore.
> >
> > Would that make sense?
>
> Yes, I think it should work.

Btw, I wonder how to handle indirect descriptors. SVQ doesn't use
indirect descriptors for now, but it looks like a must, otherwise we
may end up with SVQ full before the guest VQ.

It looks to me like an easy way is to always use indirect descriptors
when #sg >= 2?
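
E.g. something like this for the check (sketch only; the indirect
table allocation and the VRING_DESC_F_INDIRECT plumbing are not
shown):

    bool use_indirect = elem->out_num + elem->in_num >= 2 &&
                        virtio_vdev_has_feature(svq->vdev,
                                                VIRTIO_RING_F_INDIRECT_DESC);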

Thanks

>
> Thanks
>
> >
> > Thanks!
> >
> > >
> > > > +        /*
> > > > +         * Map cannot be NULL since iova map contains all guest space and
> > > > +         * qemu already has a physical address mapped
> > > > +         */
> > > > +        assert(map);
> > > > +
> > > > +        /*
> > > > +         * Map->iova chunk size is ignored. What to do if descriptor
> > > > +         * (addr, size) does not fit is delegated to the device.
> > > > +         */
> > > > +        off = needle.translated_addr - map->translated_addr;
> > > > +        iovec[i].iov_base = (void *)(map->iova + off);
> > > > +    }
> > > > +}
> > > > +
> > > >   static void vhost_vring_write_descs(VhostShadowVirtqueue *svq,
> > > >                                       const struct iovec *iovec,
> > > >                                       size_t num, bool more_descs, bool write)
> > > > @@ -156,8 +217,9 @@ static void vhost_vring_write_descs(VhostShadowVirtqueue *svq,
> > > >   }
> > > >
> > > >   static unsigned vhost_svq_add_split(VhostShadowVirtqueue *svq,
> > > > -                                    VirtQueueElement *elem)
> > > > +                                    SVQElement *svq_elem)
> > > >   {
> > > > +    VirtQueueElement *elem = &svq_elem->elem;
> > > >       int head;
> > > >       unsigned avail_idx;
> > > >       vring_avail_t *avail = svq->vring.avail;
> > > > @@ -167,6 +229,12 @@ static unsigned vhost_svq_add_split(VhostShadowVirtqueue *svq,
> > > >       /* We need some descriptors here */
> > > >       assert(elem->out_num || elem->in_num);
> > > >
> > > > +    vhost_svq_stash_addr(&svq_elem->in_sg_stash, elem->in_sg, elem->in_num);
> > > > +    vhost_svq_stash_addr(&svq_elem->out_sg_stash, elem->out_sg, elem->out_num);
> > >
> > >
> > > I wonder if we can solve the trick like stash and unstash with a
> > > dedicated sgs in svq_elem, instead of reusing the elem.
> > >
> >
> > Actually yes, it would be way simpler to use a new sgs array in
> > svq_elem. I will change that.
> >
> > Thanks!
> >
> > > Thanks
> > >
> > >
> > > > +
> > > > +    vhost_svq_translate_addr(svq, elem->in_sg, elem->in_num);
> > > > +    vhost_svq_translate_addr(svq, elem->out_sg, elem->out_num);
> > > > +
> > > >       vhost_vring_write_descs(svq, elem->out_sg, elem->out_num,
> > > >                               elem->in_num > 0, false);
> > > >       vhost_vring_write_descs(svq, elem->in_sg, elem->in_num, false, true);
> > > > @@ -187,7 +255,7 @@ static unsigned vhost_svq_add_split(VhostShadowVirtqueue *svq,
> > > >
> > > >   }
> > > >
> > > > -static void vhost_svq_add(VhostShadowVirtqueue *svq, VirtQueueElement *elem)
> > > > +static void vhost_svq_add(VhostShadowVirtqueue *svq, SVQElement *elem)
> > > >   {
> > > >       unsigned qemu_head = vhost_svq_add_split(svq, elem);
> > > >
> > > > @@ -221,7 +289,7 @@ static void vhost_handle_guest_kick(EventNotifier *n)
> > > >           }
> > > >
> > > >           while (true) {
> > > > -            VirtQueueElement *elem = virtqueue_pop(svq->vq, sizeof(*elem));
> > > > +            SVQElement *elem = virtqueue_pop(svq->vq, sizeof(*elem));
> > > >               if (!elem) {
> > > >                   break;
> > > >               }
> > > > @@ -247,7 +315,7 @@ static bool vhost_svq_more_used(VhostShadowVirtqueue *svq)
> > > >       return svq->used_idx != svq->shadow_used_idx;
> > > >   }
> > > >
> > > > -static VirtQueueElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq)
> > > > +static SVQElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq)
> > > >   {
> > > >       vring_desc_t *descs = svq->vring.desc;
> > > >       const vring_used_t *used = svq->vring.used;
> > > > @@ -279,7 +347,7 @@ static VirtQueueElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq)
> > > >       descs[used_elem.id].next = svq->free_head;
> > > >       svq->free_head = used_elem.id;
> > > >
> > > > -    svq->ring_id_maps[used_elem.id]->len = used_elem.len;
> > > > +    svq->ring_id_maps[used_elem.id]->elem.len = used_elem.len;
> > > >       return g_steal_pointer(&svq->ring_id_maps[used_elem.id]);
> > > >   }
> > > >
> > > > @@ -296,12 +364,19 @@ static void vhost_svq_handle_call_no_test(EventNotifier *n)
> > > >
> > > >           vhost_svq_set_notification(svq, false);
> > > >           while (true) {
> > > > -            g_autofree VirtQueueElement *elem = vhost_svq_get_buf(svq);
> > > > -            if (!elem) {
> > > > +            g_autofree SVQElement *svq_elem = vhost_svq_get_buf(svq);
> > > > +            VirtQueueElement *elem;
> > > > +            if (!svq_elem) {
> > > >                   break;
> > > >               }
> > > >
> > > >               assert(i < svq->vring.num);
> > > > +            elem = &svq_elem->elem;
> > > > +
> > > > +            vhost_svq_unstash_addr(svq_elem->in_sg_stash, elem->in_sg,
> > > > +                                   elem->in_num);
> > > > +            vhost_svq_unstash_addr(svq_elem->out_sg_stash, elem->out_sg,
> > > > +                                   elem->out_num);
> > > >               virtqueue_fill(vq, elem, elem->len, i++);
> > > >           }
> > > >
> > > > @@ -451,14 +526,24 @@ void vhost_svq_stop(struct vhost_dev *dev, unsigned idx,
> > > >       event_notifier_set_handler(&svq->host_notifier, NULL);
> > > >
> > > >       for (i = 0; i < svq->vring.num; ++i) {
> > > > -        g_autofree VirtQueueElement *elem = svq->ring_id_maps[i];
> > > > +        g_autofree SVQElement *svq_elem = svq->ring_id_maps[i];
> > > > +        VirtQueueElement *elem;
> > > > +
> > > > +        if (!svq_elem) {
> > > > +            continue;
> > > > +        }
> > > > +
> > > > +        elem = &svq_elem->elem;
> > > > +        vhost_svq_unstash_addr(svq_elem->in_sg_stash, elem->in_sg,
> > > > +                               elem->in_num);
> > > > +        vhost_svq_unstash_addr(svq_elem->out_sg_stash, elem->out_sg,
> > > > +                               elem->out_num);
> > > > +
> > > >           /*
> > > >            * Although the doc says we must unpop in order, it's ok to unpop
> > > >            * everything.
> > > >            */
> > > > -        if (elem) {
> > > > -            virtqueue_unpop(svq->vq, elem, elem->len);
> > > > -        }
> > > > +        virtqueue_unpop(svq->vq, elem, elem->len);
> > > >       }
> > > >   }
> > > >
> > > > @@ -466,7 +551,8 @@ void vhost_svq_stop(struct vhost_dev *dev, unsigned idx,
> > > >    * Creates vhost shadow virtqueue, and instruct vhost device to use the shadow
> > > >    * methods and file descriptors.
> > > >    */
> > > > -VhostShadowVirtqueue *vhost_svq_new(struct vhost_dev *dev, int idx)
> > > > +VhostShadowVirtqueue *vhost_svq_new(struct vhost_dev *dev, int idx,
> > > > +                                    VhostIOVATree *iova_map)
> > > >   {
> > > >       int vq_idx = dev->vq_index + idx;
> > > >       unsigned num = virtio_queue_get_num(dev->vdev, vq_idx);
> > > > @@ -500,11 +586,13 @@ VhostShadowVirtqueue *vhost_svq_new(struct vhost_dev *dev, int idx)
> > > >       memset(svq->vring.desc, 0, driver_size);
> > > >       svq->vring.used = qemu_memalign(qemu_real_host_page_size, device_size);
> > > >       memset(svq->vring.used, 0, device_size);
> > > > +    svq->iova_map = iova_map;
> > > > +
> > > >       for (i = 0; i < num - 1; i++) {
> > > >           svq->vring.desc[i].next = cpu_to_le16(i + 1);
> > > >       }
> > > >
> > > > -    svq->ring_id_maps = g_new0(VirtQueueElement *, num);
> > > > +    svq->ring_id_maps = g_new0(SVQElement *, num);
> > > >       event_notifier_set_handler(&svq->call_notifier,
> > > >                                  vhost_svq_handle_call);
> > > >       return g_steal_pointer(&svq);
> > > > diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
> > > > index a9c680b487..f5a12fee9d 100644
> > > > --- a/hw/virtio/vhost-vdpa.c
> > > > +++ b/hw/virtio/vhost-vdpa.c
> > > > @@ -176,6 +176,18 @@ static void vhost_vdpa_listener_region_add(MemoryListener *listener,
> > > >                                            vaddr, section->readonly);
> > > >
> > > >       llsize = int128_sub(llend, int128_make64(iova));
> > > > +    if (v->shadow_vqs_enabled) {
> > > > +        VhostDMAMap mem_region = {
> > > > +            .translated_addr = vaddr,
> > > > +            .size = int128_get64(llsize) - 1,
> > > > +            .perm = IOMMU_ACCESS_FLAG(true, section->readonly),
> > > > +        };
> > > > +
> > > > +        int r = vhost_iova_tree_alloc(v->iova_map, &mem_region);
> > > > +        assert(r == VHOST_DMA_MAP_OK);
> > > > +
> > > > +        iova = mem_region.iova;
> > > > +    }
> > > >
> > > >       ret = vhost_vdpa_dma_map(v, iova, int128_get64(llsize),
> > > >                                vaddr, section->readonly);
> > > > @@ -754,6 +766,23 @@ static bool  vhost_vdpa_force_iommu(struct vhost_dev *dev)
> > > >       return true;
> > > >   }
> > > >
> > > > +static int vhost_vdpa_get_iova_range(struct vhost_dev *dev,
> > > > +                                     hwaddr *first, hwaddr *last)
> > > > +{
> > > > +    int ret;
> > > > +    struct vhost_vdpa_iova_range range;
> > > > +
> > > > +    ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_IOVA_RANGE, &range);
> > > > +    if (ret != 0) {
> > > > +        return ret;
> > > > +    }
> > > > +
> > > > +    *first = range.first;
> > > > +    *last = range.last;
> > > > +    trace_vhost_vdpa_get_iova_range(dev, *first, *last);
> > > > +    return ret;
> > > > +}
> > > > +
> > > >   /**
> > > >    * Maps QEMU vaddr memory to device in a suitable way for shadow virtqueue:
> > > >    * - It always reference qemu memory address, not guest's memory.
> > > > @@ -881,6 +910,7 @@ static bool vhost_vdpa_svq_start_vq(struct vhost_dev *dev, unsigned idx)
> > > >   static unsigned vhost_vdpa_enable_svq(struct vhost_vdpa *v, bool enable)
> > > >   {
> > > >       struct vhost_dev *hdev = v->dev;
> > > > +    hwaddr iova_first, iova_last;
> > > >       unsigned n;
> > > >       int r;
> > > >
> > > > @@ -894,7 +924,7 @@ static unsigned vhost_vdpa_enable_svq(struct vhost_vdpa *v, bool enable)
> > > >           /* Allocate resources */
> > > >           assert(v->shadow_vqs->len == 0);
> > > >           for (n = 0; n < hdev->nvqs; ++n) {
> > > > -            VhostShadowVirtqueue *svq = vhost_svq_new(hdev, n);
> > > > +            VhostShadowVirtqueue *svq = vhost_svq_new(hdev, n, v->iova_map);
> > > >               if (unlikely(!svq)) {
> > > >                   g_ptr_array_set_size(v->shadow_vqs, 0);
> > > >                   return 0;
> > > > @@ -903,6 +933,8 @@ static unsigned vhost_vdpa_enable_svq(struct vhost_vdpa *v, bool enable)
> > > >           }
> > > >       }
> > > >
> > > > +    r = vhost_vdpa_get_iova_range(hdev, &iova_first, &iova_last);
> > > > +    assert(r == 0);
> > > >       r = vhost_vdpa_vring_pause(hdev);
> > > >       assert(r == 0);
> > > >
> > > > @@ -913,6 +945,12 @@ static unsigned vhost_vdpa_enable_svq(struct vhost_vdpa *v, bool enable)
> > > >           }
> > > >       }
> > > >
> > > > +    memory_listener_unregister(&v->listener);
> > > > +    if (vhost_vdpa_dma_unmap(v, iova_first,
> > > > +                             (iova_last - iova_first) & TARGET_PAGE_MASK)) {
> > > > +        error_report("Fail to invalidate device iotlb");
> > > > +    }
> > > > +
> > > >       /* Reset device so it can be configured */
> > > >       r = vhost_vdpa_dev_start(hdev, false);
> > > >       assert(r == 0);
> > > > diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events
> > > > index 8ed19e9d0c..650e521e35 100644
> > > > --- a/hw/virtio/trace-events
> > > > +++ b/hw/virtio/trace-events
> > > > @@ -52,6 +52,7 @@ vhost_vdpa_set_vring_call(void *dev, unsigned int index, int fd) "dev: %p index:
> > > >   vhost_vdpa_get_features(void *dev, uint64_t features) "dev: %p features: 0x%"PRIx64
> > > >   vhost_vdpa_set_owner(void *dev) "dev: %p"
> > > >   vhost_vdpa_vq_get_addr(void *dev, void *vq, uint64_t desc_user_addr, uint64_t avail_user_addr, uint64_t used_user_addr) "dev: %p vq: %p desc_user_addr: 0x%"PRIx64" avail_user_addr: 0x%"PRIx64" used_user_addr: 0x%"PRIx64
> > > > +vhost_vdpa_get_iova_range(void *dev, uint64_t first, uint64_t last) "dev: %p first: 0x%"PRIx64" last: 0x%"PRIx64
> > > >
> > > >   # virtio.c
> > > >   virtqueue_alloc_element(void *elem, size_t sz, unsigned in_num, unsigned out_num) "elem %p size %zd in_num %u out_num %u"
> > >
> >


Re: [RFC PATCH v4 20/20] vdpa: Add custom IOTLB translations to SVQ
Posted by Eugenio Perez Martin 4 years, 3 months ago
On Wed, Oct 20, 2021 at 4:07 AM Jason Wang <jasowang@redhat.com> wrote:
>
> On Wed, Oct 20, 2021 at 10:02 AM Jason Wang <jasowang@redhat.com> wrote:
> >
> > On Tue, Oct 19, 2021 at 6:29 PM Eugenio Perez Martin
> > <eperezma@redhat.com> wrote:
> > >
> > > On Tue, Oct 19, 2021 at 11:25 AM Jason Wang <jasowang@redhat.com> wrote:
> > > >
> > > >
> > > > On 2021/10/1 3:06 PM, Eugenio Pérez wrote:
> > > > > Use translations added in VhostIOVATree in SVQ.
> > > > >
> > > > > Now every element needs to store the previous address also, so VirtQueue
> > > > > can consume the elements properly. This adds a little overhead per VQ
> > > > > element, having to allocate more memory to stash them. As a possible
> > > > > optimization, this allocation could be avoided if the descriptor is not
> > > > > a chain but a single one, but this is left undone.
> > > > >
> > > > > TODO: iova range should be queried before, and add logic to fail when
> > > > > GPA is outside of its range and memory listener or svq add it.
> > > > >
> > > > > Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
> > > > > ---
> > > > >   hw/virtio/vhost-shadow-virtqueue.h |   4 +-
> > > > >   hw/virtio/vhost-shadow-virtqueue.c | 130 ++++++++++++++++++++++++-----
> > > > >   hw/virtio/vhost-vdpa.c             |  40 ++++++++-
> > > > >   hw/virtio/trace-events             |   1 +
> > > > >   4 files changed, 152 insertions(+), 23 deletions(-)
> > > > >
> > > > > diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
> > > > > index b7baa424a7..a0e6b5267a 100644
> > > > > --- a/hw/virtio/vhost-shadow-virtqueue.h
> > > > > +++ b/hw/virtio/vhost-shadow-virtqueue.h
> > > > > @@ -11,6 +11,7 @@
> > > > >   #define VHOST_SHADOW_VIRTQUEUE_H
> > > > >
> > > > >   #include "hw/virtio/vhost.h"
> > > > > +#include "hw/virtio/vhost-iova-tree.h"
> > > > >
> > > > >   typedef struct VhostShadowVirtqueue VhostShadowVirtqueue;
> > > > >
> > > > > @@ -28,7 +29,8 @@ bool vhost_svq_start(struct vhost_dev *dev, unsigned idx,
> > > > >   void vhost_svq_stop(struct vhost_dev *dev, unsigned idx,
> > > > >                       VhostShadowVirtqueue *svq);
> > > > >
> > > > > -VhostShadowVirtqueue *vhost_svq_new(struct vhost_dev *dev, int idx);
> > > > > +VhostShadowVirtqueue *vhost_svq_new(struct vhost_dev *dev, int idx,
> > > > > +                                    VhostIOVATree *iova_map);
> > > > >
> > > > >   void vhost_svq_free(VhostShadowVirtqueue *vq);
> > > > >
> > > > > diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
> > > > > index 2fd0bab75d..9db538547e 100644
> > > > > --- a/hw/virtio/vhost-shadow-virtqueue.c
> > > > > +++ b/hw/virtio/vhost-shadow-virtqueue.c
> > > > > @@ -11,12 +11,19 @@
> > > > >   #include "hw/virtio/vhost-shadow-virtqueue.h"
> > > > >   #include "hw/virtio/vhost.h"
> > > > >   #include "hw/virtio/virtio-access.h"
> > > > > +#include "hw/virtio/vhost-iova-tree.h"
> > > > >
> > > > >   #include "standard-headers/linux/vhost_types.h"
> > > > >
> > > > >   #include "qemu/error-report.h"
> > > > >   #include "qemu/main-loop.h"
> > > > >
> > > > > +typedef struct SVQElement {
> > > > > +    VirtQueueElement elem;
> > > > > +    void **in_sg_stash;
> > > > > +    void **out_sg_stash;
> > > > > +} SVQElement;
> > > > > +
> > > > >   /* Shadow virtqueue to relay notifications */
> > > > >   typedef struct VhostShadowVirtqueue {
> > > > >       /* Shadow vring */
> > > > > @@ -46,8 +53,11 @@ typedef struct VhostShadowVirtqueue {
> > > > >       /* Virtio device */
> > > > >       VirtIODevice *vdev;
> > > > >
> > > > > +    /* IOVA mapping if used */
> > > > > +    VhostIOVATree *iova_map;
> > > > > +
> > > > >       /* Map for returning guest's descriptors */
> > > > > -    VirtQueueElement **ring_id_maps;
> > > > > +    SVQElement **ring_id_maps;
> > > > >
> > > > >       /* Next head to expose to device */
> > > > >       uint16_t avail_idx_shadow;
> > > > > @@ -79,13 +89,6 @@ bool vhost_svq_valid_device_features(uint64_t *dev_features)
> > > > >               continue;
> > > > >
> > > > >           case VIRTIO_F_ACCESS_PLATFORM:
> > > > > -            /* SVQ needs this feature disabled. Can't continue */
> > > > > -            if (*dev_features & BIT_ULL(b)) {
> > > > > -                clear_bit(b, dev_features);
> > > > > -                r = false;
> > > > > -            }
> > > > > -            break;
> > > > > -
> > > > >           case VIRTIO_F_VERSION_1:
> > > > >               /* SVQ needs this feature, so can't continue */
> > > > >               if (!(*dev_features & BIT_ULL(b))) {
> > > > > @@ -126,6 +129,64 @@ static void vhost_svq_set_notification(VhostShadowVirtqueue *svq, bool enable)
> > > > >       }
> > > > >   }
> > > > >
> > > > > +static void vhost_svq_stash_addr(void ***stash, const struct iovec *iov,
> > > > > +                                 size_t num)
> > > > > +{
> > > > > +    size_t i;
> > > > > +
> > > > > +    if (num == 0) {
> > > > > +        return;
> > > > > +    }
> > > > > +
> > > > > +    *stash = g_new(void *, num);
> > > > > +    for (i = 0; i < num; ++i) {
> > > > > +        (*stash)[i] = iov[i].iov_base;
> > > > > +    }
> > > > > +}
> > > > > +
> > > > > +static void vhost_svq_unstash_addr(void **stash, struct iovec *iov, size_t num)
> > > > > +{
> > > > > +    size_t i;
> > > > > +
> > > > > +    if (num == 0) {
> > > > > +        return;
> > > > > +    }
> > > > > +
> > > > > +    for (i = 0; i < num; ++i) {
> > > > > +        iov[i].iov_base = stash[i];
> > > > > +    }
> > > > > +    g_free(stash);
> > > > > +}
> > > > > +
> > > > > +static void vhost_svq_translate_addr(const VhostShadowVirtqueue *svq,
> > > > > +                                     struct iovec *iovec, size_t num)
> > > > > +{
> > > > > +    size_t i;
> > > > > +
> > > > > +    for (i = 0; i < num; ++i) {
> > > > > +        VhostDMAMap needle = {
> > > > > +            .translated_addr = iovec[i].iov_base,
> > > > > +            .size = iovec[i].iov_len,
> > > > > +        };
> > > > > +        size_t off;
> > > > > +
> > > > > +        const VhostDMAMap *map = vhost_iova_tree_find_iova(svq->iova_map,
> > > > > +                                                           &needle);
> > > >
> > > >
> > > > Is it possible that we end up with more than one maps here?
> > > >
> > >
> > > Actually it is possible, since there is no guarantee that one
> > > descriptor (or indirect descriptor) maps exactly to one iov. It could
> > > map to many if qemu vaddr is not contiguous but GPA + size is. This is
> > > something that must be fixed for the next revision, so thanks for
> > > pointing it out!
> > >
> > > Taking that into account, the condition that svq vring avail_idx -
> > > used_idx was always less or equal than guest's vring avail_idx -
> > > used_idx is not true anymore. Checking for that before adding buffers
> > > to SVQ is the easy part, but how could we recover in that case?
> > >
> > > I think that the easy solution is to check for more available buffers
> > > unconditionally at the end of vhost_svq_handle_call, which handles the
> > > SVQ used and is supposed to make more room for available buffers. So
> > > vhost_handle_guest_kick would not check if eventfd is set or not
> > > anymore.
> > >
> > > Would that make sense?
> >
> > Yes, I think it should work.
>
> Btw, I wonder how to handle indirect descriptors. SVQ doesn't use
> indirect descriptors for now, but it looks like a must otherwise we
> may end up SVQ is full before VQ.
>

We can get into that situation without indirect descriptors too, if a
single guest descriptor maps to more than one sg buffer in qemu's vaddr
space. The next revision is going to control that too.

> It looks to me an easy way is to always use indirect descriptors if #sg >= 2?
>

I will use that, but it does not solve the case where a descriptor
maps to more than one different buffer in qemu vaddr. So I think that some
check after marking descriptors as used is still a must somehow.
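
Just to make the idea concrete, this is roughly the shape of the check I
have in mind for the next revision. It is only a sketch: neither
vhost_iova_tree_num_ranges() nor svq->num_free exist in this series, they
are invented here for illustration.

static bool vhost_svq_element_fits(const VhostShadowVirtqueue *svq,
                                   const VirtQueueElement *elem)
{
    size_t needed = 0;
    size_t i;

    /* Each guest sg entry may split into several host-contiguous ranges */
    for (i = 0; i < elem->out_num; ++i) {
        needed += vhost_iova_tree_num_ranges(svq->iova_map, &elem->out_sg[i]);
    }
    for (i = 0; i < elem->in_num; ++i) {
        needed += vhost_iova_tree_num_ranges(svq->iova_map, &elem->in_sg[i]);
    }

    /* Without indirect descriptors, every range takes one SVQ vring slot */
    return needed <= svq->num_free;
}

If it returns false, the element has to stay pending until the device marks
enough buffers as used.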


> Thanks
>
> >
> > Thanks
> >
> > >
> > > Thanks!
> > >
> > > >
> > > > > +        /*
> > > > > +         * Map cannot be NULL since iova map contains all guest space and
> > > > > +         * qemu already has a physical address mapped
> > > > > +         */
> > > > > +        assert(map);
> > > > > +
> > > > > +        /*
> > > > > +         * Map->iova chunk size is ignored. What to do if descriptor
> > > > > +         * (addr, size) does not fit is delegated to the device.
> > > > > +         */
> > > > > +        off = needle.translated_addr - map->translated_addr;
> > > > > +        iovec[i].iov_base = (void *)(map->iova + off);
> > > > > +    }
> > > > > +}
> > > > > +
> > > > >   static void vhost_vring_write_descs(VhostShadowVirtqueue *svq,
> > > > >                                       const struct iovec *iovec,
> > > > >                                       size_t num, bool more_descs, bool write)
> > > > > @@ -156,8 +217,9 @@ static void vhost_vring_write_descs(VhostShadowVirtqueue *svq,
> > > > >   }
> > > > >
> > > > >   static unsigned vhost_svq_add_split(VhostShadowVirtqueue *svq,
> > > > > -                                    VirtQueueElement *elem)
> > > > > +                                    SVQElement *svq_elem)
> > > > >   {
> > > > > +    VirtQueueElement *elem = &svq_elem->elem;
> > > > >       int head;
> > > > >       unsigned avail_idx;
> > > > >       vring_avail_t *avail = svq->vring.avail;
> > > > > @@ -167,6 +229,12 @@ static unsigned vhost_svq_add_split(VhostShadowVirtqueue *svq,
> > > > >       /* We need some descriptors here */
> > > > >       assert(elem->out_num || elem->in_num);
> > > > >
> > > > > +    vhost_svq_stash_addr(&svq_elem->in_sg_stash, elem->in_sg, elem->in_num);
> > > > > +    vhost_svq_stash_addr(&svq_elem->out_sg_stash, elem->out_sg, elem->out_num);
> > > >
> > > >
> > > > I wonder if we can solve the trick like stash and unstash with a
> > > > dedicated sgs in svq_elem, instead of reusing the elem.
> > > >
> > >
> > > Actually yes, it would be way simpler to use a new sgs array in
> > > svq_elem. I will change that.
> > >
> > > Thanks!
> > >
> > > > Thanks
> > > >
> > > >
> > > > > +
> > > > > +    vhost_svq_translate_addr(svq, elem->in_sg, elem->in_num);
> > > > > +    vhost_svq_translate_addr(svq, elem->out_sg, elem->out_num);
> > > > > +
> > > > >       vhost_vring_write_descs(svq, elem->out_sg, elem->out_num,
> > > > >                               elem->in_num > 0, false);
> > > > >       vhost_vring_write_descs(svq, elem->in_sg, elem->in_num, false, true);
> > > > > @@ -187,7 +255,7 @@ static unsigned vhost_svq_add_split(VhostShadowVirtqueue *svq,
> > > > >
> > > > >   }
> > > > >
> > > > > -static void vhost_svq_add(VhostShadowVirtqueue *svq, VirtQueueElement *elem)
> > > > > +static void vhost_svq_add(VhostShadowVirtqueue *svq, SVQElement *elem)
> > > > >   {
> > > > >       unsigned qemu_head = vhost_svq_add_split(svq, elem);
> > > > >
> > > > > @@ -221,7 +289,7 @@ static void vhost_handle_guest_kick(EventNotifier *n)
> > > > >           }
> > > > >
> > > > >           while (true) {
> > > > > -            VirtQueueElement *elem = virtqueue_pop(svq->vq, sizeof(*elem));
> > > > > +            SVQElement *elem = virtqueue_pop(svq->vq, sizeof(*elem));
> > > > >               if (!elem) {
> > > > >                   break;
> > > > >               }
> > > > > @@ -247,7 +315,7 @@ static bool vhost_svq_more_used(VhostShadowVirtqueue *svq)
> > > > >       return svq->used_idx != svq->shadow_used_idx;
> > > > >   }
> > > > >
> > > > > -static VirtQueueElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq)
> > > > > +static SVQElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq)
> > > > >   {
> > > > >       vring_desc_t *descs = svq->vring.desc;
> > > > >       const vring_used_t *used = svq->vring.used;
> > > > > @@ -279,7 +347,7 @@ static VirtQueueElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq)
> > > > >       descs[used_elem.id].next = svq->free_head;
> > > > >       svq->free_head = used_elem.id;
> > > > >
> > > > > -    svq->ring_id_maps[used_elem.id]->len = used_elem.len;
> > > > > +    svq->ring_id_maps[used_elem.id]->elem.len = used_elem.len;
> > > > >       return g_steal_pointer(&svq->ring_id_maps[used_elem.id]);
> > > > >   }
> > > > >
> > > > > @@ -296,12 +364,19 @@ static void vhost_svq_handle_call_no_test(EventNotifier *n)
> > > > >
> > > > >           vhost_svq_set_notification(svq, false);
> > > > >           while (true) {
> > > > > -            g_autofree VirtQueueElement *elem = vhost_svq_get_buf(svq);
> > > > > -            if (!elem) {
> > > > > +            g_autofree SVQElement *svq_elem = vhost_svq_get_buf(svq);
> > > > > +            VirtQueueElement *elem;
> > > > > +            if (!svq_elem) {
> > > > >                   break;
> > > > >               }
> > > > >
> > > > >               assert(i < svq->vring.num);
> > > > > +            elem = &svq_elem->elem;
> > > > > +
> > > > > +            vhost_svq_unstash_addr(svq_elem->in_sg_stash, elem->in_sg,
> > > > > +                                   elem->in_num);
> > > > > +            vhost_svq_unstash_addr(svq_elem->out_sg_stash, elem->out_sg,
> > > > > +                                   elem->out_num);
> > > > >               virtqueue_fill(vq, elem, elem->len, i++);
> > > > >           }
> > > > >
> > > > > @@ -451,14 +526,24 @@ void vhost_svq_stop(struct vhost_dev *dev, unsigned idx,
> > > > >       event_notifier_set_handler(&svq->host_notifier, NULL);
> > > > >
> > > > >       for (i = 0; i < svq->vring.num; ++i) {
> > > > > -        g_autofree VirtQueueElement *elem = svq->ring_id_maps[i];
> > > > > +        g_autofree SVQElement *svq_elem = svq->ring_id_maps[i];
> > > > > +        VirtQueueElement *elem;
> > > > > +
> > > > > +        if (!svq_elem) {
> > > > > +            continue;
> > > > > +        }
> > > > > +
> > > > > +        elem = &svq_elem->elem;
> > > > > +        vhost_svq_unstash_addr(svq_elem->in_sg_stash, elem->in_sg,
> > > > > +                               elem->in_num);
> > > > > +        vhost_svq_unstash_addr(svq_elem->out_sg_stash, elem->out_sg,
> > > > > +                               elem->out_num);
> > > > > +
> > > > >           /*
> > > > >            * Although the doc says we must unpop in order, it's ok to unpop
> > > > >            * everything.
> > > > >            */
> > > > > -        if (elem) {
> > > > > -            virtqueue_unpop(svq->vq, elem, elem->len);
> > > > > -        }
> > > > > +        virtqueue_unpop(svq->vq, elem, elem->len);
> > > > >       }
> > > > >   }
> > > > >
> > > > > @@ -466,7 +551,8 @@ void vhost_svq_stop(struct vhost_dev *dev, unsigned idx,
> > > > >    * Creates vhost shadow virtqueue, and instruct vhost device to use the shadow
> > > > >    * methods and file descriptors.
> > > > >    */
> > > > > -VhostShadowVirtqueue *vhost_svq_new(struct vhost_dev *dev, int idx)
> > > > > +VhostShadowVirtqueue *vhost_svq_new(struct vhost_dev *dev, int idx,
> > > > > +                                    VhostIOVATree *iova_map)
> > > > >   {
> > > > >       int vq_idx = dev->vq_index + idx;
> > > > >       unsigned num = virtio_queue_get_num(dev->vdev, vq_idx);
> > > > > @@ -500,11 +586,13 @@ VhostShadowVirtqueue *vhost_svq_new(struct vhost_dev *dev, int idx)
> > > > >       memset(svq->vring.desc, 0, driver_size);
> > > > >       svq->vring.used = qemu_memalign(qemu_real_host_page_size, device_size);
> > > > >       memset(svq->vring.used, 0, device_size);
> > > > > +    svq->iova_map = iova_map;
> > > > > +
> > > > >       for (i = 0; i < num - 1; i++) {
> > > > >           svq->vring.desc[i].next = cpu_to_le16(i + 1);
> > > > >       }
> > > > >
> > > > > -    svq->ring_id_maps = g_new0(VirtQueueElement *, num);
> > > > > +    svq->ring_id_maps = g_new0(SVQElement *, num);
> > > > >       event_notifier_set_handler(&svq->call_notifier,
> > > > >                                  vhost_svq_handle_call);
> > > > >       return g_steal_pointer(&svq);
> > > > > diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
> > > > > index a9c680b487..f5a12fee9d 100644
> > > > > --- a/hw/virtio/vhost-vdpa.c
> > > > > +++ b/hw/virtio/vhost-vdpa.c
> > > > > @@ -176,6 +176,18 @@ static void vhost_vdpa_listener_region_add(MemoryListener *listener,
> > > > >                                            vaddr, section->readonly);
> > > > >
> > > > >       llsize = int128_sub(llend, int128_make64(iova));
> > > > > +    if (v->shadow_vqs_enabled) {
> > > > > +        VhostDMAMap mem_region = {
> > > > > +            .translated_addr = vaddr,
> > > > > +            .size = int128_get64(llsize) - 1,
> > > > > +            .perm = IOMMU_ACCESS_FLAG(true, section->readonly),
> > > > > +        };
> > > > > +
> > > > > +        int r = vhost_iova_tree_alloc(v->iova_map, &mem_region);
> > > > > +        assert(r == VHOST_DMA_MAP_OK);
> > > > > +
> > > > > +        iova = mem_region.iova;
> > > > > +    }
> > > > >
> > > > >       ret = vhost_vdpa_dma_map(v, iova, int128_get64(llsize),
> > > > >                                vaddr, section->readonly);
> > > > > @@ -754,6 +766,23 @@ static bool  vhost_vdpa_force_iommu(struct vhost_dev *dev)
> > > > >       return true;
> > > > >   }
> > > > >
> > > > > +static int vhost_vdpa_get_iova_range(struct vhost_dev *dev,
> > > > > +                                     hwaddr *first, hwaddr *last)
> > > > > +{
> > > > > +    int ret;
> > > > > +    struct vhost_vdpa_iova_range range;
> > > > > +
> > > > > +    ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_IOVA_RANGE, &range);
> > > > > +    if (ret != 0) {
> > > > > +        return ret;
> > > > > +    }
> > > > > +
> > > > > +    *first = range.first;
> > > > > +    *last = range.last;
> > > > > +    trace_vhost_vdpa_get_iova_range(dev, *first, *last);
> > > > > +    return ret;
> > > > > +}
> > > > > +
> > > > >   /**
> > > > >    * Maps QEMU vaddr memory to device in a suitable way for shadow virtqueue:
> > > > >    * - It always reference qemu memory address, not guest's memory.
> > > > > @@ -881,6 +910,7 @@ static bool vhost_vdpa_svq_start_vq(struct vhost_dev *dev, unsigned idx)
> > > > >   static unsigned vhost_vdpa_enable_svq(struct vhost_vdpa *v, bool enable)
> > > > >   {
> > > > >       struct vhost_dev *hdev = v->dev;
> > > > > +    hwaddr iova_first, iova_last;
> > > > >       unsigned n;
> > > > >       int r;
> > > > >
> > > > > @@ -894,7 +924,7 @@ static unsigned vhost_vdpa_enable_svq(struct vhost_vdpa *v, bool enable)
> > > > >           /* Allocate resources */
> > > > >           assert(v->shadow_vqs->len == 0);
> > > > >           for (n = 0; n < hdev->nvqs; ++n) {
> > > > > -            VhostShadowVirtqueue *svq = vhost_svq_new(hdev, n);
> > > > > +            VhostShadowVirtqueue *svq = vhost_svq_new(hdev, n, v->iova_map);
> > > > >               if (unlikely(!svq)) {
> > > > >                   g_ptr_array_set_size(v->shadow_vqs, 0);
> > > > >                   return 0;
> > > > > @@ -903,6 +933,8 @@ static unsigned vhost_vdpa_enable_svq(struct vhost_vdpa *v, bool enable)
> > > > >           }
> > > > >       }
> > > > >
> > > > > +    r = vhost_vdpa_get_iova_range(hdev, &iova_first, &iova_last);
> > > > > +    assert(r == 0);
> > > > >       r = vhost_vdpa_vring_pause(hdev);
> > > > >       assert(r == 0);
> > > > >
> > > > > @@ -913,6 +945,12 @@ static unsigned vhost_vdpa_enable_svq(struct vhost_vdpa *v, bool enable)
> > > > >           }
> > > > >       }
> > > > >
> > > > > +    memory_listener_unregister(&v->listener);
> > > > > +    if (vhost_vdpa_dma_unmap(v, iova_first,
> > > > > +                             (iova_last - iova_first) & TARGET_PAGE_MASK)) {
> > > > > +        error_report("Fail to invalidate device iotlb");
> > > > > +    }
> > > > > +
> > > > >       /* Reset device so it can be configured */
> > > > >       r = vhost_vdpa_dev_start(hdev, false);
> > > > >       assert(r == 0);
> > > > > diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events
> > > > > index 8ed19e9d0c..650e521e35 100644
> > > > > --- a/hw/virtio/trace-events
> > > > > +++ b/hw/virtio/trace-events
> > > > > @@ -52,6 +52,7 @@ vhost_vdpa_set_vring_call(void *dev, unsigned int index, int fd) "dev: %p index:
> > > > >   vhost_vdpa_get_features(void *dev, uint64_t features) "dev: %p features: 0x%"PRIx64
> > > > >   vhost_vdpa_set_owner(void *dev) "dev: %p"
> > > > >   vhost_vdpa_vq_get_addr(void *dev, void *vq, uint64_t desc_user_addr, uint64_t avail_user_addr, uint64_t used_user_addr) "dev: %p vq: %p desc_user_addr: 0x%"PRIx64" avail_user_addr: 0x%"PRIx64" used_user_addr: 0x%"PRIx64
> > > > > +vhost_vdpa_get_iova_range(void *dev, uint64_t first, uint64_t last) "dev: %p first: 0x%"PRIx64" last: 0x%"PRIx64
> > > > >
> > > > >   # virtio.c
> > > > >   virtqueue_alloc_element(void *elem, size_t sz, unsigned in_num, unsigned out_num) "elem %p size %zd in_num %u out_num %u"
> > > >
> > >
>


Re: [RFC PATCH v4 20/20] vdpa: Add custom IOTLB translations to SVQ
Posted by Jason Wang 4 years, 3 months ago
On Wed, Oct 20, 2021 at 2:52 PM Eugenio Perez Martin
<eperezma@redhat.com> wrote:
>
> On Wed, Oct 20, 2021 at 4:07 AM Jason Wang <jasowang@redhat.com> wrote:
> >
> > On Wed, Oct 20, 2021 at 10:02 AM Jason Wang <jasowang@redhat.com> wrote:
> > >
> > > On Tue, Oct 19, 2021 at 6:29 PM Eugenio Perez Martin
> > > <eperezma@redhat.com> wrote:
> > > >
> > > > On Tue, Oct 19, 2021 at 11:25 AM Jason Wang <jasowang@redhat.com> wrote:
> > > > >
> > > > >
> > > > > On 2021/10/1 3:06 PM, Eugenio Pérez wrote:
> > > > > > Use translations added in VhostIOVATree in SVQ.
> > > > > >
> > > > > > Now every element needs to store the previous address also, so VirtQueue
> > > > > > can consume the elements properly. This adds a little overhead per VQ
> > > > > > element, having to allocate more memory to stash them. As a possible
> > > > > > optimization, this allocation could be avoided if the descriptor is not
> > > > > > a chain but a single one, but this is left undone.
> > > > > >
> > > > > > TODO: iova range should be queried before, and add logic to fail when
> > > > > > GPA is outside of its range and memory listener or svq add it.
> > > > > >
> > > > > > Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
> > > > > > ---
> > > > > >   hw/virtio/vhost-shadow-virtqueue.h |   4 +-
> > > > > >   hw/virtio/vhost-shadow-virtqueue.c | 130 ++++++++++++++++++++++++-----
> > > > > >   hw/virtio/vhost-vdpa.c             |  40 ++++++++-
> > > > > >   hw/virtio/trace-events             |   1 +
> > > > > >   4 files changed, 152 insertions(+), 23 deletions(-)
> > > > > >
> > > > > > diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
> > > > > > index b7baa424a7..a0e6b5267a 100644
> > > > > > --- a/hw/virtio/vhost-shadow-virtqueue.h
> > > > > > +++ b/hw/virtio/vhost-shadow-virtqueue.h
> > > > > > @@ -11,6 +11,7 @@
> > > > > >   #define VHOST_SHADOW_VIRTQUEUE_H
> > > > > >
> > > > > >   #include "hw/virtio/vhost.h"
> > > > > > +#include "hw/virtio/vhost-iova-tree.h"
> > > > > >
> > > > > >   typedef struct VhostShadowVirtqueue VhostShadowVirtqueue;
> > > > > >
> > > > > > @@ -28,7 +29,8 @@ bool vhost_svq_start(struct vhost_dev *dev, unsigned idx,
> > > > > >   void vhost_svq_stop(struct vhost_dev *dev, unsigned idx,
> > > > > >                       VhostShadowVirtqueue *svq);
> > > > > >
> > > > > > -VhostShadowVirtqueue *vhost_svq_new(struct vhost_dev *dev, int idx);
> > > > > > +VhostShadowVirtqueue *vhost_svq_new(struct vhost_dev *dev, int idx,
> > > > > > +                                    VhostIOVATree *iova_map);
> > > > > >
> > > > > >   void vhost_svq_free(VhostShadowVirtqueue *vq);
> > > > > >
> > > > > > diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
> > > > > > index 2fd0bab75d..9db538547e 100644
> > > > > > --- a/hw/virtio/vhost-shadow-virtqueue.c
> > > > > > +++ b/hw/virtio/vhost-shadow-virtqueue.c
> > > > > > @@ -11,12 +11,19 @@
> > > > > >   #include "hw/virtio/vhost-shadow-virtqueue.h"
> > > > > >   #include "hw/virtio/vhost.h"
> > > > > >   #include "hw/virtio/virtio-access.h"
> > > > > > +#include "hw/virtio/vhost-iova-tree.h"
> > > > > >
> > > > > >   #include "standard-headers/linux/vhost_types.h"
> > > > > >
> > > > > >   #include "qemu/error-report.h"
> > > > > >   #include "qemu/main-loop.h"
> > > > > >
> > > > > > +typedef struct SVQElement {
> > > > > > +    VirtQueueElement elem;
> > > > > > +    void **in_sg_stash;
> > > > > > +    void **out_sg_stash;
> > > > > > +} SVQElement;
> > > > > > +
> > > > > >   /* Shadow virtqueue to relay notifications */
> > > > > >   typedef struct VhostShadowVirtqueue {
> > > > > >       /* Shadow vring */
> > > > > > @@ -46,8 +53,11 @@ typedef struct VhostShadowVirtqueue {
> > > > > >       /* Virtio device */
> > > > > >       VirtIODevice *vdev;
> > > > > >
> > > > > > +    /* IOVA mapping if used */
> > > > > > +    VhostIOVATree *iova_map;
> > > > > > +
> > > > > >       /* Map for returning guest's descriptors */
> > > > > > -    VirtQueueElement **ring_id_maps;
> > > > > > +    SVQElement **ring_id_maps;
> > > > > >
> > > > > >       /* Next head to expose to device */
> > > > > >       uint16_t avail_idx_shadow;
> > > > > > @@ -79,13 +89,6 @@ bool vhost_svq_valid_device_features(uint64_t *dev_features)
> > > > > >               continue;
> > > > > >
> > > > > >           case VIRTIO_F_ACCESS_PLATFORM:
> > > > > > -            /* SVQ needs this feature disabled. Can't continue */
> > > > > > -            if (*dev_features & BIT_ULL(b)) {
> > > > > > -                clear_bit(b, dev_features);
> > > > > > -                r = false;
> > > > > > -            }
> > > > > > -            break;
> > > > > > -
> > > > > >           case VIRTIO_F_VERSION_1:
> > > > > >               /* SVQ needs this feature, so can't continue */
> > > > > >               if (!(*dev_features & BIT_ULL(b))) {
> > > > > > @@ -126,6 +129,64 @@ static void vhost_svq_set_notification(VhostShadowVirtqueue *svq, bool enable)
> > > > > >       }
> > > > > >   }
> > > > > >
> > > > > > +static void vhost_svq_stash_addr(void ***stash, const struct iovec *iov,
> > > > > > +                                 size_t num)
> > > > > > +{
> > > > > > +    size_t i;
> > > > > > +
> > > > > > +    if (num == 0) {
> > > > > > +        return;
> > > > > > +    }
> > > > > > +
> > > > > > +    *stash = g_new(void *, num);
> > > > > > +    for (i = 0; i < num; ++i) {
> > > > > > +        (*stash)[i] = iov[i].iov_base;
> > > > > > +    }
> > > > > > +}
> > > > > > +
> > > > > > +static void vhost_svq_unstash_addr(void **stash, struct iovec *iov, size_t num)
> > > > > > +{
> > > > > > +    size_t i;
> > > > > > +
> > > > > > +    if (num == 0) {
> > > > > > +        return;
> > > > > > +    }
> > > > > > +
> > > > > > +    for (i = 0; i < num; ++i) {
> > > > > > +        iov[i].iov_base = stash[i];
> > > > > > +    }
> > > > > > +    g_free(stash);
> > > > > > +}
> > > > > > +
> > > > > > +static void vhost_svq_translate_addr(const VhostShadowVirtqueue *svq,
> > > > > > +                                     struct iovec *iovec, size_t num)
> > > > > > +{
> > > > > > +    size_t i;
> > > > > > +
> > > > > > +    for (i = 0; i < num; ++i) {
> > > > > > +        VhostDMAMap needle = {
> > > > > > +            .translated_addr = iovec[i].iov_base,
> > > > > > +            .size = iovec[i].iov_len,
> > > > > > +        };
> > > > > > +        size_t off;
> > > > > > +
> > > > > > +        const VhostDMAMap *map = vhost_iova_tree_find_iova(svq->iova_map,
> > > > > > +                                                           &needle);
> > > > >
> > > > >
> > > > > Is it possible that we end up with more than one maps here?
> > > > >
> > > >
> > > > Actually it is possible, since there is no guarantee that one
> > > > descriptor (or indirect descriptor) maps exactly to one iov. It could
> > > > map to many if qemu vaddr is not contiguous but GPA + size is. This is
> > > > something that must be fixed for the next revision, so thanks for
> > > > pointing it out!
> > > >
> > > > Taking that into account, the condition that svq vring avail_idx -
> > > > used_idx was always less or equal than guest's vring avail_idx -
> > > > used_idx is not true anymore. Checking for that before adding buffers
> > > > to SVQ is the easy part, but how could we recover in that case?
> > > >
> > > > I think that the easy solution is to check for more available buffers
> > > > unconditionally at the end of vhost_svq_handle_call, which handles the
> > > > SVQ used and is supposed to make more room for available buffers. So
> > > > vhost_handle_guest_kick would not check if eventfd is set or not
> > > > anymore.
> > > >
> > > > Would that make sense?
> > >
> > > Yes, I think it should work.
> >
> > Btw, I wonder how to handle indirect descriptors. SVQ doesn't use
> > indirect descriptors for now, but it looks like a must otherwise we
> > may end up SVQ is full before VQ.
> >
>
> We can get to that situation without indirect too, if a single
> descriptor maps to more than one sg buffer. The next revision is going
> to control that too.
>
> > It looks to me an easy way is to always use indirect descriptors if #sg >= 2?
> >
>
> I will use that, but that does not solve the case where a descriptor
> maps to > 1 different buffers in qemu vaddr.

Right, so we need to deal with the case when SVQ is out of space.


> So I think that some
> check after marking descriptors as used is a must somehow.

I thought it should be before processing the available buffer? It's
the guest driver that makes sure there's sufficient space for the used
ring?

Thanks

>
>
> > Thanks
> >
> > >
> > > Thanks
> > >
> > > >
> > > > Thanks!
> > > >
> > > > >
> > > > > > +        /*
> > > > > > +         * Map cannot be NULL since iova map contains all guest space and
> > > > > > +         * qemu already has a physical address mapped
> > > > > > +         */
> > > > > > +        assert(map);
> > > > > > +
> > > > > > +        /*
> > > > > > +         * Map->iova chunk size is ignored. What to do if descriptor
> > > > > > +         * (addr, size) does not fit is delegated to the device.
> > > > > > +         */
> > > > > > +        off = needle.translated_addr - map->translated_addr;
> > > > > > +        iovec[i].iov_base = (void *)(map->iova + off);
> > > > > > +    }
> > > > > > +}
> > > > > > +
> > > > > >   static void vhost_vring_write_descs(VhostShadowVirtqueue *svq,
> > > > > >                                       const struct iovec *iovec,
> > > > > >                                       size_t num, bool more_descs, bool write)
> > > > > > @@ -156,8 +217,9 @@ static void vhost_vring_write_descs(VhostShadowVirtqueue *svq,
> > > > > >   }
> > > > > >
> > > > > >   static unsigned vhost_svq_add_split(VhostShadowVirtqueue *svq,
> > > > > > -                                    VirtQueueElement *elem)
> > > > > > +                                    SVQElement *svq_elem)
> > > > > >   {
> > > > > > +    VirtQueueElement *elem = &svq_elem->elem;
> > > > > >       int head;
> > > > > >       unsigned avail_idx;
> > > > > >       vring_avail_t *avail = svq->vring.avail;
> > > > > > @@ -167,6 +229,12 @@ static unsigned vhost_svq_add_split(VhostShadowVirtqueue *svq,
> > > > > >       /* We need some descriptors here */
> > > > > >       assert(elem->out_num || elem->in_num);
> > > > > >
> > > > > > +    vhost_svq_stash_addr(&svq_elem->in_sg_stash, elem->in_sg, elem->in_num);
> > > > > > +    vhost_svq_stash_addr(&svq_elem->out_sg_stash, elem->out_sg, elem->out_num);
> > > > >
> > > > >
> > > > > I wonder if we can solve the trick like stash and unstash with a
> > > > > dedicated sgs in svq_elem, instead of reusing the elem.
> > > > >
> > > >
> > > > Actually yes, it would be way simpler to use a new sgs array in
> > > > svq_elem. I will change that.
> > > >
> > > > Thanks!
> > > >
> > > > > Thanks
> > > > >
> > > > >
> > > > > > +
> > > > > > +    vhost_svq_translate_addr(svq, elem->in_sg, elem->in_num);
> > > > > > +    vhost_svq_translate_addr(svq, elem->out_sg, elem->out_num);
> > > > > > +
> > > > > >       vhost_vring_write_descs(svq, elem->out_sg, elem->out_num,
> > > > > >                               elem->in_num > 0, false);
> > > > > >       vhost_vring_write_descs(svq, elem->in_sg, elem->in_num, false, true);
> > > > > > @@ -187,7 +255,7 @@ static unsigned vhost_svq_add_split(VhostShadowVirtqueue *svq,
> > > > > >
> > > > > >   }
> > > > > >
> > > > > > -static void vhost_svq_add(VhostShadowVirtqueue *svq, VirtQueueElement *elem)
> > > > > > +static void vhost_svq_add(VhostShadowVirtqueue *svq, SVQElement *elem)
> > > > > >   {
> > > > > >       unsigned qemu_head = vhost_svq_add_split(svq, elem);
> > > > > >
> > > > > > @@ -221,7 +289,7 @@ static void vhost_handle_guest_kick(EventNotifier *n)
> > > > > >           }
> > > > > >
> > > > > >           while (true) {
> > > > > > -            VirtQueueElement *elem = virtqueue_pop(svq->vq, sizeof(*elem));
> > > > > > +            SVQElement *elem = virtqueue_pop(svq->vq, sizeof(*elem));
> > > > > >               if (!elem) {
> > > > > >                   break;
> > > > > >               }
> > > > > > @@ -247,7 +315,7 @@ static bool vhost_svq_more_used(VhostShadowVirtqueue *svq)
> > > > > >       return svq->used_idx != svq->shadow_used_idx;
> > > > > >   }
> > > > > >
> > > > > > -static VirtQueueElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq)
> > > > > > +static SVQElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq)
> > > > > >   {
> > > > > >       vring_desc_t *descs = svq->vring.desc;
> > > > > >       const vring_used_t *used = svq->vring.used;
> > > > > > @@ -279,7 +347,7 @@ static VirtQueueElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq)
> > > > > >       descs[used_elem.id].next = svq->free_head;
> > > > > >       svq->free_head = used_elem.id;
> > > > > >
> > > > > > -    svq->ring_id_maps[used_elem.id]->len = used_elem.len;
> > > > > > +    svq->ring_id_maps[used_elem.id]->elem.len = used_elem.len;
> > > > > >       return g_steal_pointer(&svq->ring_id_maps[used_elem.id]);
> > > > > >   }
> > > > > >
> > > > > > @@ -296,12 +364,19 @@ static void vhost_svq_handle_call_no_test(EventNotifier *n)
> > > > > >
> > > > > >           vhost_svq_set_notification(svq, false);
> > > > > >           while (true) {
> > > > > > -            g_autofree VirtQueueElement *elem = vhost_svq_get_buf(svq);
> > > > > > -            if (!elem) {
> > > > > > +            g_autofree SVQElement *svq_elem = vhost_svq_get_buf(svq);
> > > > > > +            VirtQueueElement *elem;
> > > > > > +            if (!svq_elem) {
> > > > > >                   break;
> > > > > >               }
> > > > > >
> > > > > >               assert(i < svq->vring.num);
> > > > > > +            elem = &svq_elem->elem;
> > > > > > +
> > > > > > +            vhost_svq_unstash_addr(svq_elem->in_sg_stash, elem->in_sg,
> > > > > > +                                   elem->in_num);
> > > > > > +            vhost_svq_unstash_addr(svq_elem->out_sg_stash, elem->out_sg,
> > > > > > +                                   elem->out_num);
> > > > > >               virtqueue_fill(vq, elem, elem->len, i++);
> > > > > >           }
> > > > > >
> > > > > > @@ -451,14 +526,24 @@ void vhost_svq_stop(struct vhost_dev *dev, unsigned idx,
> > > > > >       event_notifier_set_handler(&svq->host_notifier, NULL);
> > > > > >
> > > > > >       for (i = 0; i < svq->vring.num; ++i) {
> > > > > > -        g_autofree VirtQueueElement *elem = svq->ring_id_maps[i];
> > > > > > +        g_autofree SVQElement *svq_elem = svq->ring_id_maps[i];
> > > > > > +        VirtQueueElement *elem;
> > > > > > +
> > > > > > +        if (!svq_elem) {
> > > > > > +            continue;
> > > > > > +        }
> > > > > > +
> > > > > > +        elem = &svq_elem->elem;
> > > > > > +        vhost_svq_unstash_addr(svq_elem->in_sg_stash, elem->in_sg,
> > > > > > +                               elem->in_num);
> > > > > > +        vhost_svq_unstash_addr(svq_elem->out_sg_stash, elem->out_sg,
> > > > > > +                               elem->out_num);
> > > > > > +
> > > > > >           /*
> > > > > >            * Although the doc says we must unpop in order, it's ok to unpop
> > > > > >            * everything.
> > > > > >            */
> > > > > > -        if (elem) {
> > > > > > -            virtqueue_unpop(svq->vq, elem, elem->len);
> > > > > > -        }
> > > > > > +        virtqueue_unpop(svq->vq, elem, elem->len);
> > > > > >       }
> > > > > >   }
> > > > > >
> > > > > > @@ -466,7 +551,8 @@ void vhost_svq_stop(struct vhost_dev *dev, unsigned idx,
> > > > > >    * Creates vhost shadow virtqueue, and instruct vhost device to use the shadow
> > > > > >    * methods and file descriptors.
> > > > > >    */
> > > > > > -VhostShadowVirtqueue *vhost_svq_new(struct vhost_dev *dev, int idx)
> > > > > > +VhostShadowVirtqueue *vhost_svq_new(struct vhost_dev *dev, int idx,
> > > > > > +                                    VhostIOVATree *iova_map)
> > > > > >   {
> > > > > >       int vq_idx = dev->vq_index + idx;
> > > > > >       unsigned num = virtio_queue_get_num(dev->vdev, vq_idx);
> > > > > > @@ -500,11 +586,13 @@ VhostShadowVirtqueue *vhost_svq_new(struct vhost_dev *dev, int idx)
> > > > > >       memset(svq->vring.desc, 0, driver_size);
> > > > > >       svq->vring.used = qemu_memalign(qemu_real_host_page_size, device_size);
> > > > > >       memset(svq->vring.used, 0, device_size);
> > > > > > +    svq->iova_map = iova_map;
> > > > > > +
> > > > > >       for (i = 0; i < num - 1; i++) {
> > > > > >           svq->vring.desc[i].next = cpu_to_le16(i + 1);
> > > > > >       }
> > > > > >
> > > > > > -    svq->ring_id_maps = g_new0(VirtQueueElement *, num);
> > > > > > +    svq->ring_id_maps = g_new0(SVQElement *, num);
> > > > > >       event_notifier_set_handler(&svq->call_notifier,
> > > > > >                                  vhost_svq_handle_call);
> > > > > >       return g_steal_pointer(&svq);
> > > > > > diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
> > > > > > index a9c680b487..f5a12fee9d 100644
> > > > > > --- a/hw/virtio/vhost-vdpa.c
> > > > > > +++ b/hw/virtio/vhost-vdpa.c
> > > > > > @@ -176,6 +176,18 @@ static void vhost_vdpa_listener_region_add(MemoryListener *listener,
> > > > > >                                            vaddr, section->readonly);
> > > > > >
> > > > > >       llsize = int128_sub(llend, int128_make64(iova));
> > > > > > +    if (v->shadow_vqs_enabled) {
> > > > > > +        VhostDMAMap mem_region = {
> > > > > > +            .translated_addr = vaddr,
> > > > > > +            .size = int128_get64(llsize) - 1,
> > > > > > +            .perm = IOMMU_ACCESS_FLAG(true, section->readonly),
> > > > > > +        };
> > > > > > +
> > > > > > +        int r = vhost_iova_tree_alloc(v->iova_map, &mem_region);
> > > > > > +        assert(r == VHOST_DMA_MAP_OK);
> > > > > > +
> > > > > > +        iova = mem_region.iova;
> > > > > > +    }
> > > > > >
> > > > > >       ret = vhost_vdpa_dma_map(v, iova, int128_get64(llsize),
> > > > > >                                vaddr, section->readonly);
> > > > > > @@ -754,6 +766,23 @@ static bool  vhost_vdpa_force_iommu(struct vhost_dev *dev)
> > > > > >       return true;
> > > > > >   }
> > > > > >
> > > > > > +static int vhost_vdpa_get_iova_range(struct vhost_dev *dev,
> > > > > > +                                     hwaddr *first, hwaddr *last)
> > > > > > +{
> > > > > > +    int ret;
> > > > > > +    struct vhost_vdpa_iova_range range;
> > > > > > +
> > > > > > +    ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_IOVA_RANGE, &range);
> > > > > > +    if (ret != 0) {
> > > > > > +        return ret;
> > > > > > +    }
> > > > > > +
> > > > > > +    *first = range.first;
> > > > > > +    *last = range.last;
> > > > > > +    trace_vhost_vdpa_get_iova_range(dev, *first, *last);
> > > > > > +    return ret;
> > > > > > +}
> > > > > > +
> > > > > >   /**
> > > > > >    * Maps QEMU vaddr memory to device in a suitable way for shadow virtqueue:
> > > > > >    * - It always reference qemu memory address, not guest's memory.
> > > > > > @@ -881,6 +910,7 @@ static bool vhost_vdpa_svq_start_vq(struct vhost_dev *dev, unsigned idx)
> > > > > >   static unsigned vhost_vdpa_enable_svq(struct vhost_vdpa *v, bool enable)
> > > > > >   {
> > > > > >       struct vhost_dev *hdev = v->dev;
> > > > > > +    hwaddr iova_first, iova_last;
> > > > > >       unsigned n;
> > > > > >       int r;
> > > > > >
> > > > > > @@ -894,7 +924,7 @@ static unsigned vhost_vdpa_enable_svq(struct vhost_vdpa *v, bool enable)
> > > > > >           /* Allocate resources */
> > > > > >           assert(v->shadow_vqs->len == 0);
> > > > > >           for (n = 0; n < hdev->nvqs; ++n) {
> > > > > > -            VhostShadowVirtqueue *svq = vhost_svq_new(hdev, n);
> > > > > > +            VhostShadowVirtqueue *svq = vhost_svq_new(hdev, n, v->iova_map);
> > > > > >               if (unlikely(!svq)) {
> > > > > >                   g_ptr_array_set_size(v->shadow_vqs, 0);
> > > > > >                   return 0;
> > > > > > @@ -903,6 +933,8 @@ static unsigned vhost_vdpa_enable_svq(struct vhost_vdpa *v, bool enable)
> > > > > >           }
> > > > > >       }
> > > > > >
> > > > > > +    r = vhost_vdpa_get_iova_range(hdev, &iova_first, &iova_last);
> > > > > > +    assert(r == 0);
> > > > > >       r = vhost_vdpa_vring_pause(hdev);
> > > > > >       assert(r == 0);
> > > > > >
> > > > > > @@ -913,6 +945,12 @@ static unsigned vhost_vdpa_enable_svq(struct vhost_vdpa *v, bool enable)
> > > > > >           }
> > > > > >       }
> > > > > >
> > > > > > +    memory_listener_unregister(&v->listener);
> > > > > > +    if (vhost_vdpa_dma_unmap(v, iova_first,
> > > > > > +                             (iova_last - iova_first) & TARGET_PAGE_MASK)) {
> > > > > > +        error_report("Fail to invalidate device iotlb");
> > > > > > +    }
> > > > > > +
> > > > > >       /* Reset device so it can be configured */
> > > > > >       r = vhost_vdpa_dev_start(hdev, false);
> > > > > >       assert(r == 0);
> > > > > > diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events
> > > > > > index 8ed19e9d0c..650e521e35 100644
> > > > > > --- a/hw/virtio/trace-events
> > > > > > +++ b/hw/virtio/trace-events
> > > > > > @@ -52,6 +52,7 @@ vhost_vdpa_set_vring_call(void *dev, unsigned int index, int fd) "dev: %p index:
> > > > > >   vhost_vdpa_get_features(void *dev, uint64_t features) "dev: %p features: 0x%"PRIx64
> > > > > >   vhost_vdpa_set_owner(void *dev) "dev: %p"
> > > > > >   vhost_vdpa_vq_get_addr(void *dev, void *vq, uint64_t desc_user_addr, uint64_t avail_user_addr, uint64_t used_user_addr) "dev: %p vq: %p desc_user_addr: 0x%"PRIx64" avail_user_addr: 0x%"PRIx64" used_user_addr: 0x%"PRIx64
> > > > > > +vhost_vdpa_get_iova_range(void *dev, uint64_t first, uint64_t last) "dev: %p first: 0x%"PRIx64" last: 0x%"PRIx64
> > > > > >
> > > > > >   # virtio.c
> > > > > >   virtqueue_alloc_element(void *elem, size_t sz, unsigned in_num, unsigned out_num) "elem %p size %zd in_num %u out_num %u"
> > > > >
> > > >
> >
>


Re: [RFC PATCH v4 20/20] vdpa: Add custom IOTLB translations to SVQ
Posted by Eugenio Perez Martin 4 years, 3 months ago
On Wed, Oct 20, 2021 at 11:03 AM Jason Wang <jasowang@redhat.com> wrote:
>
> On Wed, Oct 20, 2021 at 2:52 PM Eugenio Perez Martin
> <eperezma@redhat.com> wrote:
> >
> > On Wed, Oct 20, 2021 at 4:07 AM Jason Wang <jasowang@redhat.com> wrote:
> > >
> > > On Wed, Oct 20, 2021 at 10:02 AM Jason Wang <jasowang@redhat.com> wrote:
> > > >
> > > > On Tue, Oct 19, 2021 at 6:29 PM Eugenio Perez Martin
> > > > <eperezma@redhat.com> wrote:
> > > > >
> > > > > On Tue, Oct 19, 2021 at 11:25 AM Jason Wang <jasowang@redhat.com> wrote:
> > > > > >
> > > > > >
> > > > > > On 2021/10/1 3:06 PM, Eugenio Pérez wrote:
> > > > > > > Use translations added in VhostIOVATree in SVQ.
> > > > > > >
> > > > > > > Now every element needs to store the previous address also, so VirtQueue
> > > > > > > can consume the elements properly. This adds a little overhead per VQ
> > > > > > > element, having to allocate more memory to stash them. As a possible
> > > > > > > optimization, this allocation could be avoided if the descriptor is not
> > > > > > > a chain but a single one, but this is left undone.
> > > > > > >
> > > > > > > TODO: iova range should be queried before, and add logic to fail when
> > > > > > > GPA is outside of its range and memory listener or svq add it.
> > > > > > >
> > > > > > > Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
> > > > > > > ---
> > > > > > >   hw/virtio/vhost-shadow-virtqueue.h |   4 +-
> > > > > > >   hw/virtio/vhost-shadow-virtqueue.c | 130 ++++++++++++++++++++++++-----
> > > > > > >   hw/virtio/vhost-vdpa.c             |  40 ++++++++-
> > > > > > >   hw/virtio/trace-events             |   1 +
> > > > > > >   4 files changed, 152 insertions(+), 23 deletions(-)
> > > > > > >
> > > > > > > diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
> > > > > > > index b7baa424a7..a0e6b5267a 100644
> > > > > > > --- a/hw/virtio/vhost-shadow-virtqueue.h
> > > > > > > +++ b/hw/virtio/vhost-shadow-virtqueue.h
> > > > > > > @@ -11,6 +11,7 @@
> > > > > > >   #define VHOST_SHADOW_VIRTQUEUE_H
> > > > > > >
> > > > > > >   #include "hw/virtio/vhost.h"
> > > > > > > +#include "hw/virtio/vhost-iova-tree.h"
> > > > > > >
> > > > > > >   typedef struct VhostShadowVirtqueue VhostShadowVirtqueue;
> > > > > > >
> > > > > > > @@ -28,7 +29,8 @@ bool vhost_svq_start(struct vhost_dev *dev, unsigned idx,
> > > > > > >   void vhost_svq_stop(struct vhost_dev *dev, unsigned idx,
> > > > > > >                       VhostShadowVirtqueue *svq);
> > > > > > >
> > > > > > > -VhostShadowVirtqueue *vhost_svq_new(struct vhost_dev *dev, int idx);
> > > > > > > +VhostShadowVirtqueue *vhost_svq_new(struct vhost_dev *dev, int idx,
> > > > > > > +                                    VhostIOVATree *iova_map);
> > > > > > >
> > > > > > >   void vhost_svq_free(VhostShadowVirtqueue *vq);
> > > > > > >
> > > > > > > diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
> > > > > > > index 2fd0bab75d..9db538547e 100644
> > > > > > > --- a/hw/virtio/vhost-shadow-virtqueue.c
> > > > > > > +++ b/hw/virtio/vhost-shadow-virtqueue.c
> > > > > > > @@ -11,12 +11,19 @@
> > > > > > >   #include "hw/virtio/vhost-shadow-virtqueue.h"
> > > > > > >   #include "hw/virtio/vhost.h"
> > > > > > >   #include "hw/virtio/virtio-access.h"
> > > > > > > +#include "hw/virtio/vhost-iova-tree.h"
> > > > > > >
> > > > > > >   #include "standard-headers/linux/vhost_types.h"
> > > > > > >
> > > > > > >   #include "qemu/error-report.h"
> > > > > > >   #include "qemu/main-loop.h"
> > > > > > >
> > > > > > > +typedef struct SVQElement {
> > > > > > > +    VirtQueueElement elem;
> > > > > > > +    void **in_sg_stash;
> > > > > > > +    void **out_sg_stash;
> > > > > > > +} SVQElement;
> > > > > > > +
> > > > > > >   /* Shadow virtqueue to relay notifications */
> > > > > > >   typedef struct VhostShadowVirtqueue {
> > > > > > >       /* Shadow vring */
> > > > > > > @@ -46,8 +53,11 @@ typedef struct VhostShadowVirtqueue {
> > > > > > >       /* Virtio device */
> > > > > > >       VirtIODevice *vdev;
> > > > > > >
> > > > > > > +    /* IOVA mapping if used */
> > > > > > > +    VhostIOVATree *iova_map;
> > > > > > > +
> > > > > > >       /* Map for returning guest's descriptors */
> > > > > > > -    VirtQueueElement **ring_id_maps;
> > > > > > > +    SVQElement **ring_id_maps;
> > > > > > >
> > > > > > >       /* Next head to expose to device */
> > > > > > >       uint16_t avail_idx_shadow;
> > > > > > > @@ -79,13 +89,6 @@ bool vhost_svq_valid_device_features(uint64_t *dev_features)
> > > > > > >               continue;
> > > > > > >
> > > > > > >           case VIRTIO_F_ACCESS_PLATFORM:
> > > > > > > -            /* SVQ needs this feature disabled. Can't continue */
> > > > > > > -            if (*dev_features & BIT_ULL(b)) {
> > > > > > > -                clear_bit(b, dev_features);
> > > > > > > -                r = false;
> > > > > > > -            }
> > > > > > > -            break;
> > > > > > > -
> > > > > > >           case VIRTIO_F_VERSION_1:
> > > > > > >               /* SVQ needs this feature, so can't continue */
> > > > > > >               if (!(*dev_features & BIT_ULL(b))) {
> > > > > > > @@ -126,6 +129,64 @@ static void vhost_svq_set_notification(VhostShadowVirtqueue *svq, bool enable)
> > > > > > >       }
> > > > > > >   }
> > > > > > >
> > > > > > > +static void vhost_svq_stash_addr(void ***stash, const struct iovec *iov,
> > > > > > > +                                 size_t num)
> > > > > > > +{
> > > > > > > +    size_t i;
> > > > > > > +
> > > > > > > +    if (num == 0) {
> > > > > > > +        return;
> > > > > > > +    }
> > > > > > > +
> > > > > > > +    *stash = g_new(void *, num);
> > > > > > > +    for (i = 0; i < num; ++i) {
> > > > > > > +        (*stash)[i] = iov[i].iov_base;
> > > > > > > +    }
> > > > > > > +}
> > > > > > > +
> > > > > > > +static void vhost_svq_unstash_addr(void **stash, struct iovec *iov, size_t num)
> > > > > > > +{
> > > > > > > +    size_t i;
> > > > > > > +
> > > > > > > +    if (num == 0) {
> > > > > > > +        return;
> > > > > > > +    }
> > > > > > > +
> > > > > > > +    for (i = 0; i < num; ++i) {
> > > > > > > +        iov[i].iov_base = stash[i];
> > > > > > > +    }
> > > > > > > +    g_free(stash);
> > > > > > > +}
> > > > > > > +
> > > > > > > +static void vhost_svq_translate_addr(const VhostShadowVirtqueue *svq,
> > > > > > > +                                     struct iovec *iovec, size_t num)
> > > > > > > +{
> > > > > > > +    size_t i;
> > > > > > > +
> > > > > > > +    for (i = 0; i < num; ++i) {
> > > > > > > +        VhostDMAMap needle = {
> > > > > > > +            .translated_addr = iovec[i].iov_base,
> > > > > > > +            .size = iovec[i].iov_len,
> > > > > > > +        };
> > > > > > > +        size_t off;
> > > > > > > +
> > > > > > > +        const VhostDMAMap *map = vhost_iova_tree_find_iova(svq->iova_map,
> > > > > > > +                                                           &needle);
> > > > > >
> > > > > >
> > > > > > Is it possible that we end up with more than one maps here?
> > > > > >
> > > > >
> > > > > Actually it is possible, since there is no guarantee that one
> > > > > descriptor (or indirect descriptor) maps exactly to one iov. It could
> > > > > map to many if qemu vaddr is not contiguous but GPA + size is. This is
> > > > > something that must be fixed for the next revision, so thanks for
> > > > > pointing it out!
> > > > >
> > > > > Taking that into account, the condition that svq vring avail_idx -
> > > > > used_idx was always less or equal than guest's vring avail_idx -
> > > > > used_idx is not true anymore. Checking for that before adding buffers
> > > > > to SVQ is the easy part, but how could we recover in that case?
> > > > >
> > > > > I think that the easy solution is to check for more available buffers
> > > > > unconditionally at the end of vhost_svq_handle_call, which handles the
> > > > > SVQ used and is supposed to make more room for available buffers. So
> > > > > vhost_handle_guest_kick would not check if eventfd is set or not
> > > > > anymore.
> > > > >
> > > > > Would that make sense?
> > > >
> > > > Yes, I think it should work.
> > >
> > > Btw, I wonder how to handle indirect descriptors. SVQ doesn't use
> > > indirect descriptors for now, but it looks like a must otherwise we
> > > may end up SVQ is full before VQ.
> > >
> >
> > We can get to that situation without indirect too, if a single
> > descriptor maps to more than one sg buffer. The next revision is going
> > to control that too.
> >
> > > It looks to me an easy way is to always use indirect descriptors if #sg >= 2?
> > >
> >
> > I will use that, but that does not solve the case where a descriptor
> > maps to > 1 different buffers in qemu vaddr.
>
> Right, so we need to deal with the case when SVQ is out of space.
>
>
> > So I think that some
> > check after marking descriptors as used is a must somehow.
>
> I thought it should be before processing the available buffer?

Yes, I meant after that, more or less: I count checking the
number of sg buffers as part of "processing". :).

> It's
> the guest driver that make sure there's sufficient space for used
> ring?
>

(I think we are saying the same thing with different words, but just in
case I will develop the idea here with an example.)

The guest is able to check whether there is enough space in its own
vring, but not in the shadow vring that SVQ exposes to the device. As an
example, imagine that the guest makes available a GPA-contiguous buffer
of 64K using one descriptor. However, this memory is divided into 16
chunks of 4K in qemu's VA space. Imagine that at this moment there are
only eight slots free in each vring, and that neither side is using
indirect descriptors.

The guest only needs 1 free descriptor to make that buffer available, so
it adds it to the avail ring. But SVQ needs 16 chained descriptors, so
the buffer is not going to reach the device until the device marks at
least 8 more descriptors as used. SVQ checked the amount of available
room, as you said, but it cannot forward the available buffer yet.

Since the guest already sent its kick when it made the descriptor
available, we need another mechanism to know when we have all the
needed free slots in the SVQ vring. And that's what I meant by the
check after SVQ marks some buffers as used.
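
In pseudo-C, the forwarding would look something like this sketch (the
svq->pending field and vhost_svq_element_fits() are invented for the
example, they are not part of this series):

static void vhost_svq_forward_avail(VhostShadowVirtqueue *svq)
{
    while (true) {
        SVQElement *elem = svq->pending;

        if (!elem) {
            elem = virtqueue_pop(svq->vq, sizeof(*elem));
            if (!elem) {
                return;             /* Guest avail ring is empty */
            }
        }

        if (!vhost_svq_element_fits(svq, &elem->elem)) {
            /* Not enough free shadow descriptors yet, retry later */
            svq->pending = elem;
            return;
        }

        svq->pending = NULL;
        vhost_svq_add(svq, elem);   /* Translate, stash and expose it */
    }
}

Calling it unconditionally at the end of vhost_svq_handle_call, right
after the used buffers have been flushed to the guest, is the check I am
referring to: that is the moment when shadow descriptors get freed.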

I still think it is not worth it to prevent the forwarding methods from
hogging the BQL, since there must be a limit sooner or later, but it is
something that is worth putting on the table again. But this requires
changes for the next version for sure.
I can think of more scenarios, like the guest making available an indirect
descriptor of vq size that needs to be split into even more sgs. Qemu
already does not support more than 1024 sg buffers in VirtQueue, but
a driver (as SVQ is) must *not* create an indirect descriptor chain
longer than the Queue Size. Should we always increase the vq size to
1024? I think these are highly unlikely, but again these concerns
must at least be commented here.
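
For instance, something as trivial as this is the kind of limit I mean
(illustration only; the helper does not exist in this series, and
VIRTIO_QUEUE_MAX is qemu's 1024 cap on a VirtQueue):

static bool vhost_svq_chain_fits(const VhostShadowVirtqueue *svq,
                                 size_t num_sgs)
{
    /* As a driver, SVQ must not chain more descriptors than the queue size */
    return num_sgs <= svq->vring.num && num_sgs <= VIRTIO_QUEUE_MAX;
}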

Does it make sense?

Thanks!

> Thanks
>
> >
> >
> > > Thanks
> > >
> > > >
> > > > Thanks
> > > >
> > > > >
> > > > > Thanks!
> > > > >
> > > > > >
> > > > > > > +        /*
> > > > > > > +         * Map cannot be NULL since iova map contains all guest space and
> > > > > > > +         * qemu already has a physical address mapped
> > > > > > > +         */
> > > > > > > +        assert(map);
> > > > > > > +
> > > > > > > +        /*
> > > > > > > +         * Map->iova chunk size is ignored. What to do if descriptor
> > > > > > > +         * (addr, size) does not fit is delegated to the device.
> > > > > > > +         */
> > > > > > > +        off = needle.translated_addr - map->translated_addr;
> > > > > > > +        iovec[i].iov_base = (void *)(map->iova + off);
> > > > > > > +    }
> > > > > > > +}
> > > > > > > +
> > > > > > >   static void vhost_vring_write_descs(VhostShadowVirtqueue *svq,
> > > > > > >                                       const struct iovec *iovec,
> > > > > > >                                       size_t num, bool more_descs, bool write)
> > > > > > > @@ -156,8 +217,9 @@ static void vhost_vring_write_descs(VhostShadowVirtqueue *svq,
> > > > > > >   }
> > > > > > >
> > > > > > >   static unsigned vhost_svq_add_split(VhostShadowVirtqueue *svq,
> > > > > > > -                                    VirtQueueElement *elem)
> > > > > > > +                                    SVQElement *svq_elem)
> > > > > > >   {
> > > > > > > +    VirtQueueElement *elem = &svq_elem->elem;
> > > > > > >       int head;
> > > > > > >       unsigned avail_idx;
> > > > > > >       vring_avail_t *avail = svq->vring.avail;
> > > > > > > @@ -167,6 +229,12 @@ static unsigned vhost_svq_add_split(VhostShadowVirtqueue *svq,
> > > > > > >       /* We need some descriptors here */
> > > > > > >       assert(elem->out_num || elem->in_num);
> > > > > > >
> > > > > > > +    vhost_svq_stash_addr(&svq_elem->in_sg_stash, elem->in_sg, elem->in_num);
> > > > > > > +    vhost_svq_stash_addr(&svq_elem->out_sg_stash, elem->out_sg, elem->out_num);
> > > > > >
> > > > > >
> > > > > > I wonder if we can solve the trick like stash and unstash with a
> > > > > > dedicated sgs in svq_elem, instead of reusing the elem.
> > > > > >
> > > > >
> > > > > Actually yes, it would be way simpler to use a new sgs array in
> > > > > svq_elem. I will change that.
> > > > >
> > > > > Thanks!
> > > > >
> > > > > > Thanks
> > > > > >
> > > > > >
> > > > > > > +
> > > > > > > +    vhost_svq_translate_addr(svq, elem->in_sg, elem->in_num);
> > > > > > > +    vhost_svq_translate_addr(svq, elem->out_sg, elem->out_num);
> > > > > > > +
> > > > > > >       vhost_vring_write_descs(svq, elem->out_sg, elem->out_num,
> > > > > > >                               elem->in_num > 0, false);
> > > > > > >       vhost_vring_write_descs(svq, elem->in_sg, elem->in_num, false, true);
> > > > > > > @@ -187,7 +255,7 @@ static unsigned vhost_svq_add_split(VhostShadowVirtqueue *svq,
> > > > > > >
> > > > > > >   }
> > > > > > >
> > > > > > > -static void vhost_svq_add(VhostShadowVirtqueue *svq, VirtQueueElement *elem)
> > > > > > > +static void vhost_svq_add(VhostShadowVirtqueue *svq, SVQElement *elem)
> > > > > > >   {
> > > > > > >       unsigned qemu_head = vhost_svq_add_split(svq, elem);
> > > > > > >
> > > > > > > @@ -221,7 +289,7 @@ static void vhost_handle_guest_kick(EventNotifier *n)
> > > > > > >           }
> > > > > > >
> > > > > > >           while (true) {
> > > > > > > -            VirtQueueElement *elem = virtqueue_pop(svq->vq, sizeof(*elem));
> > > > > > > +            SVQElement *elem = virtqueue_pop(svq->vq, sizeof(*elem));
> > > > > > >               if (!elem) {
> > > > > > >                   break;
> > > > > > >               }
> > > > > > > @@ -247,7 +315,7 @@ static bool vhost_svq_more_used(VhostShadowVirtqueue *svq)
> > > > > > >       return svq->used_idx != svq->shadow_used_idx;
> > > > > > >   }
> > > > > > >
> > > > > > > -static VirtQueueElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq)
> > > > > > > +static SVQElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq)
> > > > > > >   {
> > > > > > >       vring_desc_t *descs = svq->vring.desc;
> > > > > > >       const vring_used_t *used = svq->vring.used;
> > > > > > > @@ -279,7 +347,7 @@ static VirtQueueElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq)
> > > > > > >       descs[used_elem.id].next = svq->free_head;
> > > > > > >       svq->free_head = used_elem.id;
> > > > > > >
> > > > > > > -    svq->ring_id_maps[used_elem.id]->len = used_elem.len;
> > > > > > > +    svq->ring_id_maps[used_elem.id]->elem.len = used_elem.len;
> > > > > > >       return g_steal_pointer(&svq->ring_id_maps[used_elem.id]);
> > > > > > >   }
> > > > > > >
> > > > > > > @@ -296,12 +364,19 @@ static void vhost_svq_handle_call_no_test(EventNotifier *n)
> > > > > > >
> > > > > > >           vhost_svq_set_notification(svq, false);
> > > > > > >           while (true) {
> > > > > > > -            g_autofree VirtQueueElement *elem = vhost_svq_get_buf(svq);
> > > > > > > -            if (!elem) {
> > > > > > > +            g_autofree SVQElement *svq_elem = vhost_svq_get_buf(svq);
> > > > > > > +            VirtQueueElement *elem;
> > > > > > > +            if (!svq_elem) {
> > > > > > >                   break;
> > > > > > >               }
> > > > > > >
> > > > > > >               assert(i < svq->vring.num);
> > > > > > > +            elem = &svq_elem->elem;
> > > > > > > +
> > > > > > > +            vhost_svq_unstash_addr(svq_elem->in_sg_stash, elem->in_sg,
> > > > > > > +                                   elem->in_num);
> > > > > > > +            vhost_svq_unstash_addr(svq_elem->out_sg_stash, elem->out_sg,
> > > > > > > +                                   elem->out_num);
> > > > > > >               virtqueue_fill(vq, elem, elem->len, i++);
> > > > > > >           }
> > > > > > >
> > > > > > > @@ -451,14 +526,24 @@ void vhost_svq_stop(struct vhost_dev *dev, unsigned idx,
> > > > > > >       event_notifier_set_handler(&svq->host_notifier, NULL);
> > > > > > >
> > > > > > >       for (i = 0; i < svq->vring.num; ++i) {
> > > > > > > -        g_autofree VirtQueueElement *elem = svq->ring_id_maps[i];
> > > > > > > +        g_autofree SVQElement *svq_elem = svq->ring_id_maps[i];
> > > > > > > +        VirtQueueElement *elem;
> > > > > > > +
> > > > > > > +        if (!svq_elem) {
> > > > > > > +            continue;
> > > > > > > +        }
> > > > > > > +
> > > > > > > +        elem = &svq_elem->elem;
> > > > > > > +        vhost_svq_unstash_addr(svq_elem->in_sg_stash, elem->in_sg,
> > > > > > > +                               elem->in_num);
> > > > > > > +        vhost_svq_unstash_addr(svq_elem->out_sg_stash, elem->out_sg,
> > > > > > > +                               elem->out_num);
> > > > > > > +
> > > > > > >           /*
> > > > > > >            * Although the doc says we must unpop in order, it's ok to unpop
> > > > > > >            * everything.
> > > > > > >            */
> > > > > > > -        if (elem) {
> > > > > > > -            virtqueue_unpop(svq->vq, elem, elem->len);
> > > > > > > -        }
> > > > > > > +        virtqueue_unpop(svq->vq, elem, elem->len);
> > > > > > >       }
> > > > > > >   }
> > > > > > >
> > > > > > > @@ -466,7 +551,8 @@ void vhost_svq_stop(struct vhost_dev *dev, unsigned idx,
> > > > > > >    * Creates vhost shadow virtqueue, and instruct vhost device to use the shadow
> > > > > > >    * methods and file descriptors.
> > > > > > >    */
> > > > > > > -VhostShadowVirtqueue *vhost_svq_new(struct vhost_dev *dev, int idx)
> > > > > > > +VhostShadowVirtqueue *vhost_svq_new(struct vhost_dev *dev, int idx,
> > > > > > > +                                    VhostIOVATree *iova_map)
> > > > > > >   {
> > > > > > >       int vq_idx = dev->vq_index + idx;
> > > > > > >       unsigned num = virtio_queue_get_num(dev->vdev, vq_idx);
> > > > > > > @@ -500,11 +586,13 @@ VhostShadowVirtqueue *vhost_svq_new(struct vhost_dev *dev, int idx)
> > > > > > >       memset(svq->vring.desc, 0, driver_size);
> > > > > > >       svq->vring.used = qemu_memalign(qemu_real_host_page_size, device_size);
> > > > > > >       memset(svq->vring.used, 0, device_size);
> > > > > > > +    svq->iova_map = iova_map;
> > > > > > > +
> > > > > > >       for (i = 0; i < num - 1; i++) {
> > > > > > >           svq->vring.desc[i].next = cpu_to_le16(i + 1);
> > > > > > >       }
> > > > > > >
> > > > > > > -    svq->ring_id_maps = g_new0(VirtQueueElement *, num);
> > > > > > > +    svq->ring_id_maps = g_new0(SVQElement *, num);
> > > > > > >       event_notifier_set_handler(&svq->call_notifier,
> > > > > > >                                  vhost_svq_handle_call);
> > > > > > >       return g_steal_pointer(&svq);
> > > > > > > diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
> > > > > > > index a9c680b487..f5a12fee9d 100644
> > > > > > > --- a/hw/virtio/vhost-vdpa.c
> > > > > > > +++ b/hw/virtio/vhost-vdpa.c
> > > > > > > @@ -176,6 +176,18 @@ static void vhost_vdpa_listener_region_add(MemoryListener *listener,
> > > > > > >                                            vaddr, section->readonly);
> > > > > > >
> > > > > > >       llsize = int128_sub(llend, int128_make64(iova));
> > > > > > > +    if (v->shadow_vqs_enabled) {
> > > > > > > +        VhostDMAMap mem_region = {
> > > > > > > +            .translated_addr = vaddr,
> > > > > > > +            .size = int128_get64(llsize) - 1,
> > > > > > > +            .perm = IOMMU_ACCESS_FLAG(true, section->readonly),
> > > > > > > +        };
> > > > > > > +
> > > > > > > +        int r = vhost_iova_tree_alloc(v->iova_map, &mem_region);
> > > > > > > +        assert(r == VHOST_DMA_MAP_OK);
> > > > > > > +
> > > > > > > +        iova = mem_region.iova;
> > > > > > > +    }
> > > > > > >
> > > > > > >       ret = vhost_vdpa_dma_map(v, iova, int128_get64(llsize),
> > > > > > >                                vaddr, section->readonly);
> > > > > > > @@ -754,6 +766,23 @@ static bool  vhost_vdpa_force_iommu(struct vhost_dev *dev)
> > > > > > >       return true;
> > > > > > >   }
> > > > > > >
> > > > > > > +static int vhost_vdpa_get_iova_range(struct vhost_dev *dev,
> > > > > > > +                                     hwaddr *first, hwaddr *last)
> > > > > > > +{
> > > > > > > +    int ret;
> > > > > > > +    struct vhost_vdpa_iova_range range;
> > > > > > > +
> > > > > > > +    ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_IOVA_RANGE, &range);
> > > > > > > +    if (ret != 0) {
> > > > > > > +        return ret;
> > > > > > > +    }
> > > > > > > +
> > > > > > > +    *first = range.first;
> > > > > > > +    *last = range.last;
> > > > > > > +    trace_vhost_vdpa_get_iova_range(dev, *first, *last);
> > > > > > > +    return ret;
> > > > > > > +}
> > > > > > > +
> > > > > > >   /**
> > > > > > >    * Maps QEMU vaddr memory to device in a suitable way for shadow virtqueue:
> > > > > > >    * - It always reference qemu memory address, not guest's memory.
> > > > > > > @@ -881,6 +910,7 @@ static bool vhost_vdpa_svq_start_vq(struct vhost_dev *dev, unsigned idx)
> > > > > > >   static unsigned vhost_vdpa_enable_svq(struct vhost_vdpa *v, bool enable)
> > > > > > >   {
> > > > > > >       struct vhost_dev *hdev = v->dev;
> > > > > > > +    hwaddr iova_first, iova_last;
> > > > > > >       unsigned n;
> > > > > > >       int r;
> > > > > > >
> > > > > > > @@ -894,7 +924,7 @@ static unsigned vhost_vdpa_enable_svq(struct vhost_vdpa *v, bool enable)
> > > > > > >           /* Allocate resources */
> > > > > > >           assert(v->shadow_vqs->len == 0);
> > > > > > >           for (n = 0; n < hdev->nvqs; ++n) {
> > > > > > > -            VhostShadowVirtqueue *svq = vhost_svq_new(hdev, n);
> > > > > > > +            VhostShadowVirtqueue *svq = vhost_svq_new(hdev, n, v->iova_map);
> > > > > > >               if (unlikely(!svq)) {
> > > > > > >                   g_ptr_array_set_size(v->shadow_vqs, 0);
> > > > > > >                   return 0;
> > > > > > > @@ -903,6 +933,8 @@ static unsigned vhost_vdpa_enable_svq(struct vhost_vdpa *v, bool enable)
> > > > > > >           }
> > > > > > >       }
> > > > > > >
> > > > > > > +    r = vhost_vdpa_get_iova_range(hdev, &iova_first, &iova_last);
> > > > > > > +    assert(r == 0);
> > > > > > >       r = vhost_vdpa_vring_pause(hdev);
> > > > > > >       assert(r == 0);
> > > > > > >
> > > > > > > @@ -913,6 +945,12 @@ static unsigned vhost_vdpa_enable_svq(struct vhost_vdpa *v, bool enable)
> > > > > > >           }
> > > > > > >       }
> > > > > > >
> > > > > > > +    memory_listener_unregister(&v->listener);
> > > > > > > +    if (vhost_vdpa_dma_unmap(v, iova_first,
> > > > > > > +                             (iova_last - iova_first) & TARGET_PAGE_MASK)) {
> > > > > > > +        error_report("Fail to invalidate device iotlb");
> > > > > > > +    }
> > > > > > > +
> > > > > > >       /* Reset device so it can be configured */
> > > > > > >       r = vhost_vdpa_dev_start(hdev, false);
> > > > > > >       assert(r == 0);
> > > > > > > diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events
> > > > > > > index 8ed19e9d0c..650e521e35 100644
> > > > > > > --- a/hw/virtio/trace-events
> > > > > > > +++ b/hw/virtio/trace-events
> > > > > > > @@ -52,6 +52,7 @@ vhost_vdpa_set_vring_call(void *dev, unsigned int index, int fd) "dev: %p index:
> > > > > > >   vhost_vdpa_get_features(void *dev, uint64_t features) "dev: %p features: 0x%"PRIx64
> > > > > > >   vhost_vdpa_set_owner(void *dev) "dev: %p"
> > > > > > >   vhost_vdpa_vq_get_addr(void *dev, void *vq, uint64_t desc_user_addr, uint64_t avail_user_addr, uint64_t used_user_addr) "dev: %p vq: %p desc_user_addr: 0x%"PRIx64" avail_user_addr: 0x%"PRIx64" used_user_addr: 0x%"PRIx64
> > > > > > > +vhost_vdpa_get_iova_range(void *dev, uint64_t first, uint64_t last) "dev: %p first: 0x%"PRIx64" last: 0x%"PRIx64
> > > > > > >
> > > > > > >   # virtio.c
> > > > > > >   virtqueue_alloc_element(void *elem, size_t sz, unsigned in_num, unsigned out_num) "elem %p size %zd in_num %u out_num %u"
> > > > > >
> > > > >
> > >
> >
>


Re: [RFC PATCH v4 20/20] vdpa: Add custom IOTLB translations to SVQ
Posted by Jason Wang 4 years, 3 months ago
On Wed, Oct 20, 2021 at 7:57 PM Eugenio Perez Martin
<eperezma@redhat.com> wrote:
>
> On Wed, Oct 20, 2021 at 11:03 AM Jason Wang <jasowang@redhat.com> wrote:
> >
> > On Wed, Oct 20, 2021 at 2:52 PM Eugenio Perez Martin
> > <eperezma@redhat.com> wrote:
> > >
> > > On Wed, Oct 20, 2021 at 4:07 AM Jason Wang <jasowang@redhat.com> wrote:
> > > >
> > > > On Wed, Oct 20, 2021 at 10:02 AM Jason Wang <jasowang@redhat.com> wrote:
> > > > >
> > > > > On Tue, Oct 19, 2021 at 6:29 PM Eugenio Perez Martin
> > > > > <eperezma@redhat.com> wrote:
> > > > > >
> > > > > > On Tue, Oct 19, 2021 at 11:25 AM Jason Wang <jasowang@redhat.com> wrote:
> > > > > > >
> > > > > > >
> > > > > > > On 2021/10/1 3:06 PM, Eugenio Pérez wrote:
> > > > > > > > Use translations added in VhostIOVATree in SVQ.
> > > > > > > >
> > > > > > > > Now every element needs to store the previous address also, so VirtQueue
> > > > > > > > can consume the elements properly. This adds a little overhead per VQ
> > > > > > > > element, having to allocate more memory to stash them. As a possible
> > > > > > > > optimization, this allocation could be avoided if the descriptor is not
> > > > > > > > a chain but a single one, but this is left undone.
> > > > > > > >
> > > > > > > > TODO: iova range should be queried before, and add logic to fail when
> > > > > > > > GPA is outside of its range and memory listener or svq add it.
> > > > > > > >
> > > > > > > > Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
> > > > > > > > ---
> > > > > > > >   hw/virtio/vhost-shadow-virtqueue.h |   4 +-
> > > > > > > >   hw/virtio/vhost-shadow-virtqueue.c | 130 ++++++++++++++++++++++++-----
> > > > > > > >   hw/virtio/vhost-vdpa.c             |  40 ++++++++-
> > > > > > > >   hw/virtio/trace-events             |   1 +
> > > > > > > >   4 files changed, 152 insertions(+), 23 deletions(-)
> > > > > > > >
> > > > > > > > diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
> > > > > > > > index b7baa424a7..a0e6b5267a 100644
> > > > > > > > --- a/hw/virtio/vhost-shadow-virtqueue.h
> > > > > > > > +++ b/hw/virtio/vhost-shadow-virtqueue.h
> > > > > > > > @@ -11,6 +11,7 @@
> > > > > > > >   #define VHOST_SHADOW_VIRTQUEUE_H
> > > > > > > >
> > > > > > > >   #include "hw/virtio/vhost.h"
> > > > > > > > +#include "hw/virtio/vhost-iova-tree.h"
> > > > > > > >
> > > > > > > >   typedef struct VhostShadowVirtqueue VhostShadowVirtqueue;
> > > > > > > >
> > > > > > > > @@ -28,7 +29,8 @@ bool vhost_svq_start(struct vhost_dev *dev, unsigned idx,
> > > > > > > >   void vhost_svq_stop(struct vhost_dev *dev, unsigned idx,
> > > > > > > >                       VhostShadowVirtqueue *svq);
> > > > > > > >
> > > > > > > > -VhostShadowVirtqueue *vhost_svq_new(struct vhost_dev *dev, int idx);
> > > > > > > > +VhostShadowVirtqueue *vhost_svq_new(struct vhost_dev *dev, int idx,
> > > > > > > > +                                    VhostIOVATree *iova_map);
> > > > > > > >
> > > > > > > >   void vhost_svq_free(VhostShadowVirtqueue *vq);
> > > > > > > >
> > > > > > > > diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
> > > > > > > > index 2fd0bab75d..9db538547e 100644
> > > > > > > > --- a/hw/virtio/vhost-shadow-virtqueue.c
> > > > > > > > +++ b/hw/virtio/vhost-shadow-virtqueue.c
> > > > > > > > @@ -11,12 +11,19 @@
> > > > > > > >   #include "hw/virtio/vhost-shadow-virtqueue.h"
> > > > > > > >   #include "hw/virtio/vhost.h"
> > > > > > > >   #include "hw/virtio/virtio-access.h"
> > > > > > > > +#include "hw/virtio/vhost-iova-tree.h"
> > > > > > > >
> > > > > > > >   #include "standard-headers/linux/vhost_types.h"
> > > > > > > >
> > > > > > > >   #include "qemu/error-report.h"
> > > > > > > >   #include "qemu/main-loop.h"
> > > > > > > >
> > > > > > > > +typedef struct SVQElement {
> > > > > > > > +    VirtQueueElement elem;
> > > > > > > > +    void **in_sg_stash;
> > > > > > > > +    void **out_sg_stash;
> > > > > > > > +} SVQElement;
> > > > > > > > +
> > > > > > > >   /* Shadow virtqueue to relay notifications */
> > > > > > > >   typedef struct VhostShadowVirtqueue {
> > > > > > > >       /* Shadow vring */
> > > > > > > > @@ -46,8 +53,11 @@ typedef struct VhostShadowVirtqueue {
> > > > > > > >       /* Virtio device */
> > > > > > > >       VirtIODevice *vdev;
> > > > > > > >
> > > > > > > > +    /* IOVA mapping if used */
> > > > > > > > +    VhostIOVATree *iova_map;
> > > > > > > > +
> > > > > > > >       /* Map for returning guest's descriptors */
> > > > > > > > -    VirtQueueElement **ring_id_maps;
> > > > > > > > +    SVQElement **ring_id_maps;
> > > > > > > >
> > > > > > > >       /* Next head to expose to device */
> > > > > > > >       uint16_t avail_idx_shadow;
> > > > > > > > @@ -79,13 +89,6 @@ bool vhost_svq_valid_device_features(uint64_t *dev_features)
> > > > > > > >               continue;
> > > > > > > >
> > > > > > > >           case VIRTIO_F_ACCESS_PLATFORM:
> > > > > > > > -            /* SVQ needs this feature disabled. Can't continue */
> > > > > > > > -            if (*dev_features & BIT_ULL(b)) {
> > > > > > > > -                clear_bit(b, dev_features);
> > > > > > > > -                r = false;
> > > > > > > > -            }
> > > > > > > > -            break;
> > > > > > > > -
> > > > > > > >           case VIRTIO_F_VERSION_1:
> > > > > > > >               /* SVQ needs this feature, so can't continue */
> > > > > > > >               if (!(*dev_features & BIT_ULL(b))) {
> > > > > > > > @@ -126,6 +129,64 @@ static void vhost_svq_set_notification(VhostShadowVirtqueue *svq, bool enable)
> > > > > > > >       }
> > > > > > > >   }
> > > > > > > >
> > > > > > > > +static void vhost_svq_stash_addr(void ***stash, const struct iovec *iov,
> > > > > > > > +                                 size_t num)
> > > > > > > > +{
> > > > > > > > +    size_t i;
> > > > > > > > +
> > > > > > > > +    if (num == 0) {
> > > > > > > > +        return;
> > > > > > > > +    }
> > > > > > > > +
> > > > > > > > +    *stash = g_new(void *, num);
> > > > > > > > +    for (i = 0; i < num; ++i) {
> > > > > > > > +        (*stash)[i] = iov[i].iov_base;
> > > > > > > > +    }
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +static void vhost_svq_unstash_addr(void **stash, struct iovec *iov, size_t num)
> > > > > > > > +{
> > > > > > > > +    size_t i;
> > > > > > > > +
> > > > > > > > +    if (num == 0) {
> > > > > > > > +        return;
> > > > > > > > +    }
> > > > > > > > +
> > > > > > > > +    for (i = 0; i < num; ++i) {
> > > > > > > > +        iov[i].iov_base = stash[i];
> > > > > > > > +    }
> > > > > > > > +    g_free(stash);
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +static void vhost_svq_translate_addr(const VhostShadowVirtqueue *svq,
> > > > > > > > +                                     struct iovec *iovec, size_t num)
> > > > > > > > +{
> > > > > > > > +    size_t i;
> > > > > > > > +
> > > > > > > > +    for (i = 0; i < num; ++i) {
> > > > > > > > +        VhostDMAMap needle = {
> > > > > > > > +            .translated_addr = iovec[i].iov_base,
> > > > > > > > +            .size = iovec[i].iov_len,
> > > > > > > > +        };
> > > > > > > > +        size_t off;
> > > > > > > > +
> > > > > > > > +        const VhostDMAMap *map = vhost_iova_tree_find_iova(svq->iova_map,
> > > > > > > > +                                                           &needle);
> > > > > > >
> > > > > > >
> > > > > > > Is it possible that we end up with more than one map here?
> > > > > > >
> > > > > >
> > > > > > Actually it is possible, since there is no guarantee that one
> > > > > > descriptor (or indirect descriptor) maps exactly to one iov. It could
> > > > > > map to many if qemu vaddr is not contiguous but GPA + size is. This is
> > > > > > something that must be fixed for the next revision, so thanks for
> > > > > > pointing it out!
> > > > > >
> > > > > > Taking that into account, the condition that svq vring avail_idx -
> > > > > > used_idx was always less than or equal to the guest's vring
> > > > > > avail_idx - used_idx is not true anymore. Checking for that before
> > > > > > adding buffers to SVQ is the easy part, but how could we recover in
> > > > > > that case?
> > > > > >
> > > > > > I think that the easy solution is to check for more available
> > > > > > buffers unconditionally at the end of vhost_svq_handle_call, which
> > > > > > handles the SVQ used ring and is supposed to make more room for
> > > > > > available buffers. So vhost_handle_guest_kick would not check
> > > > > > whether the eventfd is set anymore.
> > > > > >
> > > > > > Would that make sense?
> > > > >
> > > > > Yes, I think it should work.
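
For reference, the splitting discussed above could look roughly like the
sketch below. It is not part of the posted patch: the helper name, the
caller-provided output array and its capacity are made up, while
VhostDMAMap, vhost_iova_tree_find_iova() and svq->iova_map come from the
series.

static size_t vhost_svq_translate_one(const VhostShadowVirtqueue *svq,
                                      const struct iovec *in,
                                      struct iovec *out, size_t out_max)
{
    void *addr = in->iov_base;
    size_t left = in->iov_len;
    size_t n = 0;

    while (left > 0 && n < out_max) {
        /* Find the tree entry that covers the current host address */
        VhostDMAMap needle = {
            .translated_addr = addr,
            .size = left,
        };
        const VhostDMAMap *map = vhost_iova_tree_find_iova(svq->iova_map,
                                                           &needle);
        size_t off, chunk;

        assert(map);
        off = (char *)addr - (char *)map->translated_addr;
        /* Assuming .size stores length - 1, as the listener hunk does */
        chunk = MIN(left, map->size + 1 - off);

        out[n].iov_base = (void *)(map->iova + off);
        out[n].iov_len = chunk;
        n++;

        addr = (char *)addr + chunk;
        left -= chunk;
    }

    return n;
}
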
> > > >
> > > > Btw, I wonder how to handle indirect descriptors. SVQ doesn't use
> > > > indirect descriptors for now, but it looks like a must; otherwise we
> > > > may end up with SVQ full before the guest VQ.
> > > >
> > >
> > > We can get to that situation without indirect descriptors too, if a
> > > single descriptor maps to more than one sg buffer. The next revision
> > > is going to control that too.
> > >
> > > > It looks to me that an easy way is to always use indirect descriptors if #sg >= 2?
> > > >
> > >
> > > I will use that, but that does not solve the case where a descriptor
> > > maps to > 1 different buffers in qemu vaddr.
> >
> > Right, so we need to deal with the case when SVQ is out of space.
> >
> >
> > > So I think that some
> > > check after marking descriptors as used is a must somehow.
> >
> > I thought it should be before processing the available buffer?
>
> Yes, I meant after that. Somehow, because I include checking the
> number of sg buffers as "processing". :).
>
> > It's
> > the guest driver that makes sure there's sufficient space for the used
> > ring?
> >
>
> (I think we are saying the same thing with different words, but just in
> case I will develop the idea here with an example).
>
> The guest is able to check if there is enough space in the SVQ's
> vring, but not in the device's vring. As an example of this, imagine
> that a guest makes available a GPA-contiguous buffer of 64K as one
> descriptor. However, this memory is divided into 16 chunks of 4K in
> qemu's VA space. Imagine that at this moment there are only eight
> slots free in each vring, and that neither side is using indirect
> descriptors.
>
> The guest only needs 1 descriptor available to make that buffer
> available, so it will add it to the avail ring. But SVQ needs 16
> chained descriptors, so the buffer is not going to reach the device
> until the device marks at least 8 more descriptors as used. SVQ checked
> for the amount of available room, as you said, but it cannot forward
> the available one.
>
> Since the guest already sent the kick when it made the descriptor
> available, we need another mechanism to know when we have all the
> needed free slots in the SVQ vring. And that's what I meant by the
> check after marking some buffers as available.
>
> I still think it is not worth it to protect the forwarding methods
> from hogging the BQL, since there must be a limit sooner or later, but
> it is something that is worth putting on the table again. But this
> requires changes for the next version for sure.

Ok.
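
A minimal sketch of that check, done before an element is exposed to the
device (not in this patch; svq->num_free is a made-up counter of free
shadow descriptors):

static bool vhost_svq_fits(const VhostShadowVirtqueue *svq,
                           size_t out_num, size_t in_num)
{
    /*
     * out_num/in_num are assumed to be counted after translation, since
     * one guest descriptor can expand into several host-VA chunks.  If
     * this returns false, the element would be kept aside and retried
     * once vhost_svq_get_buf() has returned enough descriptors to the
     * free list, instead of waiting for another guest kick.
     */
    return out_num + in_num <= svq->num_free;
}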

>
> I can think of more scenarios, like the guest making available an
> indirect descriptor of vq size that needs to be split into even more
> sgs. Qemu already does not support more than 1024 sg buffers in a
> VirtQueue, but a driver (such as SVQ) must *not* create an indirect
> descriptor chain longer than the Queue Size. Should we always increase
> the vq size to 1024? I think these are highly unlikely, but again these
> concerns must at least be commented here.
>
> Does it make sense?

Right. So I think the SVQ code should be ready to handle all those cases.

Thanks
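
The "indirect if #sg >= 2" idea could be dispatched roughly as below;
this is only an illustration, since vhost_svq_add_indirect(),
vhost_svq_add_direct() and svq->has_indirect do not exist in the series:

static void vhost_svq_add_element(VhostShadowVirtqueue *svq, SVQElement *elem)
{
    size_t descs = elem->elem.out_num + elem->elem.in_num;

    /*
     * Every entry of a direct chain consumes one slot of the shadow
     * vring, so anything with more than one entry is cheaper as a single
     * indirect descriptor, provided the device offers
     * VIRTIO_RING_F_INDIRECT_DESC.  The chain inside the indirect table
     * must still not exceed the queue size.
     */
    if (descs >= 2 && svq->has_indirect) {
        vhost_svq_add_indirect(svq, elem);   /* hypothetical helper */
    } else {
        vhost_svq_add_direct(svq, elem);     /* hypothetical helper */
    }
}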

>
> Thanks!
>
> > Thanks
> >
> > >
> > >
> > > > Thanks
> > > >
> > > > >
> > > > > Thanks
> > > > >
> > > > > >
> > > > > > Thanks!
> > > > > >
> > > > > > >
> > > > > > > > +        /*
> > > > > > > > +         * Map cannot be NULL since iova map contains all guest space and
> > > > > > > > +         * qemu already has a physical address mapped
> > > > > > > > +         */
> > > > > > > > +        assert(map);
> > > > > > > > +
> > > > > > > > +        /*
> > > > > > > > +         * Map->iova chunk size is ignored. What to do if descriptor
> > > > > > > > +         * (addr, size) does not fit is delegated to the device.
> > > > > > > > +         */
> > > > > > > > +        off = needle.translated_addr - map->translated_addr;
> > > > > > > > +        iovec[i].iov_base = (void *)(map->iova + off);
> > > > > > > > +    }
> > > > > > > > +}
> > > > > > > > +
> > > > > > > >   static void vhost_vring_write_descs(VhostShadowVirtqueue *svq,
> > > > > > > >                                       const struct iovec *iovec,
> > > > > > > >                                       size_t num, bool more_descs, bool write)
> > > > > > > > @@ -156,8 +217,9 @@ static void vhost_vring_write_descs(VhostShadowVirtqueue *svq,
> > > > > > > >   }
> > > > > > > >
> > > > > > > >   static unsigned vhost_svq_add_split(VhostShadowVirtqueue *svq,
> > > > > > > > -                                    VirtQueueElement *elem)
> > > > > > > > +                                    SVQElement *svq_elem)
> > > > > > > >   {
> > > > > > > > +    VirtQueueElement *elem = &svq_elem->elem;
> > > > > > > >       int head;
> > > > > > > >       unsigned avail_idx;
> > > > > > > >       vring_avail_t *avail = svq->vring.avail;
> > > > > > > > @@ -167,6 +229,12 @@ static unsigned vhost_svq_add_split(VhostShadowVirtqueue *svq,
> > > > > > > >       /* We need some descriptors here */
> > > > > > > >       assert(elem->out_num || elem->in_num);
> > > > > > > >
> > > > > > > > +    vhost_svq_stash_addr(&svq_elem->in_sg_stash, elem->in_sg, elem->in_num);
> > > > > > > > +    vhost_svq_stash_addr(&svq_elem->out_sg_stash, elem->out_sg, elem->out_num);
> > > > > > >
> > > > > > >
> > > > > > > I wonder if we can avoid the stash/unstash trick by keeping
> > > > > > > dedicated sgs in svq_elem, instead of reusing the elem.
> > > > > > >
> > > > > >
> > > > > > Actually yes, it would be way simpler to use a new sgs array in
> > > > > > svq_elem. I will change that.
> > > > > >
> > > > > > Thanks!
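
The agreed alternative would keep the translated addresses in their own
arrays, something along these lines (field names made up; only
VirtQueueElement and the overall SVQElement layout come from the patch):

typedef struct SVQElement {
    VirtQueueElement elem;
    /*
     * Translated copies of elem->out_sg / elem->in_sg, used only to
     * write the device's descriptors; the guest-visible addresses in
     * elem are never modified, so nothing has to be restored when the
     * element is returned to the guest.
     */
    struct iovec *out_sg_translated;
    struct iovec *in_sg_translated;
} SVQElement;
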
> > > > > >
> > > > > > > Thanks
> > > > > > >
> > > > > > >
> > > > > > > > +
> > > > > > > > +    vhost_svq_translate_addr(svq, elem->in_sg, elem->in_num);
> > > > > > > > +    vhost_svq_translate_addr(svq, elem->out_sg, elem->out_num);
> > > > > > > > +
> > > > > > > >       vhost_vring_write_descs(svq, elem->out_sg, elem->out_num,
> > > > > > > >                               elem->in_num > 0, false);
> > > > > > > >       vhost_vring_write_descs(svq, elem->in_sg, elem->in_num, false, true);
> > > > > > > > @@ -187,7 +255,7 @@ static unsigned vhost_svq_add_split(VhostShadowVirtqueue *svq,
> > > > > > > >
> > > > > > > >   }
> > > > > > > >
> > > > > > > > -static void vhost_svq_add(VhostShadowVirtqueue *svq, VirtQueueElement *elem)
> > > > > > > > +static void vhost_svq_add(VhostShadowVirtqueue *svq, SVQElement *elem)
> > > > > > > >   {
> > > > > > > >       unsigned qemu_head = vhost_svq_add_split(svq, elem);
> > > > > > > >
> > > > > > > > @@ -221,7 +289,7 @@ static void vhost_handle_guest_kick(EventNotifier *n)
> > > > > > > >           }
> > > > > > > >
> > > > > > > >           while (true) {
> > > > > > > > -            VirtQueueElement *elem = virtqueue_pop(svq->vq, sizeof(*elem));
> > > > > > > > +            SVQElement *elem = virtqueue_pop(svq->vq, sizeof(*elem));
> > > > > > > >               if (!elem) {
> > > > > > > >                   break;
> > > > > > > >               }
> > > > > > > > @@ -247,7 +315,7 @@ static bool vhost_svq_more_used(VhostShadowVirtqueue *svq)
> > > > > > > >       return svq->used_idx != svq->shadow_used_idx;
> > > > > > > >   }
> > > > > > > >
> > > > > > > > -static VirtQueueElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq)
> > > > > > > > +static SVQElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq)
> > > > > > > >   {
> > > > > > > >       vring_desc_t *descs = svq->vring.desc;
> > > > > > > >       const vring_used_t *used = svq->vring.used;
> > > > > > > > @@ -279,7 +347,7 @@ static VirtQueueElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq)
> > > > > > > >       descs[used_elem.id].next = svq->free_head;
> > > > > > > >       svq->free_head = used_elem.id;
> > > > > > > >
> > > > > > > > -    svq->ring_id_maps[used_elem.id]->len = used_elem.len;
> > > > > > > > +    svq->ring_id_maps[used_elem.id]->elem.len = used_elem.len;
> > > > > > > >       return g_steal_pointer(&svq->ring_id_maps[used_elem.id]);
> > > > > > > >   }
> > > > > > > >
> > > > > > > > @@ -296,12 +364,19 @@ static void vhost_svq_handle_call_no_test(EventNotifier *n)
> > > > > > > >
> > > > > > > >           vhost_svq_set_notification(svq, false);
> > > > > > > >           while (true) {
> > > > > > > > -            g_autofree VirtQueueElement *elem = vhost_svq_get_buf(svq);
> > > > > > > > -            if (!elem) {
> > > > > > > > +            g_autofree SVQElement *svq_elem = vhost_svq_get_buf(svq);
> > > > > > > > +            VirtQueueElement *elem;
> > > > > > > > +            if (!svq_elem) {
> > > > > > > >                   break;
> > > > > > > >               }
> > > > > > > >
> > > > > > > >               assert(i < svq->vring.num);
> > > > > > > > +            elem = &svq_elem->elem;
> > > > > > > > +
> > > > > > > > +            vhost_svq_unstash_addr(svq_elem->in_sg_stash, elem->in_sg,
> > > > > > > > +                                   elem->in_num);
> > > > > > > > +            vhost_svq_unstash_addr(svq_elem->out_sg_stash, elem->out_sg,
> > > > > > > > +                                   elem->out_num);
> > > > > > > >               virtqueue_fill(vq, elem, elem->len, i++);
> > > > > > > >           }
> > > > > > > >
> > > > > > > > @@ -451,14 +526,24 @@ void vhost_svq_stop(struct vhost_dev *dev, unsigned idx,
> > > > > > > >       event_notifier_set_handler(&svq->host_notifier, NULL);
> > > > > > > >
> > > > > > > >       for (i = 0; i < svq->vring.num; ++i) {
> > > > > > > > -        g_autofree VirtQueueElement *elem = svq->ring_id_maps[i];
> > > > > > > > +        g_autofree SVQElement *svq_elem = svq->ring_id_maps[i];
> > > > > > > > +        VirtQueueElement *elem;
> > > > > > > > +
> > > > > > > > +        if (!svq_elem) {
> > > > > > > > +            continue;
> > > > > > > > +        }
> > > > > > > > +
> > > > > > > > +        elem = &svq_elem->elem;
> > > > > > > > +        vhost_svq_unstash_addr(svq_elem->in_sg_stash, elem->in_sg,
> > > > > > > > +                               elem->in_num);
> > > > > > > > +        vhost_svq_unstash_addr(svq_elem->out_sg_stash, elem->out_sg,
> > > > > > > > +                               elem->out_num);
> > > > > > > > +
> > > > > > > >           /*
> > > > > > > >            * Although the doc says we must unpop in order, it's ok to unpop
> > > > > > > >            * everything.
> > > > > > > >            */
> > > > > > > > -        if (elem) {
> > > > > > > > -            virtqueue_unpop(svq->vq, elem, elem->len);
> > > > > > > > -        }
> > > > > > > > +        virtqueue_unpop(svq->vq, elem, elem->len);
> > > > > > > >       }
> > > > > > > >   }
> > > > > > > >
> > > > > > > > @@ -466,7 +551,8 @@ void vhost_svq_stop(struct vhost_dev *dev, unsigned idx,
> > > > > > > >    * Creates vhost shadow virtqueue, and instruct vhost device to use the shadow
> > > > > > > >    * methods and file descriptors.
> > > > > > > >    */
> > > > > > > > -VhostShadowVirtqueue *vhost_svq_new(struct vhost_dev *dev, int idx)
> > > > > > > > +VhostShadowVirtqueue *vhost_svq_new(struct vhost_dev *dev, int idx,
> > > > > > > > +                                    VhostIOVATree *iova_map)
> > > > > > > >   {
> > > > > > > >       int vq_idx = dev->vq_index + idx;
> > > > > > > >       unsigned num = virtio_queue_get_num(dev->vdev, vq_idx);
> > > > > > > > @@ -500,11 +586,13 @@ VhostShadowVirtqueue *vhost_svq_new(struct vhost_dev *dev, int idx)
> > > > > > > >       memset(svq->vring.desc, 0, driver_size);
> > > > > > > >       svq->vring.used = qemu_memalign(qemu_real_host_page_size, device_size);
> > > > > > > >       memset(svq->vring.used, 0, device_size);
> > > > > > > > +    svq->iova_map = iova_map;
> > > > > > > > +
> > > > > > > >       for (i = 0; i < num - 1; i++) {
> > > > > > > >           svq->vring.desc[i].next = cpu_to_le16(i + 1);
> > > > > > > >       }
> > > > > > > >
> > > > > > > > -    svq->ring_id_maps = g_new0(VirtQueueElement *, num);
> > > > > > > > +    svq->ring_id_maps = g_new0(SVQElement *, num);
> > > > > > > >       event_notifier_set_handler(&svq->call_notifier,
> > > > > > > >                                  vhost_svq_handle_call);
> > > > > > > >       return g_steal_pointer(&svq);
> > > > > > > > diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
> > > > > > > > index a9c680b487..f5a12fee9d 100644
> > > > > > > > --- a/hw/virtio/vhost-vdpa.c
> > > > > > > > +++ b/hw/virtio/vhost-vdpa.c
> > > > > > > > @@ -176,6 +176,18 @@ static void vhost_vdpa_listener_region_add(MemoryListener *listener,
> > > > > > > >                                            vaddr, section->readonly);
> > > > > > > >
> > > > > > > >       llsize = int128_sub(llend, int128_make64(iova));
> > > > > > > > +    if (v->shadow_vqs_enabled) {
> > > > > > > > +        VhostDMAMap mem_region = {
> > > > > > > > +            .translated_addr = vaddr,
> > > > > > > > +            .size = int128_get64(llsize) - 1,
> > > > > > > > +            .perm = IOMMU_ACCESS_FLAG(true, section->readonly),
> > > > > > > > +        };
> > > > > > > > +
> > > > > > > > +        int r = vhost_iova_tree_alloc(v->iova_map, &mem_region);
> > > > > > > > +        assert(r == VHOST_DMA_MAP_OK);
> > > > > > > > +
> > > > > > > > +        iova = mem_region.iova;
> > > > > > > > +    }
> > > > > > > >
> > > > > > > >       ret = vhost_vdpa_dma_map(v, iova, int128_get64(llsize),
> > > > > > > >                                vaddr, section->readonly);
> > > > > > > > @@ -754,6 +766,23 @@ static bool  vhost_vdpa_force_iommu(struct vhost_dev *dev)
> > > > > > > >       return true;
> > > > > > > >   }
> > > > > > > >
> > > > > > > > +static int vhost_vdpa_get_iova_range(struct vhost_dev *dev,
> > > > > > > > +                                     hwaddr *first, hwaddr *last)
> > > > > > > > +{
> > > > > > > > +    int ret;
> > > > > > > > +    struct vhost_vdpa_iova_range range;
> > > > > > > > +
> > > > > > > > +    ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_IOVA_RANGE, &range);
> > > > > > > > +    if (ret != 0) {
> > > > > > > > +        return ret;
> > > > > > > > +    }
> > > > > > > > +
> > > > > > > > +    *first = range.first;
> > > > > > > > +    *last = range.last;
> > > > > > > > +    trace_vhost_vdpa_get_iova_range(dev, *first, *last);
> > > > > > > > +    return ret;
> > > > > > > > +}
> > > > > > > > +
> > > > > > > >   /**
> > > > > > > >    * Maps QEMU vaddr memory to device in a suitable way for shadow virtqueue:
> > > > > > > >    * - It always reference qemu memory address, not guest's memory.
> > > > > > > > @@ -881,6 +910,7 @@ static bool vhost_vdpa_svq_start_vq(struct vhost_dev *dev, unsigned idx)
> > > > > > > >   static unsigned vhost_vdpa_enable_svq(struct vhost_vdpa *v, bool enable)
> > > > > > > >   {
> > > > > > > >       struct vhost_dev *hdev = v->dev;
> > > > > > > > +    hwaddr iova_first, iova_last;
> > > > > > > >       unsigned n;
> > > > > > > >       int r;
> > > > > > > >
> > > > > > > > @@ -894,7 +924,7 @@ static unsigned vhost_vdpa_enable_svq(struct vhost_vdpa *v, bool enable)
> > > > > > > >           /* Allocate resources */
> > > > > > > >           assert(v->shadow_vqs->len == 0);
> > > > > > > >           for (n = 0; n < hdev->nvqs; ++n) {
> > > > > > > > -            VhostShadowVirtqueue *svq = vhost_svq_new(hdev, n);
> > > > > > > > +            VhostShadowVirtqueue *svq = vhost_svq_new(hdev, n, v->iova_map);
> > > > > > > >               if (unlikely(!svq)) {
> > > > > > > >                   g_ptr_array_set_size(v->shadow_vqs, 0);
> > > > > > > >                   return 0;
> > > > > > > > @@ -903,6 +933,8 @@ static unsigned vhost_vdpa_enable_svq(struct vhost_vdpa *v, bool enable)
> > > > > > > >           }
> > > > > > > >       }
> > > > > > > >
> > > > > > > > +    r = vhost_vdpa_get_iova_range(hdev, &iova_first, &iova_last);
> > > > > > > > +    assert(r == 0);
> > > > > > > >       r = vhost_vdpa_vring_pause(hdev);
> > > > > > > >       assert(r == 0);
> > > > > > > >
> > > > > > > > @@ -913,6 +945,12 @@ static unsigned vhost_vdpa_enable_svq(struct vhost_vdpa *v, bool enable)
> > > > > > > >           }
> > > > > > > >       }
> > > > > > > >
> > > > > > > > +    memory_listener_unregister(&v->listener);
> > > > > > > > +    if (vhost_vdpa_dma_unmap(v, iova_first,
> > > > > > > > +                             (iova_last - iova_first) & TARGET_PAGE_MASK)) {
> > > > > > > > +        error_report("Fail to invalidate device iotlb");
> > > > > > > > +    }
> > > > > > > > +
> > > > > > > >       /* Reset device so it can be configured */
> > > > > > > >       r = vhost_vdpa_dev_start(hdev, false);
> > > > > > > >       assert(r == 0);
> > > > > > > > diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events
> > > > > > > > index 8ed19e9d0c..650e521e35 100644
> > > > > > > > --- a/hw/virtio/trace-events
> > > > > > > > +++ b/hw/virtio/trace-events
> > > > > > > > @@ -52,6 +52,7 @@ vhost_vdpa_set_vring_call(void *dev, unsigned int index, int fd) "dev: %p index:
> > > > > > > >   vhost_vdpa_get_features(void *dev, uint64_t features) "dev: %p features: 0x%"PRIx64
> > > > > > > >   vhost_vdpa_set_owner(void *dev) "dev: %p"
> > > > > > > >   vhost_vdpa_vq_get_addr(void *dev, void *vq, uint64_t desc_user_addr, uint64_t avail_user_addr, uint64_t used_user_addr) "dev: %p vq: %p desc_user_addr: 0x%"PRIx64" avail_user_addr: 0x%"PRIx64" used_user_addr: 0x%"PRIx64
> > > > > > > > +vhost_vdpa_get_iova_range(void *dev, uint64_t first, uint64_t last) "dev: %p first: 0x%"PRIx64" last: 0x%"PRIx64
> > > > > > > >
> > > > > > > >   # virtio.c
> > > > > > > >   virtqueue_alloc_element(void *elem, size_t sz, unsigned in_num, unsigned out_num) "elem %p size %zd in_num %u out_num %u"
> > > > > > >
> > > > > >
> > > >
> > >
> >
>


Re: [RFC PATCH v4 20/20] vdpa: Add custom IOTLB translations to SVQ
Posted by Jason Wang 4 years, 3 months ago
On Wed, Oct 20, 2021 at 7:57 PM Eugenio Perez Martin
<eperezma@redhat.com> wrote:
>
> On Wed, Oct 20, 2021 at 11:03 AM Jason Wang <jasowang@redhat.com> wrote:
> >
> > On Wed, Oct 20, 2021 at 2:52 PM Eugenio Perez Martin
> > <eperezma@redhat.com> wrote:
> > >
> > > On Wed, Oct 20, 2021 at 4:07 AM Jason Wang <jasowang@redhat.com> wrote:
> > > >
> > > > On Wed, Oct 20, 2021 at 10:02 AM Jason Wang <jasowang@redhat.com> wrote:
> > > > >
> > > > > On Tue, Oct 19, 2021 at 6:29 PM Eugenio Perez Martin
> > > > > <eperezma@redhat.com> wrote:
> > > > > >
> > > > > > On Tue, Oct 19, 2021 at 11:25 AM Jason Wang <jasowang@redhat.com> wrote:
> > > > > > >
> > > > > > >
> > > > > > > On 2021/10/1 3:06 PM, Eugenio Pérez wrote:
> > > > > > > > Use translations added in VhostIOVATree in SVQ.
> > > > > > > >
> > > > > > > > Now every element needs to store the previous address also, so VirtQueue
> > > > > > > > can consume the elements properly. This adds a little overhead per VQ
> > > > > > > > element, having to allocate more memory to stash them. As a possible
> > > > > > > > optimization, this allocation could be avoided if the descriptor is not
> > > > > > > > a chain but a single one, but this is left undone.
> > > > > > > >
> > > > > > > > TODO: iova range should be queried before, and add logic to fail when
> > > > > > > > GPA is outside of its range and memory listener or svq add it.
> > > > > > > >
> > > > > > > > Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
> > > > > > > > ---
> > > > > > > >   hw/virtio/vhost-shadow-virtqueue.h |   4 +-
> > > > > > > >   hw/virtio/vhost-shadow-virtqueue.c | 130 ++++++++++++++++++++++++-----
> > > > > > > >   hw/virtio/vhost-vdpa.c             |  40 ++++++++-
> > > > > > > >   hw/virtio/trace-events             |   1 +
> > > > > > > >   4 files changed, 152 insertions(+), 23 deletions(-)
> > > > > > > >
> > > > > > > > diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
> > > > > > > > index b7baa424a7..a0e6b5267a 100644
> > > > > > > > --- a/hw/virtio/vhost-shadow-virtqueue.h
> > > > > > > > +++ b/hw/virtio/vhost-shadow-virtqueue.h
> > > > > > > > @@ -11,6 +11,7 @@
> > > > > > > >   #define VHOST_SHADOW_VIRTQUEUE_H
> > > > > > > >
> > > > > > > >   #include "hw/virtio/vhost.h"
> > > > > > > > +#include "hw/virtio/vhost-iova-tree.h"
> > > > > > > >
> > > > > > > >   typedef struct VhostShadowVirtqueue VhostShadowVirtqueue;
> > > > > > > >
> > > > > > > > @@ -28,7 +29,8 @@ bool vhost_svq_start(struct vhost_dev *dev, unsigned idx,
> > > > > > > >   void vhost_svq_stop(struct vhost_dev *dev, unsigned idx,
> > > > > > > >                       VhostShadowVirtqueue *svq);
> > > > > > > >
> > > > > > > > -VhostShadowVirtqueue *vhost_svq_new(struct vhost_dev *dev, int idx);
> > > > > > > > +VhostShadowVirtqueue *vhost_svq_new(struct vhost_dev *dev, int idx,
> > > > > > > > +                                    VhostIOVATree *iova_map);
> > > > > > > >
> > > > > > > >   void vhost_svq_free(VhostShadowVirtqueue *vq);
> > > > > > > >
> > > > > > > > diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
> > > > > > > > index 2fd0bab75d..9db538547e 100644
> > > > > > > > --- a/hw/virtio/vhost-shadow-virtqueue.c
> > > > > > > > +++ b/hw/virtio/vhost-shadow-virtqueue.c
> > > > > > > > @@ -11,12 +11,19 @@
> > > > > > > >   #include "hw/virtio/vhost-shadow-virtqueue.h"
> > > > > > > >   #include "hw/virtio/vhost.h"
> > > > > > > >   #include "hw/virtio/virtio-access.h"
> > > > > > > > +#include "hw/virtio/vhost-iova-tree.h"
> > > > > > > >
> > > > > > > >   #include "standard-headers/linux/vhost_types.h"
> > > > > > > >
> > > > > > > >   #include "qemu/error-report.h"
> > > > > > > >   #include "qemu/main-loop.h"
> > > > > > > >
> > > > > > > > +typedef struct SVQElement {
> > > > > > > > +    VirtQueueElement elem;
> > > > > > > > +    void **in_sg_stash;
> > > > > > > > +    void **out_sg_stash;
> > > > > > > > +} SVQElement;
> > > > > > > > +
> > > > > > > >   /* Shadow virtqueue to relay notifications */
> > > > > > > >   typedef struct VhostShadowVirtqueue {
> > > > > > > >       /* Shadow vring */
> > > > > > > > @@ -46,8 +53,11 @@ typedef struct VhostShadowVirtqueue {
> > > > > > > >       /* Virtio device */
> > > > > > > >       VirtIODevice *vdev;
> > > > > > > >
> > > > > > > > +    /* IOVA mapping if used */
> > > > > > > > +    VhostIOVATree *iova_map;
> > > > > > > > +
> > > > > > > >       /* Map for returning guest's descriptors */
> > > > > > > > -    VirtQueueElement **ring_id_maps;
> > > > > > > > +    SVQElement **ring_id_maps;
> > > > > > > >
> > > > > > > >       /* Next head to expose to device */
> > > > > > > >       uint16_t avail_idx_shadow;
> > > > > > > > @@ -79,13 +89,6 @@ bool vhost_svq_valid_device_features(uint64_t *dev_features)
> > > > > > > >               continue;
> > > > > > > >
> > > > > > > >           case VIRTIO_F_ACCESS_PLATFORM:
> > > > > > > > -            /* SVQ needs this feature disabled. Can't continue */
> > > > > > > > -            if (*dev_features & BIT_ULL(b)) {
> > > > > > > > -                clear_bit(b, dev_features);
> > > > > > > > -                r = false;
> > > > > > > > -            }
> > > > > > > > -            break;
> > > > > > > > -
> > > > > > > >           case VIRTIO_F_VERSION_1:
> > > > > > > >               /* SVQ needs this feature, so can't continue */
> > > > > > > >               if (!(*dev_features & BIT_ULL(b))) {
> > > > > > > > @@ -126,6 +129,64 @@ static void vhost_svq_set_notification(VhostShadowVirtqueue *svq, bool enable)
> > > > > > > >       }
> > > > > > > >   }
> > > > > > > >
> > > > > > > > +static void vhost_svq_stash_addr(void ***stash, const struct iovec *iov,
> > > > > > > > +                                 size_t num)
> > > > > > > > +{
> > > > > > > > +    size_t i;
> > > > > > > > +
> > > > > > > > +    if (num == 0) {
> > > > > > > > +        return;
> > > > > > > > +    }
> > > > > > > > +
> > > > > > > > +    *stash = g_new(void *, num);
> > > > > > > > +    for (i = 0; i < num; ++i) {
> > > > > > > > +        (*stash)[i] = iov[i].iov_base;
> > > > > > > > +    }
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +static void vhost_svq_unstash_addr(void **stash, struct iovec *iov, size_t num)
> > > > > > > > +{
> > > > > > > > +    size_t i;
> > > > > > > > +
> > > > > > > > +    if (num == 0) {
> > > > > > > > +        return;
> > > > > > > > +    }
> > > > > > > > +
> > > > > > > > +    for (i = 0; i < num; ++i) {
> > > > > > > > +        iov[i].iov_base = stash[i];
> > > > > > > > +    }
> > > > > > > > +    g_free(stash);
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +static void vhost_svq_translate_addr(const VhostShadowVirtqueue *svq,
> > > > > > > > +                                     struct iovec *iovec, size_t num)
> > > > > > > > +{
> > > > > > > > +    size_t i;
> > > > > > > > +
> > > > > > > > +    for (i = 0; i < num; ++i) {
> > > > > > > > +        VhostDMAMap needle = {
> > > > > > > > +            .translated_addr = iovec[i].iov_base,
> > > > > > > > +            .size = iovec[i].iov_len,
> > > > > > > > +        };
> > > > > > > > +        size_t off;
> > > > > > > > +
> > > > > > > > +        const VhostDMAMap *map = vhost_iova_tree_find_iova(svq->iova_map,
> > > > > > > > +                                                           &needle);
> > > > > > >
> > > > > > >
> > > > > > > Is it possible that we end up with more than one map here?
> > > > > > >
> > > > > >
> > > > > > Actually it is possible, since there is no guarantee that one
> > > > > > descriptor (or indirect descriptor) maps exactly to one iov. It could
> > > > > > map to many if qemu vaddr is not contiguous but GPA + size is. This is
> > > > > > something that must be fixed for the next revision, so thanks for
> > > > > > pointing it out!
> > > > > >
> > > > > > Taking that into account, the condition that svq vring avail_idx -
> > > > > > used_idx was always less than or equal to the guest's vring
> > > > > > avail_idx - used_idx is not true anymore. Checking for that before
> > > > > > adding buffers to SVQ is the easy part, but how could we recover in
> > > > > > that case?
> > > > > >
> > > > > > I think that the easy solution is to check for more available
> > > > > > buffers unconditionally at the end of vhost_svq_handle_call, which
> > > > > > handles the SVQ used ring and is supposed to make more room for
> > > > > > available buffers. So vhost_handle_guest_kick would not check
> > > > > > whether the eventfd is set anymore.
> > > > > >
> > > > > > Would that make sense?
> > > > >
> > > > > Yes, I think it should work.
> > > >
> > > > Btw, I wonder how to handle indirect descriptors. SVQ doesn't use
> > > > indirect descriptors for now, but it looks like a must; otherwise we
> > > > may end up with SVQ full before the guest VQ.
> > > >
> > >
> > > We can get to that situation without indirect descriptors too, if a
> > > single descriptor maps to more than one sg buffer. The next revision
> > > is going to control that too.
> > >
> > > > It looks to me that an easy way is to always use indirect descriptors if #sg >= 2?
> > > >
> > >
> > > I will use that, but that does not solve the case where a descriptor
> > > maps to > 1 different buffers in qemu vaddr.
> >
> > Right, so we need to deal with the case when SVQ is out of space.
> >
> >
> > > So I think that some
> > > check after marking descriptors as used is a must somehow.
> >
> > I thought it should be before processing the available buffer?
>
> Yes, I meant after that. Somehow, because I include checking the
> number of sg buffers as "processing". :).
>
> > It's
> > the guest driver that makes sure there's sufficient space for the used
> > ring?
> >
>
> (I think we are saying the same thing with different words, but just in
> case I will develop the idea here with an example).
>
> The guest is able to check if there is enough space in the SVQ's
> vring, but not in the device's vring. As an example of this, imagine
> that a guest makes available a GPA-contiguous buffer of 64K as one
> descriptor. However, this memory is divided into 16 chunks of 4K in
> qemu's VA space. Imagine that at this moment there are only eight
> slots free in each vring, and that neither side is using indirect
> descriptors.
>
> The guest only needs 1 descriptor available to make that buffer
> available, so it will add it to the avail ring. But SVQ needs 16
> chained descriptors, so the buffer is not going to reach the device
> until the device marks at least 8 more descriptors as used. SVQ checked
> for the amount of available room, as you said, but it cannot forward
> the available one.
>
> Since the guest already sent the kick when it made the descriptor
> available, we need another mechanism to know when we have all the
> needed free slots in the SVQ vring. And that's what I meant by the
> check after marking some buffers as available.
>
> I still think it is not worth it to protect the forwarding methods
> from hogging the BQL, since there must be a limit sooner or later, but
> it is something that is worth putting on the table again. But this
> requires changes for the next version for sure.
>
> I can think of more scenarios, like the guest making available an
> indirect descriptor of vq size that needs to be split into even more
> sgs. Qemu already does not support more than 1024 sg buffers in a
> VirtQueue, but a driver (such as SVQ) must *not* create an indirect
> descriptor chain longer than the Queue Size. Should we always increase
> the vq size to 1024? I think these are highly unlikely, but again these
> concerns must at least be commented here.
>
> Does it make sense?

Makes a lot of sense. It's better to make the code robust without
assuming anything about either the host or the guest configuration.

Thanks
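
One concrete check in that direction, tied to the iova range TODO in the
commit message, might look like this (sketch only; iova_first and
iova_last are assumed to be cached in struct vhost_vdpa from
vhost_vdpa_get_iova_range(), which the series does not do yet):

static bool vhost_vdpa_iova_in_range(const struct vhost_vdpa *v,
                                     hwaddr iova, hwaddr size)
{
    /* Refuse anything outside what VHOST_VDPA_GET_IOVA_RANGE reported */
    return size != 0 &&
           iova >= v->iova_first && iova <= v->iova_last &&
           size - 1 <= v->iova_last - iova;
}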

>
> Thanks!
>
> > Thanks
> >
> > >
> > >
> > > > Thanks
> > > >
> > > > >
> > > > > Thanks
> > > > >
> > > > > >
> > > > > > Thanks!
> > > > > >
> > > > > > >
> > > > > > > > +        /*
> > > > > > > > +         * Map cannot be NULL since iova map contains all guest space and
> > > > > > > > +         * qemu already has a physical address mapped
> > > > > > > > +         */
> > > > > > > > +        assert(map);
> > > > > > > > +
> > > > > > > > +        /*
> > > > > > > > +         * Map->iova chunk size is ignored. What to do if descriptor
> > > > > > > > +         * (addr, size) does not fit is delegated to the device.
> > > > > > > > +         */
> > > > > > > > +        off = needle.translated_addr - map->translated_addr;
> > > > > > > > +        iovec[i].iov_base = (void *)(map->iova + off);
> > > > > > > > +    }
> > > > > > > > +}
> > > > > > > > +
> > > > > > > >   static void vhost_vring_write_descs(VhostShadowVirtqueue *svq,
> > > > > > > >                                       const struct iovec *iovec,
> > > > > > > >                                       size_t num, bool more_descs, bool write)
> > > > > > > > @@ -156,8 +217,9 @@ static void vhost_vring_write_descs(VhostShadowVirtqueue *svq,
> > > > > > > >   }
> > > > > > > >
> > > > > > > >   static unsigned vhost_svq_add_split(VhostShadowVirtqueue *svq,
> > > > > > > > -                                    VirtQueueElement *elem)
> > > > > > > > +                                    SVQElement *svq_elem)
> > > > > > > >   {
> > > > > > > > +    VirtQueueElement *elem = &svq_elem->elem;
> > > > > > > >       int head;
> > > > > > > >       unsigned avail_idx;
> > > > > > > >       vring_avail_t *avail = svq->vring.avail;
> > > > > > > > @@ -167,6 +229,12 @@ static unsigned vhost_svq_add_split(VhostShadowVirtqueue *svq,
> > > > > > > >       /* We need some descriptors here */
> > > > > > > >       assert(elem->out_num || elem->in_num);
> > > > > > > >
> > > > > > > > +    vhost_svq_stash_addr(&svq_elem->in_sg_stash, elem->in_sg, elem->in_num);
> > > > > > > > +    vhost_svq_stash_addr(&svq_elem->out_sg_stash, elem->out_sg, elem->out_num);
> > > > > > >
> > > > > > >
> > > > > > > I wonder if we can avoid the stash/unstash trick by keeping
> > > > > > > dedicated sgs in svq_elem, instead of reusing the elem.
> > > > > > >
> > > > > >
> > > > > > Actually yes, it would be way simpler to use a new sgs array in
> > > > > > svq_elem. I will change that.
> > > > > >
> > > > > > Thanks!
> > > > > >
> > > > > > > Thanks
> > > > > > >
> > > > > > >
> > > > > > > > +
> > > > > > > > +    vhost_svq_translate_addr(svq, elem->in_sg, elem->in_num);
> > > > > > > > +    vhost_svq_translate_addr(svq, elem->out_sg, elem->out_num);
> > > > > > > > +
> > > > > > > >       vhost_vring_write_descs(svq, elem->out_sg, elem->out_num,
> > > > > > > >                               elem->in_num > 0, false);
> > > > > > > >       vhost_vring_write_descs(svq, elem->in_sg, elem->in_num, false, true);
> > > > > > > > @@ -187,7 +255,7 @@ static unsigned vhost_svq_add_split(VhostShadowVirtqueue *svq,
> > > > > > > >
> > > > > > > >   }
> > > > > > > >
> > > > > > > > -static void vhost_svq_add(VhostShadowVirtqueue *svq, VirtQueueElement *elem)
> > > > > > > > +static void vhost_svq_add(VhostShadowVirtqueue *svq, SVQElement *elem)
> > > > > > > >   {
> > > > > > > >       unsigned qemu_head = vhost_svq_add_split(svq, elem);
> > > > > > > >
> > > > > > > > @@ -221,7 +289,7 @@ static void vhost_handle_guest_kick(EventNotifier *n)
> > > > > > > >           }
> > > > > > > >
> > > > > > > >           while (true) {
> > > > > > > > -            VirtQueueElement *elem = virtqueue_pop(svq->vq, sizeof(*elem));
> > > > > > > > +            SVQElement *elem = virtqueue_pop(svq->vq, sizeof(*elem));
> > > > > > > >               if (!elem) {
> > > > > > > >                   break;
> > > > > > > >               }
> > > > > > > > @@ -247,7 +315,7 @@ static bool vhost_svq_more_used(VhostShadowVirtqueue *svq)
> > > > > > > >       return svq->used_idx != svq->shadow_used_idx;
> > > > > > > >   }
> > > > > > > >
> > > > > > > > -static VirtQueueElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq)
> > > > > > > > +static SVQElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq)
> > > > > > > >   {
> > > > > > > >       vring_desc_t *descs = svq->vring.desc;
> > > > > > > >       const vring_used_t *used = svq->vring.used;
> > > > > > > > @@ -279,7 +347,7 @@ static VirtQueueElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq)
> > > > > > > >       descs[used_elem.id].next = svq->free_head;
> > > > > > > >       svq->free_head = used_elem.id;
> > > > > > > >
> > > > > > > > -    svq->ring_id_maps[used_elem.id]->len = used_elem.len;
> > > > > > > > +    svq->ring_id_maps[used_elem.id]->elem.len = used_elem.len;
> > > > > > > >       return g_steal_pointer(&svq->ring_id_maps[used_elem.id]);
> > > > > > > >   }
> > > > > > > >
> > > > > > > > @@ -296,12 +364,19 @@ static void vhost_svq_handle_call_no_test(EventNotifier *n)
> > > > > > > >
> > > > > > > >           vhost_svq_set_notification(svq, false);
> > > > > > > >           while (true) {
> > > > > > > > -            g_autofree VirtQueueElement *elem = vhost_svq_get_buf(svq);
> > > > > > > > -            if (!elem) {
> > > > > > > > +            g_autofree SVQElement *svq_elem = vhost_svq_get_buf(svq);
> > > > > > > > +            VirtQueueElement *elem;
> > > > > > > > +            if (!svq_elem) {
> > > > > > > >                   break;
> > > > > > > >               }
> > > > > > > >
> > > > > > > >               assert(i < svq->vring.num);
> > > > > > > > +            elem = &svq_elem->elem;
> > > > > > > > +
> > > > > > > > +            vhost_svq_unstash_addr(svq_elem->in_sg_stash, elem->in_sg,
> > > > > > > > +                                   elem->in_num);
> > > > > > > > +            vhost_svq_unstash_addr(svq_elem->out_sg_stash, elem->out_sg,
> > > > > > > > +                                   elem->out_num);
> > > > > > > >               virtqueue_fill(vq, elem, elem->len, i++);
> > > > > > > >           }
> > > > > > > >
> > > > > > > > @@ -451,14 +526,24 @@ void vhost_svq_stop(struct vhost_dev *dev, unsigned idx,
> > > > > > > >       event_notifier_set_handler(&svq->host_notifier, NULL);
> > > > > > > >
> > > > > > > >       for (i = 0; i < svq->vring.num; ++i) {
> > > > > > > > -        g_autofree VirtQueueElement *elem = svq->ring_id_maps[i];
> > > > > > > > +        g_autofree SVQElement *svq_elem = svq->ring_id_maps[i];
> > > > > > > > +        VirtQueueElement *elem;
> > > > > > > > +
> > > > > > > > +        if (!svq_elem) {
> > > > > > > > +            continue;
> > > > > > > > +        }
> > > > > > > > +
> > > > > > > > +        elem = &svq_elem->elem;
> > > > > > > > +        vhost_svq_unstash_addr(svq_elem->in_sg_stash, elem->in_sg,
> > > > > > > > +                               elem->in_num);
> > > > > > > > +        vhost_svq_unstash_addr(svq_elem->out_sg_stash, elem->out_sg,
> > > > > > > > +                               elem->out_num);
> > > > > > > > +
> > > > > > > >           /*
> > > > > > > >            * Although the doc says we must unpop in order, it's ok to unpop
> > > > > > > >            * everything.
> > > > > > > >            */
> > > > > > > > -        if (elem) {
> > > > > > > > -            virtqueue_unpop(svq->vq, elem, elem->len);
> > > > > > > > -        }
> > > > > > > > +        virtqueue_unpop(svq->vq, elem, elem->len);
> > > > > > > >       }
> > > > > > > >   }
> > > > > > > >
> > > > > > > > @@ -466,7 +551,8 @@ void vhost_svq_stop(struct vhost_dev *dev, unsigned idx,
> > > > > > > >    * Creates vhost shadow virtqueue, and instruct vhost device to use the shadow
> > > > > > > >    * methods and file descriptors.
> > > > > > > >    */
> > > > > > > > -VhostShadowVirtqueue *vhost_svq_new(struct vhost_dev *dev, int idx)
> > > > > > > > +VhostShadowVirtqueue *vhost_svq_new(struct vhost_dev *dev, int idx,
> > > > > > > > +                                    VhostIOVATree *iova_map)
> > > > > > > >   {
> > > > > > > >       int vq_idx = dev->vq_index + idx;
> > > > > > > >       unsigned num = virtio_queue_get_num(dev->vdev, vq_idx);
> > > > > > > > @@ -500,11 +586,13 @@ VhostShadowVirtqueue *vhost_svq_new(struct vhost_dev *dev, int idx)
> > > > > > > >       memset(svq->vring.desc, 0, driver_size);
> > > > > > > >       svq->vring.used = qemu_memalign(qemu_real_host_page_size, device_size);
> > > > > > > >       memset(svq->vring.used, 0, device_size);
> > > > > > > > +    svq->iova_map = iova_map;
> > > > > > > > +
> > > > > > > >       for (i = 0; i < num - 1; i++) {
> > > > > > > >           svq->vring.desc[i].next = cpu_to_le16(i + 1);
> > > > > > > >       }
> > > > > > > >
> > > > > > > > -    svq->ring_id_maps = g_new0(VirtQueueElement *, num);
> > > > > > > > +    svq->ring_id_maps = g_new0(SVQElement *, num);
> > > > > > > >       event_notifier_set_handler(&svq->call_notifier,
> > > > > > > >                                  vhost_svq_handle_call);
> > > > > > > >       return g_steal_pointer(&svq);
> > > > > > > > diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
> > > > > > > > index a9c680b487..f5a12fee9d 100644
> > > > > > > > --- a/hw/virtio/vhost-vdpa.c
> > > > > > > > +++ b/hw/virtio/vhost-vdpa.c
> > > > > > > > @@ -176,6 +176,18 @@ static void vhost_vdpa_listener_region_add(MemoryListener *listener,
> > > > > > > >                                            vaddr, section->readonly);
> > > > > > > >
> > > > > > > >       llsize = int128_sub(llend, int128_make64(iova));
> > > > > > > > +    if (v->shadow_vqs_enabled) {
> > > > > > > > +        VhostDMAMap mem_region = {
> > > > > > > > +            .translated_addr = vaddr,
> > > > > > > > +            .size = int128_get64(llsize) - 1,
> > > > > > > > +            .perm = IOMMU_ACCESS_FLAG(true, section->readonly),
> > > > > > > > +        };
> > > > > > > > +
> > > > > > > > +        int r = vhost_iova_tree_alloc(v->iova_map, &mem_region);
> > > > > > > > +        assert(r == VHOST_DMA_MAP_OK);
> > > > > > > > +
> > > > > > > > +        iova = mem_region.iova;
> > > > > > > > +    }
> > > > > > > >
> > > > > > > >       ret = vhost_vdpa_dma_map(v, iova, int128_get64(llsize),
> > > > > > > >                                vaddr, section->readonly);
> > > > > > > > @@ -754,6 +766,23 @@ static bool  vhost_vdpa_force_iommu(struct vhost_dev *dev)
> > > > > > > >       return true;
> > > > > > > >   }
> > > > > > > >
> > > > > > > > +static int vhost_vdpa_get_iova_range(struct vhost_dev *dev,
> > > > > > > > +                                     hwaddr *first, hwaddr *last)
> > > > > > > > +{
> > > > > > > > +    int ret;
> > > > > > > > +    struct vhost_vdpa_iova_range range;
> > > > > > > > +
> > > > > > > > +    ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_IOVA_RANGE, &range);
> > > > > > > > +    if (ret != 0) {
> > > > > > > > +        return ret;
> > > > > > > > +    }
> > > > > > > > +
> > > > > > > > +    *first = range.first;
> > > > > > > > +    *last = range.last;
> > > > > > > > +    trace_vhost_vdpa_get_iova_range(dev, *first, *last);
> > > > > > > > +    return ret;
> > > > > > > > +}
> > > > > > > > +
> > > > > > > >   /**
> > > > > > > >    * Maps QEMU vaddr memory to device in a suitable way for shadow virtqueue:
> > > > > > > >    * - It always reference qemu memory address, not guest's memory.
> > > > > > > > @@ -881,6 +910,7 @@ static bool vhost_vdpa_svq_start_vq(struct vhost_dev *dev, unsigned idx)
> > > > > > > >   static unsigned vhost_vdpa_enable_svq(struct vhost_vdpa *v, bool enable)
> > > > > > > >   {
> > > > > > > >       struct vhost_dev *hdev = v->dev;
> > > > > > > > +    hwaddr iova_first, iova_last;
> > > > > > > >       unsigned n;
> > > > > > > >       int r;
> > > > > > > >
> > > > > > > > @@ -894,7 +924,7 @@ static unsigned vhost_vdpa_enable_svq(struct vhost_vdpa *v, bool enable)
> > > > > > > >           /* Allocate resources */
> > > > > > > >           assert(v->shadow_vqs->len == 0);
> > > > > > > >           for (n = 0; n < hdev->nvqs; ++n) {
> > > > > > > > -            VhostShadowVirtqueue *svq = vhost_svq_new(hdev, n);
> > > > > > > > +            VhostShadowVirtqueue *svq = vhost_svq_new(hdev, n, v->iova_map);
> > > > > > > >               if (unlikely(!svq)) {
> > > > > > > >                   g_ptr_array_set_size(v->shadow_vqs, 0);
> > > > > > > >                   return 0;
> > > > > > > > @@ -903,6 +933,8 @@ static unsigned vhost_vdpa_enable_svq(struct vhost_vdpa *v, bool enable)
> > > > > > > >           }
> > > > > > > >       }
> > > > > > > >
> > > > > > > > +    r = vhost_vdpa_get_iova_range(hdev, &iova_first, &iova_last);
> > > > > > > > +    assert(r == 0);
> > > > > > > >       r = vhost_vdpa_vring_pause(hdev);
> > > > > > > >       assert(r == 0);
> > > > > > > >
> > > > > > > > @@ -913,6 +945,12 @@ static unsigned vhost_vdpa_enable_svq(struct vhost_vdpa *v, bool enable)
> > > > > > > >           }
> > > > > > > >       }
> > > > > > > >
> > > > > > > > +    memory_listener_unregister(&v->listener);
> > > > > > > > +    if (vhost_vdpa_dma_unmap(v, iova_first,
> > > > > > > > +                             (iova_last - iova_first) & TARGET_PAGE_MASK)) {
> > > > > > > > +        error_report("Fail to invalidate device iotlb");
> > > > > > > > +    }
> > > > > > > > +
> > > > > > > >       /* Reset device so it can be configured */
> > > > > > > >       r = vhost_vdpa_dev_start(hdev, false);
> > > > > > > >       assert(r == 0);
> > > > > > > > diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events
> > > > > > > > index 8ed19e9d0c..650e521e35 100644
> > > > > > > > --- a/hw/virtio/trace-events
> > > > > > > > +++ b/hw/virtio/trace-events
> > > > > > > > @@ -52,6 +52,7 @@ vhost_vdpa_set_vring_call(void *dev, unsigned int index, int fd) "dev: %p index:
> > > > > > > >   vhost_vdpa_get_features(void *dev, uint64_t features) "dev: %p features: 0x%"PRIx64
> > > > > > > >   vhost_vdpa_set_owner(void *dev) "dev: %p"
> > > > > > > >   vhost_vdpa_vq_get_addr(void *dev, void *vq, uint64_t desc_user_addr, uint64_t avail_user_addr, uint64_t used_user_addr) "dev: %p vq: %p desc_user_addr: 0x%"PRIx64" avail_user_addr: 0x%"PRIx64" used_user_addr: 0x%"PRIx64
> > > > > > > > +vhost_vdpa_get_iova_range(void *dev, uint64_t first, uint64_t last) "dev: %p first: 0x%"PRIx64" last: 0x%"PRIx64
> > > > > > > >
> > > > > > > >   # virtio.c
> > > > > > > >   virtqueue_alloc_element(void *elem, size_t sz, unsigned in_num, unsigned out_num) "elem %p size %zd in_num %u out_num %u"
> > > > > > >
> > > > > >
> > > >
> > >
> >
>