1 | The following changes since commit d48125de38f48a61d6423ef6a01156d6dff9ee2c: | 1 | The following changes since commit 352998df1c53b366413690d95b35f76d0721ebed: |
---|---|---|---|
2 | 2 | ||
3 | Merge tag 'kraxel-20220719-pull-request' of https://gitlab.com/kraxel/qemu into staging (2022-07-19 17:40:36 +0100) | 3 | Merge tag 'i2c-20220314' of https://github.com/philmd/qemu into staging (2022-03-14 14:39:33 +0000) |
4 | 4 | ||
5 | are available in the git repository at: | 5 | are available in the git repository at: |
6 | 6 | ||
7 | https://github.com/jasowang/qemu.git tags/net-pull-request | 7 | https://github.com/jasowang/qemu.git tags/net-pull-request |
8 | 8 | ||
9 | for you to fetch changes up to 8bdab83b34efb0b598be4e5b98e4f466ca5f2f80: | 9 | for you to fetch changes up to 12a195fa343aae2ead1301ce04727bd0ae25eb15: |
10 | 10 | ||
11 | net/colo.c: fix segmentation fault when packet is not parsed correctly (2022-07-20 16:58:08 +0800) | 11 | vdpa: Expose VHOST_F_LOG_ALL on SVQ (2022-03-15 13:57:44 +0800) |
12 | 12 | ||
13 | ---------------------------------------------------------------- | 13 | ---------------------------------------------------------------- |
14 | 14 | ||
15 | Changes since V1: | 15 | Changes since V2: |
16 | - Fix build errors of vhost-vdpa when virtio-net is not set | 16 | - fix 32-bit build errors |
17 | 17 | ||
18 | ---------------------------------------------------------------- | 18 | ---------------------------------------------------------------- |
19 | Eugenio Pérez (21): | 19 | Eugenio Pérez (14): |
20 | vhost: move descriptor translation to vhost_svq_vring_write_descs | 20 | vhost: Add VhostShadowVirtqueue |
21 | virtio-net: Expose MAC_TABLE_ENTRIES | 21 | vhost: Add Shadow VirtQueue kick forwarding capabilities |
22 | virtio-net: Expose ctrl virtqueue logic | 22 | vhost: Add Shadow VirtQueue call forwarding capabilities |
23 | vdpa: Avoid compiler to squash reads to used idx | 23 | vhost: Add vhost_svq_valid_features to shadow vq |
24 | vhost: Reorder vhost_svq_kick | 24 | virtio: Add vhost_svq_get_vring_addr |
25 | vhost: Move vhost_svq_kick call to vhost_svq_add | 25 | vdpa: adapt vhost_ops callbacks to svq |
26 | vhost: Check for queue full at vhost_svq_add | 26 | vhost: Shadow virtqueue buffers forwarding |
27 | vhost: Decouple vhost_svq_add from VirtQueueElement | 27 | util: Add iova_tree_alloc_map |
28 | vhost: Add SVQDescState | 28 | util: add iova_tree_find_iova |
29 | vhost: Track number of descs in SVQDescState | 29 | vhost: Add VhostIOVATree |
30 | vhost: add vhost_svq_push_elem | 30 | vdpa: Add custom IOTLB translations to SVQ |
31 | vhost: Expose vhost_svq_add | 31 | vdpa: Adapt vhost_vdpa_get_vring_base to SVQ |
32 | vhost: add vhost_svq_poll | 32 | vdpa: Never set log_base addr if SVQ is enabled |
33 | vhost: Add svq avail_handler callback | 33 | vdpa: Expose VHOST_F_LOG_ALL on SVQ |
34 | vdpa: Export vhost_vdpa_dma_map and unmap calls | ||
35 | vhost-net-vdpa: add stubs for when no virtio-net device is present | ||
36 | vdpa: manual forward CVQ buffers | ||
37 | vdpa: Buffer CVQ support on shadow virtqueue | ||
38 | vdpa: Extract get features part from vhost_vdpa_get_max_queue_pairs | ||
39 | vdpa: Add device migration blocker | ||
40 | vdpa: Add x-svq to NetdevVhostVDPAOptions | ||
41 | 34 | ||
42 | Zhang Chen (4): | 35 | Jason Wang (1): |
43 | softmmu/runstate.c: add RunStateTransition support from COLO to PRELAUNCH | 36 | virtio-net: fix map leaking on error during receive |
44 | net/colo: Fix a "double free" crash to clear the conn_list | ||
45 | net/colo.c: No need to track conn_list for filter-rewriter | ||
46 | net/colo.c: fix segmentation fault when packet is not parsed correctly | ||
47 | 37 | ||
48 | hw/net/virtio-net.c | 85 +++++---- | 38 | hw/net/virtio-net.c | 1 + |
49 | hw/virtio/vhost-shadow-virtqueue.c | 210 +++++++++++++++------- | 39 | hw/virtio/meson.build | 2 +- |
50 | hw/virtio/vhost-shadow-virtqueue.h | 52 +++++- | 40 | hw/virtio/vhost-iova-tree.c | 110 +++++++ |
51 | hw/virtio/vhost-vdpa.c | 26 ++- | 41 | hw/virtio/vhost-iova-tree.h | 27 ++ |
42 | hw/virtio/vhost-shadow-virtqueue.c | 636 +++++++++++++++++++++++++++++++++++++ | ||
43 | hw/virtio/vhost-shadow-virtqueue.h | 87 +++++ | ||
44 | hw/virtio/vhost-vdpa.c | 522 +++++++++++++++++++++++++++++- | ||
52 | include/hw/virtio/vhost-vdpa.h | 8 + | 45 | include/hw/virtio/vhost-vdpa.h | 8 + |
53 | include/hw/virtio/virtio-net.h | 7 + | 46 | include/qemu/iova-tree.h | 38 ++- |
54 | net/colo-compare.c | 2 +- | 47 | util/iova-tree.c | 170 ++++++++++ |
55 | net/colo.c | 11 +- | 48 | 10 files changed, 1584 insertions(+), 17 deletions(-) |
56 | net/filter-rewriter.c | 2 +- | 49 | create mode 100644 hw/virtio/vhost-iova-tree.c |
57 | net/meson.build | 3 +- | 50 | create mode 100644 hw/virtio/vhost-iova-tree.h |
58 | net/trace-events | 1 + | 51 | create mode 100644 hw/virtio/vhost-shadow-virtqueue.c |
59 | net/vhost-vdpa-stub.c | 21 +++ | 52 | create mode 100644 hw/virtio/vhost-shadow-virtqueue.h |
60 | net/vhost-vdpa.c | 357 +++++++++++++++++++++++++++++++++++-- | ||
61 | qapi/net.json | 9 +- | ||
62 | softmmu/runstate.c | 1 + | ||
63 | 15 files changed, 671 insertions(+), 124 deletions(-) | ||
64 | create mode 100644 net/vhost-vdpa-stub.c | ||
65 | 53 | ||
66 | 54 | ||
Deleted patch | |||
---|---|---|---|
1 | From: Eugenio Pérez <eperezma@redhat.com> | ||
2 | 1 | ||
3 | It's done for both in and out descriptors so it's better placed here. | ||
4 | |||
5 | Acked-by: Jason Wang <jasowang@redhat.com> | ||
6 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> | ||
7 | Reviewed-by: Michael S. Tsirkin <mst@redhat.com> | ||
8 | Signed-off-by: Jason Wang <jasowang@redhat.com> | ||
9 | --- | ||
10 | hw/virtio/vhost-shadow-virtqueue.c | 38 +++++++++++++++++++++++++++----------- | ||
11 | 1 file changed, 27 insertions(+), 11 deletions(-) | ||
12 | |||
13 | diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c | ||
14 | index XXXXXXX..XXXXXXX 100644 | ||
15 | --- a/hw/virtio/vhost-shadow-virtqueue.c | ||
16 | +++ b/hw/virtio/vhost-shadow-virtqueue.c | ||
17 | @@ -XXX,XX +XXX,XX @@ static bool vhost_svq_translate_addr(const VhostShadowVirtqueue *svq, | ||
18 | return true; | ||
19 | } | ||
20 | |||
21 | -static void vhost_vring_write_descs(VhostShadowVirtqueue *svq, hwaddr *sg, | ||
22 | - const struct iovec *iovec, size_t num, | ||
23 | - bool more_descs, bool write) | ||
24 | +/** | ||
25 | + * Write descriptors to SVQ vring | ||
26 | + * | ||
27 | + * @svq: The shadow virtqueue | ||
28 | + * @sg: Cache for hwaddr | ||
29 | + * @iovec: The iovec from the guest | ||
30 | + * @num: iovec length | ||
31 | + * @more_descs: True if more descriptors come in the chain | ||
32 | + * @write: True if they are writeable descriptors | ||
33 | + * | ||
34 | + * Return true if success, false otherwise and print error. | ||
35 | + */ | ||
36 | +static bool vhost_svq_vring_write_descs(VhostShadowVirtqueue *svq, hwaddr *sg, | ||
37 | + const struct iovec *iovec, size_t num, | ||
38 | + bool more_descs, bool write) | ||
39 | { | ||
40 | uint16_t i = svq->free_head, last = svq->free_head; | ||
41 | unsigned n; | ||
42 | uint16_t flags = write ? cpu_to_le16(VRING_DESC_F_WRITE) : 0; | ||
43 | vring_desc_t *descs = svq->vring.desc; | ||
44 | + bool ok; | ||
45 | |||
46 | if (num == 0) { | ||
47 | - return; | ||
48 | + return true; | ||
49 | + } | ||
50 | + | ||
51 | + ok = vhost_svq_translate_addr(svq, sg, iovec, num); | ||
52 | + if (unlikely(!ok)) { | ||
53 | + return false; | ||
54 | } | ||
55 | |||
56 | for (n = 0; n < num; n++) { | ||
57 | @@ -XXX,XX +XXX,XX @@ static void vhost_vring_write_descs(VhostShadowVirtqueue *svq, hwaddr *sg, | ||
58 | } | ||
59 | |||
60 | svq->free_head = le16_to_cpu(svq->desc_next[last]); | ||
61 | + return true; | ||
62 | } | ||
63 | |||
64 | static bool vhost_svq_add_split(VhostShadowVirtqueue *svq, | ||
65 | @@ -XXX,XX +XXX,XX @@ static bool vhost_svq_add_split(VhostShadowVirtqueue *svq, | ||
66 | return false; | ||
67 | } | ||
68 | |||
69 | - ok = vhost_svq_translate_addr(svq, sgs, elem->out_sg, elem->out_num); | ||
70 | + ok = vhost_svq_vring_write_descs(svq, sgs, elem->out_sg, elem->out_num, | ||
71 | + elem->in_num > 0, false); | ||
72 | if (unlikely(!ok)) { | ||
73 | return false; | ||
74 | } | ||
75 | - vhost_vring_write_descs(svq, sgs, elem->out_sg, elem->out_num, | ||
76 | - elem->in_num > 0, false); | ||
77 | - | ||
78 | |||
79 | - ok = vhost_svq_translate_addr(svq, sgs, elem->in_sg, elem->in_num); | ||
80 | + ok = vhost_svq_vring_write_descs(svq, sgs, elem->in_sg, elem->in_num, false, | ||
81 | + true); | ||
82 | if (unlikely(!ok)) { | ||
83 | return false; | ||
84 | } | ||
85 | |||
86 | - vhost_vring_write_descs(svq, sgs, elem->in_sg, elem->in_num, false, true); | ||
87 | - | ||
88 | /* | ||
89 | * Put the entry in the available array (but don't update avail->idx until | ||
90 | * they do sync). | ||
91 | -- | ||
92 | 2.7.4 | ||
93 | |||
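For readers new to split virtqueues, the more_descs/write parameters documented
above map directly onto the descriptor flags from the virtio spec. A minimal
standalone sketch of the two-part chain the function builds (not QEMU code;
byte-order conversions such as cpu_to_le16() are omitted and the
addresses/lengths are placeholders):

    #include <stdint.h>
    #include <linux/virtio_ring.h>

    static void write_two_part_chain(struct vring_desc *descs,
                                     uint64_t out_addr, uint32_t out_len,
                                     uint64_t in_addr, uint32_t in_len)
    {
        /* request buffer: device-readable, more descriptors follow */
        descs[0].addr  = out_addr;
        descs[0].len   = out_len;
        descs[0].flags = VRING_DESC_F_NEXT;   /* the more_descs == true case */
        descs[0].next  = 1;

        /* response buffer: device-writable, last in the chain */
        descs[1].addr  = in_addr;
        descs[1].len   = in_len;
        descs[1].flags = VRING_DESC_F_WRITE;  /* the write == true case */
    }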
1 | From: Eugenio Pérez <eperezma@redhat.com> | 1 | Commit bedd7e93d0196 ("virtio-net: fix use after unmap/free for sg") |
---|---|---|---|
2 | tries to fix the use-after-free of the sg by caching the virtqueue
3 | elements in an array and unmapping them at once after receiving the
4 | packets, but it forgot to unmap the cached elements on error, which
5 | leads to leaked mappings and other unexpected results.
2 | 6 | ||
3 | vhost-vdpa control virtqueue needs to know the maximum entries supported | 7 | Fixing this by detaching the cached elements on error. This addresses |
4 | by the virtio-net device, so we know if it is possible to apply the | 8 | CVE-2022-26353. |
5 | filter. | ||
6 | 9 | ||
7 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> | 10 | Reported-by: Victor Tom <vv474172261@gmail.com> |
11 | Cc: qemu-stable@nongnu.org | ||
12 | Fixes: CVE-2022-26353 | ||
13 | Fixes: bedd7e93d0196 ("virtio-net: fix use after unmap/free for sg") | ||
8 | Reviewed-by: Michael S. Tsirkin <mst@redhat.com> | 14 | Reviewed-by: Michael S. Tsirkin <mst@redhat.com> |
9 | Signed-off-by: Jason Wang <jasowang@redhat.com> | 15 | Signed-off-by: Jason Wang <jasowang@redhat.com> |
10 | --- | 16 | --- |
11 | hw/net/virtio-net.c | 1 - | 17 | hw/net/virtio-net.c | 1 + |
12 | include/hw/virtio/virtio-net.h | 3 +++ | 18 | 1 file changed, 1 insertion(+) |
13 | 2 files changed, 3 insertions(+), 1 deletion(-) | ||
14 | 19 | ||
15 | diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c | 20 | diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c |
16 | index XXXXXXX..XXXXXXX 100644 | 21 | index XXXXXXX..XXXXXXX 100644 |
17 | --- a/hw/net/virtio-net.c | 22 | --- a/hw/net/virtio-net.c |
18 | +++ b/hw/net/virtio-net.c | 23 | +++ b/hw/net/virtio-net.c |
19 | @@ -XXX,XX +XXX,XX @@ | 24 | @@ -XXX,XX +XXX,XX @@ static ssize_t virtio_net_receive_rcu(NetClientState *nc, const uint8_t *buf, |
20 | 25 | ||
21 | #define VIRTIO_NET_VM_VERSION 11 | 26 | err: |
22 | 27 | for (j = 0; j < i; j++) { | |
23 | -#define MAC_TABLE_ENTRIES 64 | 28 | + virtqueue_detach_element(q->rx_vq, elems[j], lens[j]); |
24 | #define MAX_VLAN (1 << 12) /* Per 802.1Q definition */ | 29 | g_free(elems[j]); |
25 | 30 | } | |
26 | /* previously fixed value */ | 31 | |
27 | diff --git a/include/hw/virtio/virtio-net.h b/include/hw/virtio/virtio-net.h | ||
28 | index XXXXXXX..XXXXXXX 100644 | ||
29 | --- a/include/hw/virtio/virtio-net.h | ||
30 | +++ b/include/hw/virtio/virtio-net.h | ||
31 | @@ -XXX,XX +XXX,XX @@ OBJECT_DECLARE_SIMPLE_TYPE(VirtIONet, VIRTIO_NET) | ||
32 | * and latency. */ | ||
33 | #define TX_BURST 256 | ||
34 | |||
35 | +/* Maximum VIRTIO_NET_CTRL_MAC_TABLE_SET unicast + multicast entries. */ | ||
36 | +#define MAC_TABLE_ENTRIES 64 | ||
37 | + | ||
38 | typedef struct virtio_net_conf | ||
39 | { | ||
40 | uint32_t txtimer; | ||
41 | -- | 32 | -- |
42 | 2.7.4 | 33 | 2.7.4 |
43 | |||
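The right-hand patch's fix is a single line, but the rule behind it is worth
spelling out: every element cached in elems[] still owns DMA mappings until it
is pushed or detached, and virtqueue_detach_element() is what unmaps them. A
condensed sketch of the corrected cleanup, reusing the names from the hunk
above (QEMU-internal types, shown for illustration only):

    static void drop_cached_elems(VirtQueue *rx_vq, VirtQueueElement **elems,
                                  size_t *lens, unsigned cached)
    {
        for (unsigned j = 0; j < cached; j++) {
            /* unmap the element's sg before freeing it -- the step whose
             * absence caused CVE-2022-26353 */
            virtqueue_detach_element(rx_vq, elems[j], lens[j]);
            g_free(elems[j]);
        }
    }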
1 | From: Eugenio Pérez <eperezma@redhat.com> | 1 | From: Eugenio Pérez <eperezma@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | net/vhost-vdpa.c will need functions that are declared in | 3 | Vhost shadow virtqueue (SVQ) is an intermediate jump for virtqueue |
4 | vhost-shadow-virtqueue.c, which in turn needs functions of virtio-net.c. | 4 | notifications and buffers, allowing qemu to track them. While qemu is |
5 | forwarding the buffers and virtqueue changes, it is able to commit the | ||
6 | memory it's being dirtied, the same way regular qemu's VirtIO devices | ||
7 | do. | ||
5 | 8 | ||
6 | Copy the vhost-vdpa-stub.c code so | 9 | This commit only exposes basic SVQ allocation and free. Next patches of |
7 | only the constructor net_init_vhost_vdpa needs to be defined. | 10 | the series add functionality like notifications and buffers forwarding. |
8 | 11 | ||
9 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> | 12 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> |
13 | Acked-by: Michael S. Tsirkin <mst@redhat.com> | ||
10 | Signed-off-by: Jason Wang <jasowang@redhat.com> | 14 | Signed-off-by: Jason Wang <jasowang@redhat.com> |
11 | --- | 15 | --- |
12 | net/meson.build | 3 ++- | 16 | hw/virtio/meson.build | 2 +- |
13 | net/vhost-vdpa-stub.c | 21 +++++++++++++++++++++ | 17 | hw/virtio/vhost-shadow-virtqueue.c | 62 ++++++++++++++++++++++++++++++++++++++ |
14 | 2 files changed, 23 insertions(+), 1 deletion(-) | 18 | hw/virtio/vhost-shadow-virtqueue.h | 28 +++++++++++++++++ |
15 | create mode 100644 net/vhost-vdpa-stub.c | 19 | 3 files changed, 91 insertions(+), 1 deletion(-) |
20 | create mode 100644 hw/virtio/vhost-shadow-virtqueue.c | ||
21 | create mode 100644 hw/virtio/vhost-shadow-virtqueue.h | ||
16 | 22 | ||
17 | diff --git a/net/meson.build b/net/meson.build | 23 | diff --git a/hw/virtio/meson.build b/hw/virtio/meson.build |
18 | index XXXXXXX..XXXXXXX 100644 | 24 | index XXXXXXX..XXXXXXX 100644 |
19 | --- a/net/meson.build | 25 | --- a/hw/virtio/meson.build |
20 | +++ b/net/meson.build | 26 | +++ b/hw/virtio/meson.build |
21 | @@ -XXX,XX +XXX,XX @@ endif | 27 | @@ -XXX,XX +XXX,XX @@ softmmu_ss.add(when: 'CONFIG_ALL', if_true: files('vhost-stub.c')) |
22 | softmmu_ss.add(when: 'CONFIG_POSIX', if_true: files(tap_posix)) | 28 | |
23 | softmmu_ss.add(when: 'CONFIG_WIN32', if_true: files('tap-win32.c')) | 29 | virtio_ss = ss.source_set() |
24 | if have_vhost_net_vdpa | 30 | virtio_ss.add(files('virtio.c')) |
25 | - softmmu_ss.add(files('vhost-vdpa.c')) | 31 | -virtio_ss.add(when: 'CONFIG_VHOST', if_true: files('vhost.c', 'vhost-backend.c')) |
26 | + softmmu_ss.add(when: 'CONFIG_VIRTIO_NET', if_true: files('vhost-vdpa.c'), if_false: files('vhost-vdpa-stub.c')) | 32 | +virtio_ss.add(when: 'CONFIG_VHOST', if_true: files('vhost.c', 'vhost-backend.c', 'vhost-shadow-virtqueue.c')) |
27 | + softmmu_ss.add(when: 'CONFIG_ALL', if_true: files('vhost-vdpa-stub.c')) | 33 | virtio_ss.add(when: 'CONFIG_VHOST_USER', if_true: files('vhost-user.c')) |
28 | endif | 34 | virtio_ss.add(when: 'CONFIG_VHOST_VDPA', if_true: files('vhost-vdpa.c')) |
29 | 35 | virtio_ss.add(when: 'CONFIG_VIRTIO_BALLOON', if_true: files('virtio-balloon.c')) | |
30 | vmnet_files = files( | 36 | diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c |
31 | diff --git a/net/vhost-vdpa-stub.c b/net/vhost-vdpa-stub.c | ||
32 | new file mode 100644 | 37 | new file mode 100644 |
33 | index XXXXXXX..XXXXXXX | 38 | index XXXXXXX..XXXXXXX |
34 | --- /dev/null | 39 | --- /dev/null |
35 | +++ b/net/vhost-vdpa-stub.c | 40 | +++ b/hw/virtio/vhost-shadow-virtqueue.c |
36 | @@ -XXX,XX +XXX,XX @@ | 41 | @@ -XXX,XX +XXX,XX @@ |
37 | +/* | 42 | +/* |
38 | + * vhost-vdpa-stub.c | 43 | + * vhost shadow virtqueue |
39 | + * | 44 | + * |
40 | + * Copyright (c) 2022 Red Hat, Inc. | 45 | + * SPDX-FileCopyrightText: Red Hat, Inc. 2021 |
46 | + * SPDX-FileContributor: Author: Eugenio Pérez <eperezma@redhat.com> | ||
41 | + * | 47 | + * |
42 | + * This work is licensed under the terms of the GNU GPL, version 2 or later. | 48 | + * SPDX-License-Identifier: GPL-2.0-or-later |
43 | + * See the COPYING file in the top-level directory. | ||
44 | + * | ||
45 | + */ | 49 | + */ |
46 | + | 50 | + |
47 | +#include "qemu/osdep.h" | 51 | +#include "qemu/osdep.h" |
48 | +#include "clients.h" | 52 | +#include "hw/virtio/vhost-shadow-virtqueue.h" |
49 | +#include "net/vhost-vdpa.h" | ||
50 | +#include "qapi/error.h" | ||
51 | + | 53 | + |
52 | +int net_init_vhost_vdpa(const Netdev *netdev, const char *name, | 54 | +#include "qemu/error-report.h" |
53 | + NetClientState *peer, Error **errp) | 55 | + |
56 | +/** | ||
57 | + * Creates vhost shadow virtqueue, and instructs the vhost device to use the | ||
58 | + * shadow methods and file descriptors. | ||
59 | + * | ||
60 | + * Returns the new virtqueue or NULL. | ||
61 | + * | ||
62 | + * In case of error, reason is reported through error_report. | ||
63 | + */ | ||
64 | +VhostShadowVirtqueue *vhost_svq_new(void) | ||
54 | +{ | 65 | +{ |
55 | + error_setg(errp, "vhost-vdpa requires frontend driver virtio-net-*"); | 66 | + g_autofree VhostShadowVirtqueue *svq = g_new0(VhostShadowVirtqueue, 1); |
56 | + return -1; | 67 | + int r; |
68 | + | ||
69 | + r = event_notifier_init(&svq->hdev_kick, 0); | ||
70 | + if (r != 0) { | ||
71 | + error_report("Couldn't create kick event notifier: %s (%d)", | ||
72 | + g_strerror(errno), errno); | ||
73 | + goto err_init_hdev_kick; | ||
74 | + } | ||
75 | + | ||
76 | + r = event_notifier_init(&svq->hdev_call, 0); | ||
77 | + if (r != 0) { | ||
78 | + error_report("Couldn't create call event notifier: %s (%d)", | ||
79 | + g_strerror(errno), errno); | ||
80 | + goto err_init_hdev_call; | ||
81 | + } | ||
82 | + | ||
83 | + return g_steal_pointer(&svq); | ||
84 | + | ||
85 | +err_init_hdev_call: | ||
86 | + event_notifier_cleanup(&svq->hdev_kick); | ||
87 | + | ||
88 | +err_init_hdev_kick: | ||
89 | + return NULL; | ||
57 | +} | 90 | +} |
91 | + | ||
92 | +/** | ||
93 | + * Free the resources of the shadow virtqueue. | ||
94 | + * | ||
95 | + * @pvq: gpointer to SVQ so it can be used by autofree functions. | ||
96 | + */ | ||
97 | +void vhost_svq_free(gpointer pvq) | ||
98 | +{ | ||
99 | + VhostShadowVirtqueue *vq = pvq; | ||
100 | + event_notifier_cleanup(&vq->hdev_kick); | ||
101 | + event_notifier_cleanup(&vq->hdev_call); | ||
102 | + g_free(vq); | ||
103 | +} | ||
104 | diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h | ||
105 | new file mode 100644 | ||
106 | index XXXXXXX..XXXXXXX | ||
107 | --- /dev/null | ||
108 | +++ b/hw/virtio/vhost-shadow-virtqueue.h | ||
109 | @@ -XXX,XX +XXX,XX @@ | ||
110 | +/* | ||
111 | + * vhost shadow virtqueue | ||
112 | + * | ||
113 | + * SPDX-FileCopyrightText: Red Hat, Inc. 2021 | ||
114 | + * SPDX-FileContributor: Author: Eugenio Pérez <eperezma@redhat.com> | ||
115 | + * | ||
116 | + * SPDX-License-Identifier: GPL-2.0-or-later | ||
117 | + */ | ||
118 | + | ||
119 | +#ifndef VHOST_SHADOW_VIRTQUEUE_H | ||
120 | +#define VHOST_SHADOW_VIRTQUEUE_H | ||
121 | + | ||
122 | +#include "qemu/event_notifier.h" | ||
123 | + | ||
124 | +/* Shadow virtqueue to relay notifications */ | ||
125 | +typedef struct VhostShadowVirtqueue { | ||
126 | + /* Shadow kick notifier, sent to vhost */ | ||
127 | + EventNotifier hdev_kick; | ||
128 | + /* Shadow call notifier, sent to vhost */ | ||
129 | + EventNotifier hdev_call; | ||
130 | +} VhostShadowVirtqueue; | ||
131 | + | ||
132 | +VhostShadowVirtqueue *vhost_svq_new(void); | ||
133 | + | ||
134 | +void vhost_svq_free(gpointer vq); | ||
135 | +G_DEFINE_AUTOPTR_CLEANUP_FUNC(VhostShadowVirtqueue, vhost_svq_free); | ||
136 | + | ||
137 | +#endif | ||
58 | -- | 138 | -- |
59 | 2.7.4 | 139 | 2.7.4 |
60 | 140 | ||
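vhost_svq_new() in the right-hand patch relies on a GLib construction idiom:
g_autofree frees the object on any early return, and g_steal_pointer() hands
ownership to the caller only on the success path. A generic sketch of the same
pattern; MyObj and my_obj_setup() are hypothetical stand-ins:

    #include <glib.h>

    typedef struct { int fd; } MyObj;

    static int my_obj_setup(MyObj *o) { o->fd = -1; return 0; }

    static MyObj *my_obj_new(void)
    {
        g_autofree MyObj *obj = g_new0(MyObj, 1);

        if (my_obj_setup(obj) != 0) {
            return NULL;              /* obj is freed automatically here */
        }
        return g_steal_pointer(&obj); /* caller now owns obj; no free here */
    }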
1 | From: Eugenio Pérez <eperezma@redhat.com> | 1 | From: Eugenio Pérez <eperezma@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | Do a simple forwarding of CVQ buffers, the same work SVQ could do but | 3 | In this mode no buffer forwarding is performed by the SVQ: Qemu |
4 | through callbacks. No functional change intended. | 4 | will just forward the guest's kicks to the device. |
5 | |||
6 | Host memory notifier regions are left out for simplicity, and they will | ||
7 | not be addressed in this series. | ||
5 | 8 | ||
6 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> | 9 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> |
7 | Reviewed-by: Michael S. Tsirkin <mst@redhat.com> | 10 | Acked-by: Michael S. Tsirkin <mst@redhat.com> |
8 | Signed-off-by: Jason Wang <jasowang@redhat.com> | 11 | Signed-off-by: Jason Wang <jasowang@redhat.com> |
9 | --- | 12 | --- |
10 | hw/virtio/vhost-vdpa.c | 3 ++- | 13 | hw/virtio/vhost-shadow-virtqueue.c | 55 ++++++++++++++ |
11 | include/hw/virtio/vhost-vdpa.h | 3 +++ | 14 | hw/virtio/vhost-shadow-virtqueue.h | 14 ++++ |
12 | net/vhost-vdpa.c | 58 ++++++++++++++++++++++++++++++++++++++++++ | 15 | hw/virtio/vhost-vdpa.c | 144 ++++++++++++++++++++++++++++++++++++- |
13 | 3 files changed, 63 insertions(+), 1 deletion(-) | 16 | include/hw/virtio/vhost-vdpa.h | 4 ++ |
14 | 17 | 4 files changed, 215 insertions(+), 2 deletions(-) | |
18 | |||
19 | diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c | ||
20 | index XXXXXXX..XXXXXXX 100644 | ||
21 | --- a/hw/virtio/vhost-shadow-virtqueue.c | ||
22 | +++ b/hw/virtio/vhost-shadow-virtqueue.c | ||
23 | @@ -XXX,XX +XXX,XX @@ | ||
24 | #include "hw/virtio/vhost-shadow-virtqueue.h" | ||
25 | |||
26 | #include "qemu/error-report.h" | ||
27 | +#include "qemu/main-loop.h" | ||
28 | +#include "linux-headers/linux/vhost.h" | ||
29 | + | ||
30 | +/** | ||
31 | + * Forward guest notifications. | ||
32 | + * | ||
33 | + * @n: guest kick event notifier, the one that guest set to notify svq. | ||
34 | + */ | ||
35 | +static void vhost_handle_guest_kick(EventNotifier *n) | ||
36 | +{ | ||
37 | + VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue, svq_kick); | ||
38 | + event_notifier_test_and_clear(n); | ||
39 | + event_notifier_set(&svq->hdev_kick); | ||
40 | +} | ||
41 | + | ||
42 | +/** | ||
43 | + * Set a new file descriptor for the guest to kick the SVQ and notify for avail | ||
44 | + * | ||
45 | + * @svq: The svq | ||
46 | + * @svq_kick_fd: The svq kick fd | ||
47 | + * | ||
48 | + * Note that the SVQ will never close the old file descriptor. | ||
49 | + */ | ||
50 | +void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd) | ||
51 | +{ | ||
52 | + EventNotifier *svq_kick = &svq->svq_kick; | ||
53 | + bool poll_stop = VHOST_FILE_UNBIND != event_notifier_get_fd(svq_kick); | ||
54 | + bool poll_start = svq_kick_fd != VHOST_FILE_UNBIND; | ||
55 | + | ||
56 | + if (poll_stop) { | ||
57 | + event_notifier_set_handler(svq_kick, NULL); | ||
58 | + } | ||
59 | + | ||
60 | + /* | ||
61 | + * event_notifier_set_handler already checks for guest's notifications if | ||
62 | + * they arrive at the new file descriptor in the switch, so there is no | ||
63 | + * need to explicitly check for them. | ||
64 | + */ | ||
65 | + if (poll_start) { | ||
66 | + event_notifier_init_fd(svq_kick, svq_kick_fd); | ||
67 | + event_notifier_set(svq_kick); | ||
68 | + event_notifier_set_handler(svq_kick, vhost_handle_guest_kick); | ||
69 | + } | ||
70 | +} | ||
71 | + | ||
72 | +/** | ||
73 | + * Stop the shadow virtqueue operation. | ||
74 | + * @svq: Shadow Virtqueue | ||
75 | + */ | ||
76 | +void vhost_svq_stop(VhostShadowVirtqueue *svq) | ||
77 | +{ | ||
78 | + event_notifier_set_handler(&svq->svq_kick, NULL); | ||
79 | +} | ||
80 | |||
81 | /** | ||
82 | * Creates vhost shadow virtqueue, and instructs the vhost device to use the | ||
83 | @@ -XXX,XX +XXX,XX @@ VhostShadowVirtqueue *vhost_svq_new(void) | ||
84 | goto err_init_hdev_call; | ||
85 | } | ||
86 | |||
87 | + event_notifier_init_fd(&svq->svq_kick, VHOST_FILE_UNBIND); | ||
88 | return g_steal_pointer(&svq); | ||
89 | |||
90 | err_init_hdev_call: | ||
91 | @@ -XXX,XX +XXX,XX @@ err_init_hdev_kick: | ||
92 | void vhost_svq_free(gpointer pvq) | ||
93 | { | ||
94 | VhostShadowVirtqueue *vq = pvq; | ||
95 | + vhost_svq_stop(vq); | ||
96 | event_notifier_cleanup(&vq->hdev_kick); | ||
97 | event_notifier_cleanup(&vq->hdev_call); | ||
98 | g_free(vq); | ||
99 | diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h | ||
100 | index XXXXXXX..XXXXXXX 100644 | ||
101 | --- a/hw/virtio/vhost-shadow-virtqueue.h | ||
102 | +++ b/hw/virtio/vhost-shadow-virtqueue.h | ||
103 | @@ -XXX,XX +XXX,XX @@ typedef struct VhostShadowVirtqueue { | ||
104 | EventNotifier hdev_kick; | ||
105 | /* Shadow call notifier, sent to vhost */ | ||
106 | EventNotifier hdev_call; | ||
107 | + | ||
108 | + /* | ||
109 | + * Borrowed virtqueue's guest to host notifier. To borrow it in this event | ||
110 | + * notifier allows to recover the VhostShadowVirtqueue from the event loop | ||
111 | + * easily. If we use the VirtQueue's one, we don't have an easy way to | ||
112 | + * retrieve VhostShadowVirtqueue. | ||
113 | + * | ||
114 | + * So shadow virtqueue must not clean it, or we would lose VirtQueue one. | ||
115 | + */ | ||
116 | + EventNotifier svq_kick; | ||
117 | } VhostShadowVirtqueue; | ||
118 | |||
119 | +void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd); | ||
120 | + | ||
121 | +void vhost_svq_stop(VhostShadowVirtqueue *svq); | ||
122 | + | ||
123 | VhostShadowVirtqueue *vhost_svq_new(void); | ||
124 | |||
125 | void vhost_svq_free(gpointer vq); | ||
15 | diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c | 126 | diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c |
16 | index XXXXXXX..XXXXXXX 100644 | 127 | index XXXXXXX..XXXXXXX 100644 |
17 | --- a/hw/virtio/vhost-vdpa.c | 128 | --- a/hw/virtio/vhost-vdpa.c |
18 | +++ b/hw/virtio/vhost-vdpa.c | 129 | +++ b/hw/virtio/vhost-vdpa.c |
19 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v, | 130 | @@ -XXX,XX +XXX,XX @@ |
20 | for (unsigned n = 0; n < hdev->nvqs; ++n) { | 131 | #include "hw/virtio/vhost.h" |
21 | g_autoptr(VhostShadowVirtqueue) svq; | 132 | #include "hw/virtio/vhost-backend.h" |
22 | 133 | #include "hw/virtio/virtio-net.h" | |
23 | - svq = vhost_svq_new(v->iova_tree, NULL, NULL); | 134 | +#include "hw/virtio/vhost-shadow-virtqueue.h" |
24 | + svq = vhost_svq_new(v->iova_tree, v->shadow_vq_ops, | 135 | #include "hw/virtio/vhost-vdpa.h" |
25 | + v->shadow_vq_ops_opaque); | 136 | #include "exec/address-spaces.h" |
26 | if (unlikely(!svq)) { | 137 | #include "qemu/main-loop.h" |
27 | error_setg(errp, "Cannot create svq %u", n); | 138 | #include "cpu.h" |
28 | return -1; | 139 | #include "trace.h" |
140 | #include "qemu-common.h" | ||
141 | +#include "qapi/error.h" | ||
142 | |||
143 | /* | ||
144 | * Return one past the end of the end of section. Be careful with uint64_t | ||
145 | @@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_one_time_request(struct vhost_dev *dev) | ||
146 | return v->index != 0; | ||
147 | } | ||
148 | |||
149 | +static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v, | ||
150 | + Error **errp) | ||
151 | +{ | ||
152 | + g_autoptr(GPtrArray) shadow_vqs = NULL; | ||
153 | + | ||
154 | + if (!v->shadow_vqs_enabled) { | ||
155 | + return 0; | ||
156 | + } | ||
157 | + | ||
158 | + shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_svq_free); | ||
159 | + for (unsigned n = 0; n < hdev->nvqs; ++n) { | ||
160 | + g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new(); | ||
161 | + | ||
162 | + if (unlikely(!svq)) { | ||
163 | + error_setg(errp, "Cannot create svq %u", n); | ||
164 | + return -1; | ||
165 | + } | ||
166 | + g_ptr_array_add(shadow_vqs, g_steal_pointer(&svq)); | ||
167 | + } | ||
168 | + | ||
169 | + v->shadow_vqs = g_steal_pointer(&shadow_vqs); | ||
170 | + return 0; | ||
171 | +} | ||
172 | + | ||
173 | static int vhost_vdpa_init(struct vhost_dev *dev, void *opaque, Error **errp) | ||
174 | { | ||
175 | struct vhost_vdpa *v; | ||
176 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_init(struct vhost_dev *dev, void *opaque, Error **errp) | ||
177 | dev->opaque = opaque ; | ||
178 | v->listener = vhost_vdpa_memory_listener; | ||
179 | v->msg_type = VHOST_IOTLB_MSG_V2; | ||
180 | + ret = vhost_vdpa_init_svq(dev, v, errp); | ||
181 | + if (ret) { | ||
182 | + goto err; | ||
183 | + } | ||
184 | |||
185 | vhost_vdpa_get_iova_range(v); | ||
186 | |||
187 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_init(struct vhost_dev *dev, void *opaque, Error **errp) | ||
188 | VIRTIO_CONFIG_S_DRIVER); | ||
189 | |||
190 | return 0; | ||
191 | + | ||
192 | +err: | ||
193 | + ram_block_discard_disable(false); | ||
194 | + return ret; | ||
195 | } | ||
196 | |||
197 | static void vhost_vdpa_host_notifier_uninit(struct vhost_dev *dev, | ||
198 | @@ -XXX,XX +XXX,XX @@ static void vhost_vdpa_host_notifiers_uninit(struct vhost_dev *dev, int n) | ||
199 | |||
200 | static void vhost_vdpa_host_notifiers_init(struct vhost_dev *dev) | ||
201 | { | ||
202 | + struct vhost_vdpa *v = dev->opaque; | ||
203 | int i; | ||
204 | |||
205 | + if (v->shadow_vqs_enabled) { | ||
206 | + /* FIXME SVQ is not compatible with host notifiers mr */ | ||
207 | + return; | ||
208 | + } | ||
209 | + | ||
210 | for (i = dev->vq_index; i < dev->vq_index + dev->nvqs; i++) { | ||
211 | if (vhost_vdpa_host_notifier_init(dev, i)) { | ||
212 | goto err; | ||
213 | @@ -XXX,XX +XXX,XX @@ err: | ||
214 | return; | ||
215 | } | ||
216 | |||
217 | +static void vhost_vdpa_svq_cleanup(struct vhost_dev *dev) | ||
218 | +{ | ||
219 | + struct vhost_vdpa *v = dev->opaque; | ||
220 | + size_t idx; | ||
221 | + | ||
222 | + if (!v->shadow_vqs) { | ||
223 | + return; | ||
224 | + } | ||
225 | + | ||
226 | + for (idx = 0; idx < v->shadow_vqs->len; ++idx) { | ||
227 | + vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, idx)); | ||
228 | + } | ||
229 | + g_ptr_array_free(v->shadow_vqs, true); | ||
230 | +} | ||
231 | + | ||
232 | static int vhost_vdpa_cleanup(struct vhost_dev *dev) | ||
233 | { | ||
234 | struct vhost_vdpa *v; | ||
235 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_cleanup(struct vhost_dev *dev) | ||
236 | trace_vhost_vdpa_cleanup(dev, v); | ||
237 | vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs); | ||
238 | memory_listener_unregister(&v->listener); | ||
239 | + vhost_vdpa_svq_cleanup(dev); | ||
240 | |||
241 | dev->opaque = NULL; | ||
242 | ram_block_discard_disable(false); | ||
243 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_get_device_id(struct vhost_dev *dev, | ||
244 | return ret; | ||
245 | } | ||
246 | |||
247 | +static void vhost_vdpa_reset_svq(struct vhost_vdpa *v) | ||
248 | +{ | ||
249 | + if (!v->shadow_vqs_enabled) { | ||
250 | + return; | ||
251 | + } | ||
252 | + | ||
253 | + for (unsigned i = 0; i < v->shadow_vqs->len; ++i) { | ||
254 | + VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i); | ||
255 | + vhost_svq_stop(svq); | ||
256 | + } | ||
257 | +} | ||
258 | + | ||
259 | static int vhost_vdpa_reset_device(struct vhost_dev *dev) | ||
260 | { | ||
261 | + struct vhost_vdpa *v = dev->opaque; | ||
262 | int ret; | ||
263 | uint8_t status = 0; | ||
264 | |||
265 | + vhost_vdpa_reset_svq(v); | ||
266 | + | ||
267 | ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &status); | ||
268 | trace_vhost_vdpa_reset_device(dev, status); | ||
269 | return ret; | ||
270 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_get_config(struct vhost_dev *dev, uint8_t *config, | ||
271 | return ret; | ||
272 | } | ||
273 | |||
274 | +static int vhost_vdpa_set_vring_dev_kick(struct vhost_dev *dev, | ||
275 | + struct vhost_vring_file *file) | ||
276 | +{ | ||
277 | + trace_vhost_vdpa_set_vring_kick(dev, file->index, file->fd); | ||
278 | + return vhost_vdpa_call(dev, VHOST_SET_VRING_KICK, file); | ||
279 | +} | ||
280 | + | ||
281 | +/** | ||
282 | + * Set the shadow virtqueue descriptors to the device | ||
283 | + * | ||
284 | + * @dev: The vhost device model | ||
285 | + * @svq: The shadow virtqueue | ||
286 | + * @idx: The index of the virtqueue in the vhost device | ||
287 | + * @errp: Error | ||
288 | + */ | ||
289 | +static bool vhost_vdpa_svq_setup(struct vhost_dev *dev, | ||
290 | + VhostShadowVirtqueue *svq, unsigned idx, | ||
291 | + Error **errp) | ||
292 | +{ | ||
293 | + struct vhost_vring_file file = { | ||
294 | + .index = dev->vq_index + idx, | ||
295 | + }; | ||
296 | + const EventNotifier *event_notifier = &svq->hdev_kick; | ||
297 | + int r; | ||
298 | + | ||
299 | + file.fd = event_notifier_get_fd(event_notifier); | ||
300 | + r = vhost_vdpa_set_vring_dev_kick(dev, &file); | ||
301 | + if (unlikely(r != 0)) { | ||
302 | + error_setg_errno(errp, -r, "Can't set device kick fd"); | ||
303 | + } | ||
304 | + | ||
305 | + return r == 0; | ||
306 | +} | ||
307 | + | ||
308 | +static bool vhost_vdpa_svqs_start(struct vhost_dev *dev) | ||
309 | +{ | ||
310 | + struct vhost_vdpa *v = dev->opaque; | ||
311 | + Error *err = NULL; | ||
312 | + unsigned i; | ||
313 | + | ||
314 | + if (!v->shadow_vqs) { | ||
315 | + return true; | ||
316 | + } | ||
317 | + | ||
318 | + for (i = 0; i < v->shadow_vqs->len; ++i) { | ||
319 | + VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i); | ||
320 | + bool ok = vhost_vdpa_svq_setup(dev, svq, i, &err); | ||
321 | + if (unlikely(!ok)) { | ||
322 | + error_reportf_err(err, "Cannot setup SVQ %u: ", i); | ||
323 | + return false; | ||
324 | + } | ||
325 | + } | ||
326 | + | ||
327 | + return true; | ||
328 | +} | ||
329 | + | ||
330 | static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started) | ||
331 | { | ||
332 | struct vhost_vdpa *v = dev->opaque; | ||
333 | + bool ok; | ||
334 | trace_vhost_vdpa_dev_start(dev, started); | ||
335 | |||
336 | if (started) { | ||
337 | vhost_vdpa_host_notifiers_init(dev); | ||
338 | + ok = vhost_vdpa_svqs_start(dev); | ||
339 | + if (unlikely(!ok)) { | ||
340 | + return -1; | ||
341 | + } | ||
342 | vhost_vdpa_set_vring_ready(dev); | ||
343 | } else { | ||
344 | vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs); | ||
345 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_get_vring_base(struct vhost_dev *dev, | ||
346 | static int vhost_vdpa_set_vring_kick(struct vhost_dev *dev, | ||
347 | struct vhost_vring_file *file) | ||
348 | { | ||
349 | - trace_vhost_vdpa_set_vring_kick(dev, file->index, file->fd); | ||
350 | - return vhost_vdpa_call(dev, VHOST_SET_VRING_KICK, file); | ||
351 | + struct vhost_vdpa *v = dev->opaque; | ||
352 | + int vdpa_idx = file->index - dev->vq_index; | ||
353 | + | ||
354 | + if (v->shadow_vqs_enabled) { | ||
355 | + VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx); | ||
356 | + vhost_svq_set_svq_kick_fd(svq, file->fd); | ||
357 | + return 0; | ||
358 | + } else { | ||
359 | + return vhost_vdpa_set_vring_dev_kick(dev, file); | ||
360 | + } | ||
361 | } | ||
362 | |||
363 | static int vhost_vdpa_set_vring_call(struct vhost_dev *dev, | ||
29 | diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h | 364 | diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h |
30 | index XXXXXXX..XXXXXXX 100644 | 365 | index XXXXXXX..XXXXXXX 100644 |
31 | --- a/include/hw/virtio/vhost-vdpa.h | 366 | --- a/include/hw/virtio/vhost-vdpa.h |
32 | +++ b/include/hw/virtio/vhost-vdpa.h | 367 | +++ b/include/hw/virtio/vhost-vdpa.h |
33 | @@ -XXX,XX +XXX,XX @@ | 368 | @@ -XXX,XX +XXX,XX @@ |
34 | #include <gmodule.h> | 369 | #ifndef HW_VIRTIO_VHOST_VDPA_H |
35 | 370 | #define HW_VIRTIO_VHOST_VDPA_H | |
36 | #include "hw/virtio/vhost-iova-tree.h" | 371 | |
37 | +#include "hw/virtio/vhost-shadow-virtqueue.h" | 372 | +#include <gmodule.h> |
373 | + | ||
38 | #include "hw/virtio/virtio.h" | 374 | #include "hw/virtio/virtio.h" |
39 | #include "standard-headers/linux/vhost_types.h" | 375 | #include "standard-headers/linux/vhost_types.h" |
40 | 376 | ||
41 | @@ -XXX,XX +XXX,XX @@ typedef struct vhost_vdpa { | 377 | @@ -XXX,XX +XXX,XX @@ typedef struct vhost_vdpa { |
42 | /* IOVA mapping used by the Shadow Virtqueue */ | 378 | bool iotlb_batch_begin_sent; |
43 | VhostIOVATree *iova_tree; | 379 | MemoryListener listener; |
44 | GPtrArray *shadow_vqs; | 380 | struct vhost_vdpa_iova_range iova_range; |
45 | + const VhostShadowVirtqueueOps *shadow_vq_ops; | 381 | + bool shadow_vqs_enabled; |
46 | + void *shadow_vq_ops_opaque; | 382 | + GPtrArray *shadow_vqs; |
47 | struct vhost_dev *dev; | 383 | struct vhost_dev *dev; |
48 | VhostVDPAHostNotifier notifier[VIRTIO_QUEUE_MAX]; | 384 | VhostVDPAHostNotifier notifier[VIRTIO_QUEUE_MAX]; |
49 | } VhostVDPA; | 385 | } VhostVDPA; |
50 | diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c | ||
51 | index XXXXXXX..XXXXXXX 100644 | ||
52 | --- a/net/vhost-vdpa.c | ||
53 | +++ b/net/vhost-vdpa.c | ||
54 | @@ -XXX,XX +XXX,XX @@ | ||
55 | |||
56 | #include "qemu/osdep.h" | ||
57 | #include "clients.h" | ||
58 | +#include "hw/virtio/virtio-net.h" | ||
59 | #include "net/vhost_net.h" | ||
60 | #include "net/vhost-vdpa.h" | ||
61 | #include "hw/virtio/vhost-vdpa.h" | ||
62 | #include "qemu/config-file.h" | ||
63 | #include "qemu/error-report.h" | ||
64 | +#include "qemu/log.h" | ||
65 | +#include "qemu/memalign.h" | ||
66 | #include "qemu/option.h" | ||
67 | #include "qapi/error.h" | ||
68 | #include <linux/vhost.h> | ||
69 | @@ -XXX,XX +XXX,XX @@ static NetClientInfo net_vhost_vdpa_info = { | ||
70 | .check_peer_type = vhost_vdpa_check_peer_type, | ||
71 | }; | ||
72 | |||
73 | +/** | ||
74 | + * Forward buffer for the moment. | ||
75 | + */ | ||
76 | +static int vhost_vdpa_net_handle_ctrl_avail(VhostShadowVirtqueue *svq, | ||
77 | + VirtQueueElement *elem, | ||
78 | + void *opaque) | ||
79 | +{ | ||
80 | + unsigned int n = elem->out_num + elem->in_num; | ||
81 | + g_autofree struct iovec *dev_buffers = g_new(struct iovec, n); | ||
82 | + size_t in_len, dev_written; | ||
83 | + virtio_net_ctrl_ack status = VIRTIO_NET_ERR; | ||
84 | + int r; | ||
85 | + | ||
86 | + memcpy(dev_buffers, elem->out_sg, elem->out_num); | ||
87 | + memcpy(dev_buffers + elem->out_num, elem->in_sg, elem->in_num); | ||
88 | + | ||
89 | + r = vhost_svq_add(svq, &dev_buffers[0], elem->out_num, &dev_buffers[1], | ||
90 | + elem->in_num, elem); | ||
91 | + if (unlikely(r != 0)) { | ||
92 | + if (unlikely(r == -ENOSPC)) { | ||
93 | + qemu_log_mask(LOG_GUEST_ERROR, "%s: No space on device queue\n", | ||
94 | + __func__); | ||
95 | + } | ||
96 | + goto out; | ||
97 | + } | ||
98 | + | ||
99 | + /* | ||
100 | + * We can poll here since we've had BQL from the time we sent the | ||
101 | + * descriptor. Also, we need to take the answer before SVQ pulls by itself, | ||
102 | + * when BQL is released | ||
103 | + */ | ||
104 | + dev_written = vhost_svq_poll(svq); | ||
105 | + if (unlikely(dev_written < sizeof(status))) { | ||
106 | + error_report("Insufficient written data (%zu)", dev_written); | ||
107 | + } | ||
108 | + | ||
109 | +out: | ||
110 | + in_len = iov_from_buf(elem->in_sg, elem->in_num, 0, &status, | ||
111 | + sizeof(status)); | ||
112 | + if (unlikely(in_len < sizeof(status))) { | ||
113 | + error_report("Bad device CVQ written length"); | ||
114 | + } | ||
115 | + vhost_svq_push_elem(svq, elem, MIN(in_len, sizeof(status))); | ||
116 | + g_free(elem); | ||
117 | + return r; | ||
118 | +} | ||
119 | + | ||
120 | +static const VhostShadowVirtqueueOps vhost_vdpa_net_svq_ops = { | ||
121 | + .avail_handler = vhost_vdpa_net_handle_ctrl_avail, | ||
122 | +}; | ||
123 | + | ||
124 | static NetClientState *net_vhost_vdpa_init(NetClientState *peer, | ||
125 | const char *device, | ||
126 | const char *name, | ||
127 | @@ -XXX,XX +XXX,XX @@ static NetClientState *net_vhost_vdpa_init(NetClientState *peer, | ||
128 | |||
129 | s->vhost_vdpa.device_fd = vdpa_device_fd; | ||
130 | s->vhost_vdpa.index = queue_pair_index; | ||
131 | + if (!is_datapath) { | ||
132 | + s->vhost_vdpa.shadow_vq_ops = &vhost_vdpa_net_svq_ops; | ||
133 | + s->vhost_vdpa.shadow_vq_ops_opaque = s; | ||
134 | + } | ||
135 | ret = vhost_vdpa_add(nc, (void *)&s->vhost_vdpa, queue_pair_index, nvqs); | ||
136 | if (ret) { | ||
137 | qemu_del_net_client(nc); | ||
138 | -- | 386 | -- |
139 | 2.7.4 | 387 | 2.7.4 |
140 | 388 | ||
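The relay in vhost_handle_guest_kick() is easier to see in plain POSIX terms:
QEMU's EventNotifier wraps an eventfd(2), so forwarding a kick amounts to
draining one eventfd and raising another. A standalone sketch with placeholder
fd names:

    #include <stdint.h>
    #include <unistd.h>

    static void relay_kick(int guest_kick_fd, int device_kick_fd)
    {
        uint64_t cnt;

        /* drain the guest's notification (event_notifier_test_and_clear) */
        if (read(guest_kick_fd, &cnt, sizeof(cnt)) == (ssize_t)sizeof(cnt)) {
            /* re-raise it towards the device (event_notifier_set) */
            cnt = 1;
            (void)write(device_kick_fd, &cnt, sizeof(cnt));
        }
    }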
1 | From: Eugenio Pérez <eperezma@redhat.com> | 1 | From: Eugenio Pérez <eperezma@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | This will allow SVQ to add context to the different queue elements. | 3 | This will make qemu aware of the buffers the device has used, allowing it to |
4 | 4 | write the guest memory with its contents if needed. | |
5 | This patch only stores the actual element; no functional change intended. | ||
6 | 5 | ||
7 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> | 6 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> |
8 | Reviewed-by: Michael S. Tsirkin <mst@redhat.com> | 7 | Acked-by: Michael S. Tsirkin <mst@redhat.com> |
9 | Signed-off-by: Jason Wang <jasowang@redhat.com> | 8 | Signed-off-by: Jason Wang <jasowang@redhat.com> |
10 | --- | 9 | --- |
11 | hw/virtio/vhost-shadow-virtqueue.c | 16 ++++++++-------- | 10 | hw/virtio/vhost-shadow-virtqueue.c | 38 ++++++++++++++++++++++++++++++++++++++ |
12 | hw/virtio/vhost-shadow-virtqueue.h | 8 ++++++-- | 11 | hw/virtio/vhost-shadow-virtqueue.h | 4 ++++ |
13 | 2 files changed, 14 insertions(+), 10 deletions(-) | 12 | hw/virtio/vhost-vdpa.c | 31 +++++++++++++++++++++++++++++-- |
13 | 3 files changed, 71 insertions(+), 2 deletions(-) | ||
14 | 14 | ||
15 | diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c | 15 | diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c |
16 | index XXXXXXX..XXXXXXX 100644 | 16 | index XXXXXXX..XXXXXXX 100644 |
17 | --- a/hw/virtio/vhost-shadow-virtqueue.c | 17 | --- a/hw/virtio/vhost-shadow-virtqueue.c |
18 | +++ b/hw/virtio/vhost-shadow-virtqueue.c | 18 | +++ b/hw/virtio/vhost-shadow-virtqueue.c |
19 | @@ -XXX,XX +XXX,XX @@ static int vhost_svq_add(VhostShadowVirtqueue *svq, const struct iovec *out_sg, | 19 | @@ -XXX,XX +XXX,XX @@ static void vhost_handle_guest_kick(EventNotifier *n) |
20 | return -EINVAL; | 20 | } |
21 | |||
22 | /** | ||
23 | + * Forward vhost notifications | ||
24 | + * | ||
25 | + * @n: hdev call event notifier, the one that device set to notify svq. | ||
26 | + */ | ||
27 | +static void vhost_svq_handle_call(EventNotifier *n) | ||
28 | +{ | ||
29 | + VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue, | ||
30 | + hdev_call); | ||
31 | + event_notifier_test_and_clear(n); | ||
32 | + event_notifier_set(&svq->svq_call); | ||
33 | +} | ||
34 | + | ||
35 | +/** | ||
36 | + * Set the call notifier for the SVQ to call the guest | ||
37 | + * | ||
38 | + * @svq: Shadow virtqueue | ||
39 | + * @call_fd: call notifier | ||
40 | + * | ||
41 | + * Called on BQL context. | ||
42 | + */ | ||
43 | +void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd) | ||
44 | +{ | ||
45 | + if (call_fd == VHOST_FILE_UNBIND) { | ||
46 | + /* | ||
47 | + * Fail event_notifier_set if called handling device call. | ||
48 | + * | ||
49 | + * SVQ still needs device notifications, since it needs to keep | ||
50 | + * forwarding used buffers even with the unbind. | ||
51 | + */ | ||
52 | + memset(&svq->svq_call, 0, sizeof(svq->svq_call)); | ||
53 | + } else { | ||
54 | + event_notifier_init_fd(&svq->svq_call, call_fd); | ||
55 | + } | ||
56 | +} | ||
57 | + | ||
58 | +/** | ||
59 | * Set a new file descriptor for the guest to kick the SVQ and notify for avail | ||
60 | * | ||
61 | * @svq: The svq | ||
62 | @@ -XXX,XX +XXX,XX @@ VhostShadowVirtqueue *vhost_svq_new(void) | ||
21 | } | 63 | } |
22 | 64 | ||
23 | - svq->ring_id_maps[qemu_head] = elem; | 65 | event_notifier_init_fd(&svq->svq_kick, VHOST_FILE_UNBIND); |
24 | + svq->desc_state[qemu_head].elem = elem; | 66 | + event_notifier_set_handler(&svq->hdev_call, vhost_svq_handle_call); |
25 | vhost_svq_kick(svq); | 67 | return g_steal_pointer(&svq); |
26 | return 0; | 68 | |
27 | } | 69 | err_init_hdev_call: |
28 | @@ -XXX,XX +XXX,XX @@ static VirtQueueElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq, | 70 | @@ -XXX,XX +XXX,XX @@ void vhost_svq_free(gpointer pvq) |
29 | return NULL; | 71 | VhostShadowVirtqueue *vq = pvq; |
30 | } | 72 | vhost_svq_stop(vq); |
31 | 73 | event_notifier_cleanup(&vq->hdev_kick); | |
32 | - if (unlikely(!svq->ring_id_maps[used_elem.id])) { | 74 | + event_notifier_set_handler(&vq->hdev_call, NULL); |
33 | + if (unlikely(!svq->desc_state[used_elem.id].elem)) { | 75 | event_notifier_cleanup(&vq->hdev_call); |
34 | qemu_log_mask(LOG_GUEST_ERROR, | 76 | g_free(vq); |
35 | "Device %s says index %u is used, but it was not available", | ||
36 | svq->vdev->name, used_elem.id); | ||
37 | return NULL; | ||
38 | } | ||
39 | |||
40 | - num = svq->ring_id_maps[used_elem.id]->in_num + | ||
41 | - svq->ring_id_maps[used_elem.id]->out_num; | ||
42 | + num = svq->desc_state[used_elem.id].elem->in_num + | ||
43 | + svq->desc_state[used_elem.id].elem->out_num; | ||
44 | last_used_chain = vhost_svq_last_desc_of_chain(svq, num, used_elem.id); | ||
45 | svq->desc_next[last_used_chain] = svq->free_head; | ||
46 | svq->free_head = used_elem.id; | ||
47 | |||
48 | *len = used_elem.len; | ||
49 | - return g_steal_pointer(&svq->ring_id_maps[used_elem.id]); | ||
50 | + return g_steal_pointer(&svq->desc_state[used_elem.id].elem); | ||
51 | } | ||
52 | |||
53 | static void vhost_svq_flush(VhostShadowVirtqueue *svq, | ||
54 | @@ -XXX,XX +XXX,XX @@ void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev, | ||
55 | memset(svq->vring.desc, 0, driver_size); | ||
56 | svq->vring.used = qemu_memalign(qemu_real_host_page_size(), device_size); | ||
57 | memset(svq->vring.used, 0, device_size); | ||
58 | - svq->ring_id_maps = g_new0(VirtQueueElement *, svq->vring.num); | ||
59 | + svq->desc_state = g_new0(SVQDescState, svq->vring.num); | ||
60 | svq->desc_next = g_new0(uint16_t, svq->vring.num); | ||
61 | for (unsigned i = 0; i < svq->vring.num - 1; i++) { | ||
62 | svq->desc_next[i] = cpu_to_le16(i + 1); | ||
63 | @@ -XXX,XX +XXX,XX @@ void vhost_svq_stop(VhostShadowVirtqueue *svq) | ||
64 | |||
65 | for (unsigned i = 0; i < svq->vring.num; ++i) { | ||
66 | g_autofree VirtQueueElement *elem = NULL; | ||
67 | - elem = g_steal_pointer(&svq->ring_id_maps[i]); | ||
68 | + elem = g_steal_pointer(&svq->desc_state[i].elem); | ||
69 | if (elem) { | ||
70 | virtqueue_detach_element(svq->vq, elem, 0); | ||
71 | } | ||
72 | @@ -XXX,XX +XXX,XX @@ void vhost_svq_stop(VhostShadowVirtqueue *svq) | ||
73 | } | ||
74 | svq->vq = NULL; | ||
75 | g_free(svq->desc_next); | ||
76 | - g_free(svq->ring_id_maps); | ||
77 | + g_free(svq->desc_state); | ||
78 | qemu_vfree(svq->vring.desc); | ||
79 | qemu_vfree(svq->vring.used); | ||
80 | } | 77 | } |
81 | diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h | 78 | diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h |
82 | index XXXXXXX..XXXXXXX 100644 | 79 | index XXXXXXX..XXXXXXX 100644 |
83 | --- a/hw/virtio/vhost-shadow-virtqueue.h | 80 | --- a/hw/virtio/vhost-shadow-virtqueue.h |
84 | +++ b/hw/virtio/vhost-shadow-virtqueue.h | 81 | +++ b/hw/virtio/vhost-shadow-virtqueue.h |
85 | @@ -XXX,XX +XXX,XX @@ | 82 | @@ -XXX,XX +XXX,XX @@ typedef struct VhostShadowVirtqueue { |
86 | #include "standard-headers/linux/vhost_types.h" | 83 | * So shadow virtqueue must not clean it, or we would lose VirtQueue one. |
87 | #include "hw/virtio/vhost-iova-tree.h" | 84 | */ |
88 | 85 | EventNotifier svq_kick; | |
89 | +typedef struct SVQDescState { | ||
90 | + VirtQueueElement *elem; | ||
91 | +} SVQDescState; | ||
92 | + | 86 | + |
93 | /* Shadow virtqueue to relay notifications */ | 87 | + /* Guest's call notifier, where the SVQ calls guest. */ |
94 | typedef struct VhostShadowVirtqueue { | 88 | + EventNotifier svq_call; |
95 | /* Shadow vring */ | 89 | } VhostShadowVirtqueue; |
96 | @@ -XXX,XX +XXX,XX @@ typedef struct VhostShadowVirtqueue { | 90 | |
97 | /* IOVA mapping */ | 91 | void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd); |
98 | VhostIOVATree *iova_tree; | 92 | +void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd); |
99 | 93 | ||
100 | - /* Map for use the guest's descriptors */ | 94 | void vhost_svq_stop(VhostShadowVirtqueue *svq); |
101 | - VirtQueueElement **ring_id_maps; | 95 | |
102 | + /* SVQ vring descriptors state */ | 96 | diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c |
103 | + SVQDescState *desc_state; | 97 | index XXXXXXX..XXXXXXX 100644 |
104 | 98 | --- a/hw/virtio/vhost-vdpa.c | |
105 | /* Next VirtQueue element that guest made available */ | 99 | +++ b/hw/virtio/vhost-vdpa.c |
106 | VirtQueueElement *next_guest_avail_elem; | 100 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_dev_kick(struct vhost_dev *dev, |
101 | return vhost_vdpa_call(dev, VHOST_SET_VRING_KICK, file); | ||
102 | } | ||
103 | |||
104 | +static int vhost_vdpa_set_vring_dev_call(struct vhost_dev *dev, | ||
105 | + struct vhost_vring_file *file) | ||
106 | +{ | ||
107 | + trace_vhost_vdpa_set_vring_call(dev, file->index, file->fd); | ||
108 | + return vhost_vdpa_call(dev, VHOST_SET_VRING_CALL, file); | ||
109 | +} | ||
110 | + | ||
111 | /** | ||
112 | * Set the shadow virtqueue descriptors to the device | ||
113 | * | ||
114 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_dev_kick(struct vhost_dev *dev, | ||
115 | * @svq: The shadow virtqueue | ||
116 | * @idx: The index of the virtqueue in the vhost device | ||
117 | * @errp: Error | ||
118 | + * | ||
119 | + * Note that this function does not rewind kick file descriptor if cannot set | ||
120 | + * call one. | ||
121 | */ | ||
122 | static bool vhost_vdpa_svq_setup(struct vhost_dev *dev, | ||
123 | VhostShadowVirtqueue *svq, unsigned idx, | ||
124 | @@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_svq_setup(struct vhost_dev *dev, | ||
125 | r = vhost_vdpa_set_vring_dev_kick(dev, &file); | ||
126 | if (unlikely(r != 0)) { | ||
127 | error_setg_errno(errp, -r, "Can't set device kick fd"); | ||
128 | + return false; | ||
129 | + } | ||
130 | + | ||
131 | + event_notifier = &svq->hdev_call; | ||
132 | + file.fd = event_notifier_get_fd(event_notifier); | ||
133 | + r = vhost_vdpa_set_vring_dev_call(dev, &file); | ||
134 | + if (unlikely(r != 0)) { | ||
135 | + error_setg_errno(errp, -r, "Can't set device call fd"); | ||
136 | } | ||
137 | |||
138 | return r == 0; | ||
139 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_kick(struct vhost_dev *dev, | ||
140 | static int vhost_vdpa_set_vring_call(struct vhost_dev *dev, | ||
141 | struct vhost_vring_file *file) | ||
142 | { | ||
143 | - trace_vhost_vdpa_set_vring_call(dev, file->index, file->fd); | ||
144 | - return vhost_vdpa_call(dev, VHOST_SET_VRING_CALL, file); | ||
145 | + struct vhost_vdpa *v = dev->opaque; | ||
146 | + | ||
147 | + if (v->shadow_vqs_enabled) { | ||
148 | + int vdpa_idx = file->index - dev->vq_index; | ||
149 | + VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx); | ||
150 | + | ||
151 | + vhost_svq_set_svq_call_fd(svq, file->fd); | ||
152 | + return 0; | ||
153 | + } else { | ||
154 | + return vhost_vdpa_set_vring_dev_call(dev, file); | ||
155 | + } | ||
156 | } | ||
157 | |||
158 | static int vhost_vdpa_get_features(struct vhost_dev *dev, | ||
107 | -- | 159 | -- |
108 | 2.7.4 | 160 | 2.7.4 |
109 | 161 | ||
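Note that the new vhost_svq_set_svq_call_fd() doubles as the guest interrupt
mask: per the comment in the hunk, passing VHOST_FILE_UNBIND (-1 in the vhost
headers) zeroes the guest-facing notifier so that event_notifier_set() fails
instead of injecting an interrupt, while SVQ keeps consuming device calls. A
hypothetical caller masks and unmasks like this (guest_call_fd is a
placeholder):

    /* guest masks the virtqueue interrupt */
    vhost_svq_set_svq_call_fd(svq, VHOST_FILE_UNBIND);

    /* guest re-enables it, pointing SVQ back at its irqfd */
    vhost_svq_set_svq_call_fd(svq, guest_call_fd);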
1 | From: Eugenio Pérez <eperezma@redhat.com> | 1 | From: Eugenio Pérez <eperezma@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | This function allows external SVQ users to return the guest's available | 3 | This allows SVQ to negotiate features with the guest and the device. For |
4 | buffers. | 4 | the device, SVQ is a driver. While this function bypasses all |
5 | non-transport features, it needs to disable the features that SVQ does | ||
6 | not support when forwarding buffers. This includes packed vq layout, | ||
7 | indirect descriptors or event idx. | ||
8 | |||
9 | Future changes can add support to offer more features to the guest, | ||
10 | since the use of VirtQueue gives this for free. This is left out at the | ||
11 | moment for simplicity. | ||
5 | 12 | ||
6 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> | 13 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> |
7 | Reviewed-by: Michael S. Tsirkin <mst@redhat.com> | 14 | Acked-by: Michael S. Tsirkin <mst@redhat.com> |
8 | Signed-off-by: Jason Wang <jasowang@redhat.com> | 15 | Signed-off-by: Jason Wang <jasowang@redhat.com> |
9 | --- | 16 | --- |
10 | hw/virtio/vhost-shadow-virtqueue.c | 16 ++++++++++++++++ | 17 | hw/virtio/vhost-shadow-virtqueue.c | 44 ++++++++++++++++++++++++++++++++++++++ |
11 | hw/virtio/vhost-shadow-virtqueue.h | 3 +++ | 18 | hw/virtio/vhost-shadow-virtqueue.h | 2 ++ |
12 | 2 files changed, 19 insertions(+) | 19 | hw/virtio/vhost-vdpa.c | 15 +++++++++++++ |
20 | 3 files changed, 61 insertions(+) | ||
13 | 21 | ||
14 | diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c | 22 | diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c |
15 | index XXXXXXX..XXXXXXX 100644 | 23 | index XXXXXXX..XXXXXXX 100644 |
16 | --- a/hw/virtio/vhost-shadow-virtqueue.c | 24 | --- a/hw/virtio/vhost-shadow-virtqueue.c |
17 | +++ b/hw/virtio/vhost-shadow-virtqueue.c | 25 | +++ b/hw/virtio/vhost-shadow-virtqueue.c |
18 | @@ -XXX,XX +XXX,XX @@ static VirtQueueElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq, | 26 | @@ -XXX,XX +XXX,XX @@ |
19 | return g_steal_pointer(&svq->desc_state[used_elem.id].elem); | 27 | #include "hw/virtio/vhost-shadow-virtqueue.h" |
20 | } | 28 | |
21 | 29 | #include "qemu/error-report.h" | |
22 | +/** | 30 | +#include "qapi/error.h" |
23 | + * Push an element to SVQ, returning it to the guest. | 31 | #include "qemu/main-loop.h" |
32 | #include "linux-headers/linux/vhost.h" | ||
33 | |||
34 | /** | ||
35 | + * Validate the transport device features that both guests can use with the SVQ | ||
36 | + * and SVQs can use with the device. | ||
37 | + * | ||
38 | + * @dev_features: The features | ||
39 | + * @errp: Error pointer | ||
24 | + */ | 40 | + */ |
25 | +void vhost_svq_push_elem(VhostShadowVirtqueue *svq, | 41 | +bool vhost_svq_valid_features(uint64_t features, Error **errp) |
26 | + const VirtQueueElement *elem, uint32_t len) | ||
27 | +{ | 42 | +{ |
28 | + virtqueue_push(svq->vq, elem, len); | 43 | + bool ok = true; |
29 | + if (svq->next_guest_avail_elem) { | 44 | + uint64_t svq_features = features; |
30 | + /* | 45 | + |
31 | + * Avail ring was full when vhost_svq_flush was called, so it's a | 46 | + for (uint64_t b = VIRTIO_TRANSPORT_F_START; b <= VIRTIO_TRANSPORT_F_END; |
32 | + * good moment to make more descriptors available if possible. | 47 | + ++b) { |
33 | + */ | 48 | + switch (b) { |
34 | + vhost_handle_guest_kick(svq); | 49 | + case VIRTIO_F_ANY_LAYOUT: |
50 | + continue; | ||
51 | + | ||
52 | + case VIRTIO_F_ACCESS_PLATFORM: | ||
53 | + /* SVQ trust in the host's IOMMU to translate addresses */ | ||
54 | + case VIRTIO_F_VERSION_1: | ||
55 | + /* SVQ trust that the guest vring is little endian */ | ||
56 | + if (!(svq_features & BIT_ULL(b))) { | ||
57 | + svq_features |= BIT_ULL(b); | ||
58 | + ok = false; | ||
59 | + } | ||
60 | + continue; | ||
61 | + | ||
62 | + default: | ||
63 | + if (svq_features & BIT_ULL(b)) { | ||
64 | + svq_features &= ~BIT_ULL(b); | ||
65 | + ok = false; | ||
66 | + } | ||
67 | + } | ||
35 | + } | 68 | + } |
69 | + | ||
70 | + if (!ok) { | ||
71 | + error_setg(errp, "SVQ Invalid device feature flags, offer: 0x%"PRIx64 | ||
72 | + ", ok: 0x%"PRIx64, features, svq_features); | ||
73 | + } | ||
74 | + return ok; | ||
36 | +} | 75 | +} |
37 | + | 76 | + |
38 | static void vhost_svq_flush(VhostShadowVirtqueue *svq, | 77 | +/** |
39 | bool check_for_avail_queue) | 78 | * Forward guest notifications. |
40 | { | 79 | * |
80 | * @n: guest kick event notifier, the one that guest set to notify svq. | ||
41 | diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h | 81 | diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h |
42 | index XXXXXXX..XXXXXXX 100644 | 82 | index XXXXXXX..XXXXXXX 100644 |
43 | --- a/hw/virtio/vhost-shadow-virtqueue.h | 83 | --- a/hw/virtio/vhost-shadow-virtqueue.h |
44 | +++ b/hw/virtio/vhost-shadow-virtqueue.h | 84 | +++ b/hw/virtio/vhost-shadow-virtqueue.h |
45 | @@ -XXX,XX +XXX,XX @@ typedef struct VhostShadowVirtqueue { | 85 | @@ -XXX,XX +XXX,XX @@ typedef struct VhostShadowVirtqueue { |
46 | 86 | EventNotifier svq_call; | |
47 | bool vhost_svq_valid_features(uint64_t features, Error **errp); | 87 | } VhostShadowVirtqueue; |
48 | 88 | ||
49 | +void vhost_svq_push_elem(VhostShadowVirtqueue *svq, | 89 | +bool vhost_svq_valid_features(uint64_t features, Error **errp); |
50 | + const VirtQueueElement *elem, uint32_t len); | ||
51 | + | 90 | + |
52 | void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd); | 91 | void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd); |
53 | void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd); | 92 | void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd); |
54 | void vhost_svq_get_vring_addr(const VhostShadowVirtqueue *svq, | 93 | |
94 | diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c | ||
95 | index XXXXXXX..XXXXXXX 100644 | ||
96 | --- a/hw/virtio/vhost-vdpa.c | ||
97 | +++ b/hw/virtio/vhost-vdpa.c | ||
98 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v, | ||
99 | Error **errp) | ||
100 | { | ||
101 | g_autoptr(GPtrArray) shadow_vqs = NULL; | ||
102 | + uint64_t dev_features, svq_features; | ||
103 | + int r; | ||
104 | + bool ok; | ||
105 | |||
106 | if (!v->shadow_vqs_enabled) { | ||
107 | return 0; | ||
108 | } | ||
109 | |||
110 | + r = hdev->vhost_ops->vhost_get_features(hdev, &dev_features); | ||
111 | + if (r != 0) { | ||
112 | + error_setg_errno(errp, -r, "Can't get vdpa device features"); | ||
113 | + return r; | ||
114 | + } | ||
115 | + | ||
116 | + svq_features = dev_features; | ||
117 | + ok = vhost_svq_valid_features(svq_features, errp); | ||
118 | + if (unlikely(!ok)) { | ||
119 | + return -1; | ||
120 | + } | ||
121 | + | ||
122 | shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_svq_free); | ||
123 | for (unsigned n = 0; n < hdev->nvqs; ++n) { | ||
124 | g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new(); | ||
55 | -- | 125 | -- |
56 | 2.7.4 | 126 | 2.7.4 |
57 | 127 | ||
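
The feature validation above boils down to a whitelist over the virtio transport feature range: VIRTIO_F_VERSION_1 (little-endian, modern vring layout) and VIRTIO_F_ACCESS_PLATFORM (addresses translated by the IOMMU) must be offered, and every other transport bit must be clear. Below is a minimal standalone sketch of that rule; the bit values and range bounds are illustrative stand-ins taken from the virtio spec (VIRTIO_TRANSPORT_F_END varies between header versions, and the real code also whitelists VIRTIO_F_ANY_LAYOUT), not QEMU code:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define BIT_ULL(n)               (1ULL << (n))
    #define VIRTIO_TRANSPORT_F_START 28   /* first transport feature bit */
    #define VIRTIO_TRANSPORT_F_END   38   /* last transport bit checked here */
    #define VIRTIO_F_VERSION_1       32   /* modern device: LE vring layout */
    #define VIRTIO_F_ACCESS_PLATFORM 33   /* device accesses go through IOMMU */

    /* True if SVQ could work with the offered transport features. */
    static bool svq_valid_features(uint64_t features)
    {
        for (uint64_t b = VIRTIO_TRANSPORT_F_START; b <= VIRTIO_TRANSPORT_F_END; ++b) {
            bool offered = features & BIT_ULL(b);

            if (b == VIRTIO_F_VERSION_1 || b == VIRTIO_F_ACCESS_PLATFORM) {
                if (!offered) {
                    return false;   /* SVQ requires these two bits */
                }
            } else if (offered) {
                return false;       /* any other transport feature is rejected */
            }
        }
        return true;
    }

    int main(void)
    {
        uint64_t ok = BIT_ULL(VIRTIO_F_VERSION_1) | BIT_ULL(VIRTIO_F_ACCESS_PLATFORM);
        printf("%d %d\n", svq_valid_features(ok), svq_valid_features(0)); /* 1 0 */
        return 0;
    }
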
1 | From: Eugenio Pérez <eperezma@redhat.com> | 1 | From: Eugenio Pérez <eperezma@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | It allows the Shadow Control VirtQueue to wait for the device to use the | 3 | It reports the shadow virtqueue address from the qemu virtual address space. |
4 | available buffers. | 4 | |
5 | Since this will be different from the guest's vaddr, although the device |
6 | can access it, SVQ takes special care with its alignment and keeps it free |
7 | of garbage data. It assumes the IOMMU will work in host_page_size ranges for that. |
5 | 8 | ||
6 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> | 9 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> |
7 | Reviewed-by: Michael S. Tsirkin <mst@redhat.com> | 10 | Acked-by: Michael S. Tsirkin <mst@redhat.com> |
8 | Signed-off-by: Jason Wang <jasowang@redhat.com> | 11 | Signed-off-by: Jason Wang <jasowang@redhat.com> |
9 | --- | 12 | --- |
10 | hw/virtio/vhost-shadow-virtqueue.c | 27 +++++++++++++++++++++++++++ | 13 | hw/virtio/vhost-shadow-virtqueue.c | 29 +++++++++++++++++++++++++++++ |
11 | hw/virtio/vhost-shadow-virtqueue.h | 1 + | 14 | hw/virtio/vhost-shadow-virtqueue.h | 9 +++++++++ |
12 | 2 files changed, 28 insertions(+) | 15 | 2 files changed, 38 insertions(+) |
13 | 16 | ||
14 | diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c | 17 | diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c |
15 | index XXXXXXX..XXXXXXX 100644 | 18 | index XXXXXXX..XXXXXXX 100644 |
16 | --- a/hw/virtio/vhost-shadow-virtqueue.c | 19 | --- a/hw/virtio/vhost-shadow-virtqueue.c |
17 | +++ b/hw/virtio/vhost-shadow-virtqueue.c | 20 | +++ b/hw/virtio/vhost-shadow-virtqueue.c |
18 | @@ -XXX,XX +XXX,XX @@ static void vhost_svq_flush(VhostShadowVirtqueue *svq, | 21 | @@ -XXX,XX +XXX,XX @@ void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd) |
19 | } | 22 | } |
20 | 23 | ||
21 | /** | 24 | /** |
22 | + * Poll the SVQ for one device used buffer. | 25 | + * Get the shadow vq vring address. |
23 | + * | 26 | + * @svq: Shadow virtqueue |
24 | + * This function races with the main event loop SVQ polling, so extra | 27 | + * @addr: Destination to store address |
25 | + * synchronization is needed. | ||
26 | + * | ||
27 | + * Return the length written by the device. | ||
28 | + */ | 28 | + */ |
29 | +size_t vhost_svq_poll(VhostShadowVirtqueue *svq) | 29 | +void vhost_svq_get_vring_addr(const VhostShadowVirtqueue *svq, |
30 | + struct vhost_vring_addr *addr) | ||
30 | +{ | 31 | +{ |
31 | + int64_t start_us = g_get_monotonic_time(); | 32 | + addr->desc_user_addr = (uint64_t)(intptr_t)svq->vring.desc; |
32 | + do { | 33 | + addr->avail_user_addr = (uint64_t)(intptr_t)svq->vring.avail; |
33 | + uint32_t len; | 34 | + addr->used_user_addr = (uint64_t)(intptr_t)svq->vring.used; |
34 | + VirtQueueElement *elem = vhost_svq_get_buf(svq, &len); | 35 | +} |
35 | + if (elem) { | ||
36 | + return len; | ||
37 | + } | ||
38 | + | 36 | + |
39 | + if (unlikely(g_get_monotonic_time() - start_us > 10e6)) { | 37 | +size_t vhost_svq_driver_area_size(const VhostShadowVirtqueue *svq) |
40 | + return 0; | 38 | +{ |
41 | + } | 39 | + size_t desc_size = sizeof(vring_desc_t) * svq->vring.num; |
40 | + size_t avail_size = offsetof(vring_avail_t, ring) + | ||
41 | + sizeof(uint16_t) * svq->vring.num; | ||
42 | + | 42 | + |
43 | + /* Make sure we read new used_idx */ | 43 | + return ROUND_UP(desc_size + avail_size, qemu_real_host_page_size); |
44 | + smp_rmb(); | 44 | +} |
45 | + } while (true); | 45 | + |
46 | +size_t vhost_svq_device_area_size(const VhostShadowVirtqueue *svq) | ||
47 | +{ | ||
48 | + size_t used_size = offsetof(vring_used_t, ring) + | ||
49 | + sizeof(vring_used_elem_t) * svq->vring.num; | ||
50 | + return ROUND_UP(used_size, qemu_real_host_page_size); | ||
46 | +} | 51 | +} |
47 | + | 52 | + |
48 | +/** | 53 | +/** |
49 | * Forward used buffers. | 54 | * Set a new file descriptor for the guest to kick the SVQ and notify for avail |
50 | * | 55 | * |
51 | * @n: hdev call event notifier, the one that device set to notify svq. | 56 | * @svq: The svq |
52 | diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h | 57 | diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h |
53 | index XXXXXXX..XXXXXXX 100644 | 58 | index XXXXXXX..XXXXXXX 100644 |
54 | --- a/hw/virtio/vhost-shadow-virtqueue.h | 59 | --- a/hw/virtio/vhost-shadow-virtqueue.h |
55 | +++ b/hw/virtio/vhost-shadow-virtqueue.h | 60 | +++ b/hw/virtio/vhost-shadow-virtqueue.h |
56 | @@ -XXX,XX +XXX,XX @@ void vhost_svq_push_elem(VhostShadowVirtqueue *svq, | 61 | @@ -XXX,XX +XXX,XX @@ |
57 | int vhost_svq_add(VhostShadowVirtqueue *svq, const struct iovec *out_sg, | 62 | #define VHOST_SHADOW_VIRTQUEUE_H |
58 | size_t out_num, const struct iovec *in_sg, size_t in_num, | 63 | |
59 | VirtQueueElement *elem); | 64 | #include "qemu/event_notifier.h" |
60 | +size_t vhost_svq_poll(VhostShadowVirtqueue *svq); | 65 | +#include "hw/virtio/virtio.h" |
66 | +#include "standard-headers/linux/vhost_types.h" | ||
67 | |||
68 | /* Shadow virtqueue to relay notifications */ | ||
69 | typedef struct VhostShadowVirtqueue { | ||
70 | + /* Shadow vring */ | ||
71 | + struct vring vring; | ||
72 | + | ||
73 | /* Shadow kick notifier, sent to vhost */ | ||
74 | EventNotifier hdev_kick; | ||
75 | /* Shadow call notifier, sent to vhost */ | ||
76 | @@ -XXX,XX +XXX,XX @@ bool vhost_svq_valid_features(uint64_t features, Error **errp); | ||
61 | 77 | ||
62 | void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd); | 78 | void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd); |
63 | void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd); | 79 | void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd); |
80 | +void vhost_svq_get_vring_addr(const VhostShadowVirtqueue *svq, | ||
81 | + struct vhost_vring_addr *addr); | ||
82 | +size_t vhost_svq_driver_area_size(const VhostShadowVirtqueue *svq); | ||
83 | +size_t vhost_svq_device_area_size(const VhostShadowVirtqueue *svq); | ||
84 | |||
85 | void vhost_svq_stop(VhostShadowVirtqueue *svq); | ||
86 | |||
64 | -- | 87 | -- |
65 | 2.7.4 | 88 | 2.7.4 |
66 | 89 | ||
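
The driver/device area sizes computed above follow directly from the split-ring layout in the virtio 1.x spec: 16 bytes per descriptor, a 4-byte avail header plus 2 bytes per ring entry, and a 4-byte used header plus 8 bytes per used element, with each area rounded up to a host page so it can be mapped into the device independently. A quick standalone sketch of the same arithmetic, with PAGE_SIZE and ROUND_UP standing in for qemu_real_host_page_size and QEMU's macro:

    #include <stddef.h>
    #include <stdio.h>

    #define PAGE_SIZE 4096
    #define ROUND_UP(n, d) ((((n) + (d) - 1) / (d)) * (d))

    /* Driver area: descriptor table (16 bytes/desc) + avail ring (4 + 2*num). */
    static size_t svq_driver_area_size(size_t num)
    {
        size_t desc  = 16 * num;
        size_t avail = 4 + 2 * num;      /* flags + idx + ring[num] */
        return ROUND_UP(desc + avail, PAGE_SIZE);
    }

    /* Device area: used ring (4 + 8*num), mapped in its own page(s). */
    static size_t svq_device_area_size(size_t num)
    {
        size_t used = 4 + 8 * num;       /* flags + idx + ring[num] of {id,len} */
        return ROUND_UP(used, PAGE_SIZE);
    }

    int main(void)
    {
        /* A 256-entry queue: 4096 + 516 -> 8192 driver bytes; 2052 -> 4096 device bytes. */
        printf("driver=%zu device=%zu\n", svq_driver_area_size(256),
               svq_device_area_size(256));
        return 0;
    }
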
1 | From: Eugenio Pérez <eperezma@redhat.com> | 1 | From: Eugenio Pérez <eperezma@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | This allows external vhost-net devices to modify the state of the | 3 | First half of the buffer forwarding part, preparing the vhost-vdpa |
4 | VirtIO device model once the vhost-vdpa device has acknowledged the | 4 | callbacks that SVQ needs. QEMU cannot enable SVQ yet, so |
5 | control commands. | 5 | this is effectively dead code for now, but it helps to reduce |
6 | patch size. | ||
6 | 7 | ||
7 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> | 8 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> |
8 | Reviewed-by: Michael S. Tsirkin <mst@redhat.com> | 9 | Acked-by: Michael S. Tsirkin <mst@redhat.com> |
9 | Signed-off-by: Jason Wang <jasowang@redhat.com> | 10 | Signed-off-by: Jason Wang <jasowang@redhat.com> |
10 | --- | 11 | --- |
11 | hw/net/virtio-net.c | 84 ++++++++++++++++++++++++------------------ | 12 | hw/virtio/vhost-vdpa.c | 48 +++++++++++++++++++++++++++++++++++++++++------- |
12 | include/hw/virtio/virtio-net.h | 4 ++ | 13 | 1 file changed, 41 insertions(+), 7 deletions(-) |
13 | 2 files changed, 53 insertions(+), 35 deletions(-) | ||
14 | 14 | ||
15 | diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c | 15 | diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c |
16 | index XXXXXXX..XXXXXXX 100644 | 16 | index XXXXXXX..XXXXXXX 100644 |
17 | --- a/hw/net/virtio-net.c | 17 | --- a/hw/virtio/vhost-vdpa.c |
18 | +++ b/hw/net/virtio-net.c | 18 | +++ b/hw/virtio/vhost-vdpa.c |
19 | @@ -XXX,XX +XXX,XX @@ static int virtio_net_handle_mq(VirtIONet *n, uint8_t cmd, | 19 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_get_config(struct vhost_dev *dev, uint8_t *config, |
20 | return VIRTIO_NET_OK; | 20 | return ret; |
21 | } | ||
22 | |||
23 | +static int vhost_vdpa_set_dev_vring_base(struct vhost_dev *dev, | ||
24 | + struct vhost_vring_state *ring) | ||
25 | +{ | ||
26 | + trace_vhost_vdpa_set_vring_base(dev, ring->index, ring->num); | ||
27 | + return vhost_vdpa_call(dev, VHOST_SET_VRING_BASE, ring); | ||
28 | +} | ||
29 | + | ||
30 | static int vhost_vdpa_set_vring_dev_kick(struct vhost_dev *dev, | ||
31 | struct vhost_vring_file *file) | ||
32 | { | ||
33 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_dev_call(struct vhost_dev *dev, | ||
34 | return vhost_vdpa_call(dev, VHOST_SET_VRING_CALL, file); | ||
21 | } | 35 | } |
22 | 36 | ||
23 | -static void virtio_net_handle_ctrl(VirtIODevice *vdev, VirtQueue *vq) | 37 | +static int vhost_vdpa_set_vring_dev_addr(struct vhost_dev *dev, |
24 | +size_t virtio_net_handle_ctrl_iov(VirtIODevice *vdev, | 38 | + struct vhost_vring_addr *addr) |
25 | + const struct iovec *in_sg, unsigned in_num, | 39 | +{ |
26 | + const struct iovec *out_sg, | 40 | + trace_vhost_vdpa_set_vring_addr(dev, addr->index, addr->flags, |
27 | + unsigned out_num) | 41 | + addr->desc_user_addr, addr->used_user_addr, |
42 | + addr->avail_user_addr, | ||
43 | + addr->log_guest_addr); | ||
44 | + | ||
45 | + return vhost_vdpa_call(dev, VHOST_SET_VRING_ADDR, addr); | ||
46 | + | ||
47 | +} | ||
48 | + | ||
49 | /** | ||
50 | * Set the shadow virtqueue descriptors to the device | ||
51 | * | ||
52 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_log_base(struct vhost_dev *dev, uint64_t base, | ||
53 | static int vhost_vdpa_set_vring_addr(struct vhost_dev *dev, | ||
54 | struct vhost_vring_addr *addr) | ||
28 | { | 55 | { |
29 | VirtIONet *n = VIRTIO_NET(vdev); | 56 | - trace_vhost_vdpa_set_vring_addr(dev, addr->index, addr->flags, |
30 | struct virtio_net_ctrl_hdr ctrl; | 57 | - addr->desc_user_addr, addr->used_user_addr, |
31 | virtio_net_ctrl_ack status = VIRTIO_NET_ERR; | 58 | - addr->avail_user_addr, |
32 | - VirtQueueElement *elem; | 59 | - addr->log_guest_addr); |
33 | size_t s; | 60 | - return vhost_vdpa_call(dev, VHOST_SET_VRING_ADDR, addr); |
34 | struct iovec *iov, *iov2; | 61 | + struct vhost_vdpa *v = dev->opaque; |
35 | - unsigned int iov_cnt; | ||
36 | + | 62 | + |
37 | + if (iov_size(in_sg, in_num) < sizeof(status) || | 63 | + if (v->shadow_vqs_enabled) { |
38 | + iov_size(out_sg, out_num) < sizeof(ctrl)) { | 64 | + /* |
39 | + virtio_error(vdev, "virtio-net ctrl missing headers"); | 65 | + * Device vring addr was set at device start. SVQ base is handled by |
66 | + * VirtQueue code. | ||
67 | + */ | ||
40 | + return 0; | 68 | + return 0; |
41 | + } | 69 | + } |
42 | + | 70 | + |
43 | + iov2 = iov = g_memdup2(out_sg, sizeof(struct iovec) * out_num); | 71 | + return vhost_vdpa_set_vring_dev_addr(dev, addr); |
44 | + s = iov_to_buf(iov, out_num, 0, &ctrl, sizeof(ctrl)); | 72 | } |
45 | + iov_discard_front(&iov, &out_num, sizeof(ctrl)); | 73 | |
46 | + if (s != sizeof(ctrl)) { | 74 | static int vhost_vdpa_set_vring_num(struct vhost_dev *dev, |
47 | + status = VIRTIO_NET_ERR; | 75 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_num(struct vhost_dev *dev, |
48 | + } else if (ctrl.class == VIRTIO_NET_CTRL_RX) { | 76 | static int vhost_vdpa_set_vring_base(struct vhost_dev *dev, |
49 | + status = virtio_net_handle_rx_mode(n, ctrl.cmd, iov, out_num); | 77 | struct vhost_vring_state *ring) |
50 | + } else if (ctrl.class == VIRTIO_NET_CTRL_MAC) { | 78 | { |
51 | + status = virtio_net_handle_mac(n, ctrl.cmd, iov, out_num); | 79 | - trace_vhost_vdpa_set_vring_base(dev, ring->index, ring->num); |
52 | + } else if (ctrl.class == VIRTIO_NET_CTRL_VLAN) { | 80 | - return vhost_vdpa_call(dev, VHOST_SET_VRING_BASE, ring); |
53 | + status = virtio_net_handle_vlan_table(n, ctrl.cmd, iov, out_num); | 81 | + struct vhost_vdpa *v = dev->opaque; |
54 | + } else if (ctrl.class == VIRTIO_NET_CTRL_ANNOUNCE) { | 82 | + |
55 | + status = virtio_net_handle_announce(n, ctrl.cmd, iov, out_num); | 83 | + if (v->shadow_vqs_enabled) { |
56 | + } else if (ctrl.class == VIRTIO_NET_CTRL_MQ) { | 84 | + /* |
57 | + status = virtio_net_handle_mq(n, ctrl.cmd, iov, out_num); | 85 | + * Device vring base was set at device start. SVQ base is handled by |
58 | + } else if (ctrl.class == VIRTIO_NET_CTRL_GUEST_OFFLOADS) { | 86 | + * VirtQueue code. |
59 | + status = virtio_net_handle_offloads(n, ctrl.cmd, iov, out_num); | 87 | + */ |
88 | + return 0; | ||
60 | + } | 89 | + } |
61 | + | 90 | + |
62 | + s = iov_from_buf(in_sg, in_num, 0, &status, sizeof(status)); | 91 | + return vhost_vdpa_set_dev_vring_base(dev, ring); |
63 | + assert(s == sizeof(status)); | ||
64 | + | ||
65 | + g_free(iov2); | ||
66 | + return sizeof(status); | ||
67 | +} | ||
68 | + | ||
69 | +static void virtio_net_handle_ctrl(VirtIODevice *vdev, VirtQueue *vq) | ||
70 | +{ | ||
71 | + VirtQueueElement *elem; | ||
72 | |||
73 | for (;;) { | ||
74 | + size_t written; | ||
75 | elem = virtqueue_pop(vq, sizeof(VirtQueueElement)); | ||
76 | if (!elem) { | ||
77 | break; | ||
78 | } | ||
79 | - if (iov_size(elem->in_sg, elem->in_num) < sizeof(status) || | ||
80 | - iov_size(elem->out_sg, elem->out_num) < sizeof(ctrl)) { | ||
81 | - virtio_error(vdev, "virtio-net ctrl missing headers"); | ||
82 | + | ||
83 | + written = virtio_net_handle_ctrl_iov(vdev, elem->in_sg, elem->in_num, | ||
84 | + elem->out_sg, elem->out_num); | ||
85 | + if (written > 0) { | ||
86 | + virtqueue_push(vq, elem, written); | ||
87 | + virtio_notify(vdev, vq); | ||
88 | + g_free(elem); | ||
89 | + } else { | ||
90 | virtqueue_detach_element(vq, elem, 0); | ||
91 | g_free(elem); | ||
92 | break; | ||
93 | } | ||
94 | - | ||
95 | - iov_cnt = elem->out_num; | ||
96 | - iov2 = iov = g_memdup2(elem->out_sg, | ||
97 | - sizeof(struct iovec) * elem->out_num); | ||
98 | - s = iov_to_buf(iov, iov_cnt, 0, &ctrl, sizeof(ctrl)); | ||
99 | - iov_discard_front(&iov, &iov_cnt, sizeof(ctrl)); | ||
100 | - if (s != sizeof(ctrl)) { | ||
101 | - status = VIRTIO_NET_ERR; | ||
102 | - } else if (ctrl.class == VIRTIO_NET_CTRL_RX) { | ||
103 | - status = virtio_net_handle_rx_mode(n, ctrl.cmd, iov, iov_cnt); | ||
104 | - } else if (ctrl.class == VIRTIO_NET_CTRL_MAC) { | ||
105 | - status = virtio_net_handle_mac(n, ctrl.cmd, iov, iov_cnt); | ||
106 | - } else if (ctrl.class == VIRTIO_NET_CTRL_VLAN) { | ||
107 | - status = virtio_net_handle_vlan_table(n, ctrl.cmd, iov, iov_cnt); | ||
108 | - } else if (ctrl.class == VIRTIO_NET_CTRL_ANNOUNCE) { | ||
109 | - status = virtio_net_handle_announce(n, ctrl.cmd, iov, iov_cnt); | ||
110 | - } else if (ctrl.class == VIRTIO_NET_CTRL_MQ) { | ||
111 | - status = virtio_net_handle_mq(n, ctrl.cmd, iov, iov_cnt); | ||
112 | - } else if (ctrl.class == VIRTIO_NET_CTRL_GUEST_OFFLOADS) { | ||
113 | - status = virtio_net_handle_offloads(n, ctrl.cmd, iov, iov_cnt); | ||
114 | - } | ||
115 | - | ||
116 | - s = iov_from_buf(elem->in_sg, elem->in_num, 0, &status, sizeof(status)); | ||
117 | - assert(s == sizeof(status)); | ||
118 | - | ||
119 | - virtqueue_push(vq, elem, sizeof(status)); | ||
120 | - virtio_notify(vdev, vq); | ||
121 | - g_free(iov2); | ||
122 | - g_free(elem); | ||
123 | } | ||
124 | } | 92 | } |
125 | 93 | ||
126 | diff --git a/include/hw/virtio/virtio-net.h b/include/hw/virtio/virtio-net.h | 94 | static int vhost_vdpa_get_vring_base(struct vhost_dev *dev, |
127 | index XXXXXXX..XXXXXXX 100644 | ||
128 | --- a/include/hw/virtio/virtio-net.h | ||
129 | +++ b/include/hw/virtio/virtio-net.h | ||
130 | @@ -XXX,XX +XXX,XX @@ struct VirtIONet { | ||
131 | struct EBPFRSSContext ebpf_rss; | ||
132 | }; | ||
133 | |||
134 | +size_t virtio_net_handle_ctrl_iov(VirtIODevice *vdev, | ||
135 | + const struct iovec *in_sg, unsigned in_num, | ||
136 | + const struct iovec *out_sg, | ||
137 | + unsigned out_num); | ||
138 | void virtio_net_set_netclient_name(VirtIONet *n, const char *name, | ||
139 | const char *type); | ||
140 | |||
141 | -- | 95 | -- |
142 | 2.7.4 | 96 | 2.7.4 |
143 | 97 | ||
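
The control-queue refactor on the left revolves around one iovec idiom: linearize a fixed-size control header out of the guest's scatter-gather list, then treat the remaining bytes as the command payload. A simplified standalone sketch of that pattern follows; iov_copy_out() is a hypothetical stand-in for qemu's iov_to_buf(), not the real API:

    #include <stdint.h>
    #include <string.h>
    #include <stdio.h>
    #include <sys/uio.h>

    /* Copy up to len bytes out of a scatter-gather list into a flat buffer. */
    static size_t iov_copy_out(const struct iovec *iov, unsigned cnt,
                               void *buf, size_t len)
    {
        size_t done = 0;
        for (unsigned i = 0; i < cnt && done < len; i++) {
            size_t n = iov[i].iov_len < len - done ? iov[i].iov_len : len - done;
            memcpy((char *)buf + done, iov[i].iov_base, n);
            done += n;
        }
        return done;
    }

    int main(void)
    {
        /* A 2-byte ctrl header that happens to straddle two iovec segments. */
        struct { uint8_t class, cmd; } hdr;
        uint8_t b0[1] = { 4 /* class */ }, b1[3] = { 0 /* cmd */, 1, 2 };
        struct iovec sg[] = { { b0, sizeof(b0) }, { b1, sizeof(b1) } };

        size_t s = iov_copy_out(sg, 2, &hdr, sizeof(hdr));
        printf("copied=%zu class=%u cmd=%u\n", s, hdr.class, hdr.cmd);
        return 0;
    }
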
1 | From: Eugenio Pérez <eperezma@redhat.com> | 1 | From: Eugenio Pérez <eperezma@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | Future code needs to call it from vhost_svq_add. | 3 | Initial version of the shadow virtqueue that actually forwards buffers. There |
4 | 4 | is no iommu support at the moment, and that will be addressed in future | |
5 | No functional change intended. | 5 | patches of this series. Since all vhost-vdpa devices use forced IOMMU, |
6 | this means that SVQ is not usable at this point of the series on any | ||
7 | device. | ||
8 | |||
9 | For simplicity it only supports modern devices that expect the vring |
10 | in little endian, with split ring and no event idx or indirect | ||
11 | descriptors. Support for them will not be added in this series. | ||
12 | |||
13 | It reuses the VirtQueue code for the device part. The driver part is | ||
14 | based on Linux's virtio_ring driver, but with stripped functionality | ||
15 | and optimizations so it's easier to review. | ||
16 | |||
17 | However, forwarding buffers has some particular quirks: one of the most |
18 | unexpected ones is that a guest's buffer can span more than one |
19 | descriptor in SVQ. While this is handled gracefully by qemu's |
20 | emulated virtio devices, it may cause an unexpected SVQ queue full. This |
21 | patch also handles that by checking for this condition at both guest |
22 | kicks and device calls. The code may be more elegant in the future if |
23 | SVQ code runs in its own iocontext. | ||
6 | 24 | ||
7 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> | 25 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> |
8 | Reviewed-by: Michael S. Tsirkin <mst@redhat.com> | 26 | Acked-by: Michael S. Tsirkin <mst@redhat.com> |
9 | Signed-off-by: Jason Wang <jasowang@redhat.com> | 27 | Signed-off-by: Jason Wang <jasowang@redhat.com> |
10 | --- | 28 | --- |
11 | hw/virtio/vhost-shadow-virtqueue.c | 28 ++++++++++++++-------------- | 29 | hw/virtio/vhost-shadow-virtqueue.c | 352 ++++++++++++++++++++++++++++++++++++- |
12 | 1 file changed, 14 insertions(+), 14 deletions(-) | 30 | hw/virtio/vhost-shadow-virtqueue.h | 26 +++ |
31 | hw/virtio/vhost-vdpa.c | 155 +++++++++++++++- | ||
32 | 3 files changed, 522 insertions(+), 11 deletions(-) | ||
13 | 33 | ||
14 | diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c | 34 | diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c |
15 | index XXXXXXX..XXXXXXX 100644 | 35 | index XXXXXXX..XXXXXXX 100644 |
16 | --- a/hw/virtio/vhost-shadow-virtqueue.c | 36 | --- a/hw/virtio/vhost-shadow-virtqueue.c |
17 | +++ b/hw/virtio/vhost-shadow-virtqueue.c | 37 | +++ b/hw/virtio/vhost-shadow-virtqueue.c |
18 | @@ -XXX,XX +XXX,XX @@ static bool vhost_svq_add_split(VhostShadowVirtqueue *svq, | 38 | @@ -XXX,XX +XXX,XX @@ |
19 | return true; | 39 | #include "qemu/error-report.h" |
40 | #include "qapi/error.h" | ||
41 | #include "qemu/main-loop.h" | ||
42 | +#include "qemu/log.h" | ||
43 | +#include "qemu/memalign.h" | ||
44 | #include "linux-headers/linux/vhost.h" | ||
45 | |||
46 | /** | ||
47 | @@ -XXX,XX +XXX,XX @@ bool vhost_svq_valid_features(uint64_t features, Error **errp) | ||
20 | } | 48 | } |
21 | 49 | ||
50 | /** | ||
51 | - * Forward guest notifications. | ||
52 | + * Number of descriptors that the SVQ can make available from the guest. | ||
53 | + * | ||
54 | + * @svq: The svq | ||
55 | + */ | ||
56 | +static uint16_t vhost_svq_available_slots(const VhostShadowVirtqueue *svq) | ||
57 | +{ | ||
58 | + return svq->vring.num - (svq->shadow_avail_idx - svq->shadow_used_idx); | ||
59 | +} | ||
60 | + | ||
61 | +static void vhost_vring_write_descs(VhostShadowVirtqueue *svq, | ||
62 | + const struct iovec *iovec, size_t num, | ||
63 | + bool more_descs, bool write) | ||
64 | +{ | ||
65 | + uint16_t i = svq->free_head, last = svq->free_head; | ||
66 | + unsigned n; | ||
67 | + uint16_t flags = write ? cpu_to_le16(VRING_DESC_F_WRITE) : 0; | ||
68 | + vring_desc_t *descs = svq->vring.desc; | ||
69 | + | ||
70 | + if (num == 0) { | ||
71 | + return; | ||
72 | + } | ||
73 | + | ||
74 | + for (n = 0; n < num; n++) { | ||
75 | + if (more_descs || (n + 1 < num)) { | ||
76 | + descs[i].flags = flags | cpu_to_le16(VRING_DESC_F_NEXT); | ||
77 | + } else { | ||
78 | + descs[i].flags = flags; | ||
79 | + } | ||
80 | + descs[i].addr = cpu_to_le64((hwaddr)(intptr_t)iovec[n].iov_base); | ||
81 | + descs[i].len = cpu_to_le32(iovec[n].iov_len); | ||
82 | + | ||
83 | + last = i; | ||
84 | + i = cpu_to_le16(descs[i].next); | ||
85 | + } | ||
86 | + | ||
87 | + svq->free_head = le16_to_cpu(descs[last].next); | ||
88 | +} | ||
89 | + | ||
90 | +static bool vhost_svq_add_split(VhostShadowVirtqueue *svq, | ||
91 | + VirtQueueElement *elem, unsigned *head) | ||
92 | +{ | ||
93 | + unsigned avail_idx; | ||
94 | + vring_avail_t *avail = svq->vring.avail; | ||
95 | + | ||
96 | + *head = svq->free_head; | ||
97 | + | ||
98 | + /* We need some descriptors here */ | ||
99 | + if (unlikely(!elem->out_num && !elem->in_num)) { | ||
100 | + qemu_log_mask(LOG_GUEST_ERROR, | ||
101 | + "Guest provided element with no descriptors"); | ||
102 | + return false; | ||
103 | + } | ||
104 | + | ||
105 | + vhost_vring_write_descs(svq, elem->out_sg, elem->out_num, elem->in_num > 0, | ||
106 | + false); | ||
107 | + vhost_vring_write_descs(svq, elem->in_sg, elem->in_num, false, true); | ||
108 | + | ||
109 | + /* | ||
110 | + * Put the entry in the available array (but don't update avail->idx until | ||
111 | + * they do sync). | ||
112 | + */ | ||
113 | + avail_idx = svq->shadow_avail_idx & (svq->vring.num - 1); | ||
114 | + avail->ring[avail_idx] = cpu_to_le16(*head); | ||
115 | + svq->shadow_avail_idx++; | ||
116 | + | ||
117 | + /* Update the avail index after write the descriptor */ | ||
118 | + smp_wmb(); | ||
119 | + avail->idx = cpu_to_le16(svq->shadow_avail_idx); | ||
120 | + | ||
121 | + return true; | ||
122 | +} | ||
123 | + | ||
124 | +static bool vhost_svq_add(VhostShadowVirtqueue *svq, VirtQueueElement *elem) | ||
125 | +{ | ||
126 | + unsigned qemu_head; | ||
127 | + bool ok = vhost_svq_add_split(svq, elem, &qemu_head); | ||
128 | + if (unlikely(!ok)) { | ||
129 | + return false; | ||
130 | + } | ||
131 | + | ||
132 | + svq->ring_id_maps[qemu_head] = elem; | ||
133 | + return true; | ||
134 | +} | ||
135 | + | ||
22 | +static void vhost_svq_kick(VhostShadowVirtqueue *svq) | 136 | +static void vhost_svq_kick(VhostShadowVirtqueue *svq) |
23 | +{ | 137 | +{ |
24 | + /* | 138 | + /* |
25 | + * We need to expose the available array entries before checking the used | 139 | + * We need to expose the available array entries before checking the used |
26 | + * flags | 140 | + * flags |
... | ... | ||
31 | + } | 145 | + } |
32 | + | 146 | + |
33 | + event_notifier_set(&svq->hdev_kick); | 147 | + event_notifier_set(&svq->hdev_kick); |
34 | +} | 148 | +} |
35 | + | 149 | + |
150 | +/** | ||
151 | + * Forward available buffers. | ||
152 | + * | ||
153 | + * @svq: Shadow VirtQueue | ||
154 | + * | ||
155 | + * Note that this function does not guarantee that all guest's available | ||
156 | + * buffers are available to the device in SVQ avail ring. The guest may have | ||
157 | + * exposed a GPA / GIOVA contiguous buffer, but it may not be contiguous in | ||
158 | + * qemu vaddr. | ||
159 | + * | ||
160 | + * If that happens, guest's kick notifications will be disabled until the | ||
161 | + * device uses some buffers. | ||
162 | + */ | ||
163 | +static void vhost_handle_guest_kick(VhostShadowVirtqueue *svq) | ||
164 | +{ | ||
165 | + /* Clear event notifier */ | ||
166 | + event_notifier_test_and_clear(&svq->svq_kick); | ||
167 | + | ||
168 | + /* Forward to the device as many available buffers as possible */ | ||
169 | + do { | ||
170 | + virtio_queue_set_notification(svq->vq, false); | ||
171 | + | ||
172 | + while (true) { | ||
173 | + VirtQueueElement *elem; | ||
174 | + bool ok; | ||
175 | + | ||
176 | + if (svq->next_guest_avail_elem) { | ||
177 | + elem = g_steal_pointer(&svq->next_guest_avail_elem); | ||
178 | + } else { | ||
179 | + elem = virtqueue_pop(svq->vq, sizeof(*elem)); | ||
180 | + } | ||
181 | + | ||
182 | + if (!elem) { | ||
183 | + break; | ||
184 | + } | ||
185 | + | ||
186 | + if (elem->out_num + elem->in_num > vhost_svq_available_slots(svq)) { | ||
187 | + /* | ||
188 | + * This condition is possible since a contiguous buffer in GPA | ||
189 | + * does not imply a contiguous buffer in qemu's VA | ||
190 | + * scatter-gather segments. If that happens, the buffer exposed | ||
191 | + * to the device needs to be a chain of descriptors at this | ||
192 | + * moment. | ||
193 | + * | ||
194 | + * SVQ cannot hold more available buffers if we are here: | ||
195 | + * queue the current guest descriptor and ignore further kicks | ||
196 | + * until some elements are used. | ||
197 | + */ | ||
198 | + svq->next_guest_avail_elem = elem; | ||
199 | + return; | ||
200 | + } | ||
201 | + | ||
202 | + ok = vhost_svq_add(svq, elem); | ||
203 | + if (unlikely(!ok)) { | ||
204 | + /* VQ is broken, just return and ignore any other kicks */ | ||
205 | + return; | ||
206 | + } | ||
207 | + vhost_svq_kick(svq); | ||
208 | + } | ||
209 | + | ||
210 | + virtio_queue_set_notification(svq->vq, true); | ||
211 | + } while (!virtio_queue_empty(svq->vq)); | ||
212 | +} | ||
213 | + | ||
214 | +/** | ||
215 | + * Handle guest's kick. | ||
216 | * | ||
217 | * @n: guest kick event notifier, the one that guest set to notify svq. | ||
218 | */ | ||
219 | -static void vhost_handle_guest_kick(EventNotifier *n) | ||
220 | +static void vhost_handle_guest_kick_notifier(EventNotifier *n) | ||
221 | { | ||
222 | VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue, svq_kick); | ||
223 | event_notifier_test_and_clear(n); | ||
224 | - event_notifier_set(&svq->hdev_kick); | ||
225 | + vhost_handle_guest_kick(svq); | ||
226 | +} | ||
227 | + | ||
228 | +static bool vhost_svq_more_used(VhostShadowVirtqueue *svq) | ||
229 | +{ | ||
230 | + if (svq->last_used_idx != svq->shadow_used_idx) { | ||
231 | + return true; | ||
232 | + } | ||
233 | + | ||
234 | + svq->shadow_used_idx = cpu_to_le16(svq->vring.used->idx); | ||
235 | + | ||
236 | + return svq->last_used_idx != svq->shadow_used_idx; | ||
237 | } | ||
238 | |||
36 | /** | 239 | /** |
37 | * Add an element to a SVQ. | 240 | - * Forward vhost notifications |
241 | + * Enable vhost device calls after disable them. | ||
242 | + * | ||
243 | + * @svq: The svq | ||
244 | + * | ||
245 | + * It returns false if there are pending used buffers from the vhost device, | ||
246 | + * avoiding the possible races between SVQ checking for more work and enabling | ||
247 | + * callbacks. True if SVQ used vring has no more pending buffers. | ||
248 | + */ | ||
249 | +static bool vhost_svq_enable_notification(VhostShadowVirtqueue *svq) | ||
250 | +{ | ||
251 | + svq->vring.avail->flags &= ~cpu_to_le16(VRING_AVAIL_F_NO_INTERRUPT); | ||
252 | + /* Make sure the flag is written before the read of used_idx */ | ||
253 | + smp_mb(); | ||
254 | + return !vhost_svq_more_used(svq); | ||
255 | +} | ||
256 | + | ||
257 | +static void vhost_svq_disable_notification(VhostShadowVirtqueue *svq) | ||
258 | +{ | ||
259 | + svq->vring.avail->flags |= cpu_to_le16(VRING_AVAIL_F_NO_INTERRUPT); | ||
260 | +} | ||
261 | + | ||
262 | +static VirtQueueElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq, | ||
263 | + uint32_t *len) | ||
264 | +{ | ||
265 | + vring_desc_t *descs = svq->vring.desc; | ||
266 | + const vring_used_t *used = svq->vring.used; | ||
267 | + vring_used_elem_t used_elem; | ||
268 | + uint16_t last_used; | ||
269 | + | ||
270 | + if (!vhost_svq_more_used(svq)) { | ||
271 | + return NULL; | ||
272 | + } | ||
273 | + | ||
274 | + /* Only get used array entries after they have been exposed by dev */ | ||
275 | + smp_rmb(); | ||
276 | + last_used = svq->last_used_idx & (svq->vring.num - 1); | ||
277 | + used_elem.id = le32_to_cpu(used->ring[last_used].id); | ||
278 | + used_elem.len = le32_to_cpu(used->ring[last_used].len); | ||
279 | + | ||
280 | + svq->last_used_idx++; | ||
281 | + if (unlikely(used_elem.id >= svq->vring.num)) { | ||
282 | + qemu_log_mask(LOG_GUEST_ERROR, "Device %s says index %u is used", | ||
283 | + svq->vdev->name, used_elem.id); | ||
284 | + return NULL; | ||
285 | + } | ||
286 | + | ||
287 | + if (unlikely(!svq->ring_id_maps[used_elem.id])) { | ||
288 | + qemu_log_mask(LOG_GUEST_ERROR, | ||
289 | + "Device %s says index %u is used, but it was not available", | ||
290 | + svq->vdev->name, used_elem.id); | ||
291 | + return NULL; | ||
292 | + } | ||
293 | + | ||
294 | + descs[used_elem.id].next = svq->free_head; | ||
295 | + svq->free_head = used_elem.id; | ||
296 | + | ||
297 | + *len = used_elem.len; | ||
298 | + return g_steal_pointer(&svq->ring_id_maps[used_elem.id]); | ||
299 | +} | ||
300 | + | ||
301 | +static void vhost_svq_flush(VhostShadowVirtqueue *svq, | ||
302 | + bool check_for_avail_queue) | ||
303 | +{ | ||
304 | + VirtQueue *vq = svq->vq; | ||
305 | + | ||
306 | + /* Forward as many used buffers as possible. */ | ||
307 | + do { | ||
308 | + unsigned i = 0; | ||
309 | + | ||
310 | + vhost_svq_disable_notification(svq); | ||
311 | + while (true) { | ||
312 | + uint32_t len; | ||
313 | + g_autofree VirtQueueElement *elem = vhost_svq_get_buf(svq, &len); | ||
314 | + if (!elem) { | ||
315 | + break; | ||
316 | + } | ||
317 | + | ||
318 | + if (unlikely(i >= svq->vring.num)) { | ||
319 | + qemu_log_mask(LOG_GUEST_ERROR, | ||
320 | + "More than %u used buffers obtained in a %u size SVQ", | ||
321 | + i, svq->vring.num); | ||
322 | + virtqueue_fill(vq, elem, len, i); | ||
323 | + virtqueue_flush(vq, i); | ||
324 | + return; | ||
325 | + } | ||
326 | + virtqueue_fill(vq, elem, len, i++); | ||
327 | + } | ||
328 | + | ||
329 | + virtqueue_flush(vq, i); | ||
330 | + event_notifier_set(&svq->svq_call); | ||
331 | + | ||
332 | + if (check_for_avail_queue && svq->next_guest_avail_elem) { | ||
333 | + /* | ||
334 | + * Avail ring was full when vhost_svq_flush was called, so it's a | ||
335 | + * good moment to make more descriptors available if possible. | ||
336 | + */ | ||
337 | + vhost_handle_guest_kick(svq); | ||
338 | + } | ||
339 | + } while (!vhost_svq_enable_notification(svq)); | ||
340 | +} | ||
341 | + | ||
342 | +/** | ||
343 | + * Forward used buffers. | ||
38 | * | 344 | * |
39 | @@ -XXX,XX +XXX,XX @@ static bool vhost_svq_add(VhostShadowVirtqueue *svq, VirtQueueElement *elem) | 345 | * @n: hdev call event notifier, the one that device set to notify svq. |
40 | return true; | 346 | + * |
347 | + * Note that we are not making any buffers available in the loop, there is no | ||
348 | + * way that it runs more than virtqueue size times. | ||
349 | */ | ||
350 | static void vhost_svq_handle_call(EventNotifier *n) | ||
351 | { | ||
352 | VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue, | ||
353 | hdev_call); | ||
354 | event_notifier_test_and_clear(n); | ||
355 | - event_notifier_set(&svq->svq_call); | ||
356 | + vhost_svq_flush(svq, true); | ||
41 | } | 357 | } |
42 | 358 | ||
43 | -static void vhost_svq_kick(VhostShadowVirtqueue *svq) | ||
44 | -{ | ||
45 | - /* | ||
46 | - * We need to expose the available array entries before checking the used | ||
47 | - * flags | ||
48 | - */ | ||
49 | - smp_mb(); | ||
50 | - if (svq->vring.used->flags & VRING_USED_F_NO_NOTIFY) { | ||
51 | - return; | ||
52 | - } | ||
53 | - | ||
54 | - event_notifier_set(&svq->hdev_kick); | ||
55 | -} | ||
56 | - | ||
57 | /** | 359 | /** |
58 | * Forward available buffers. | 360 | @@ -XXX,XX +XXX,XX @@ void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd) |
59 | * | 361 | if (poll_start) { |
362 | event_notifier_init_fd(svq_kick, svq_kick_fd); | ||
363 | event_notifier_set(svq_kick); | ||
364 | - event_notifier_set_handler(svq_kick, vhost_handle_guest_kick); | ||
365 | + event_notifier_set_handler(svq_kick, vhost_handle_guest_kick_notifier); | ||
366 | + } | ||
367 | +} | ||
368 | + | ||
369 | +/** | ||
370 | + * Start the shadow virtqueue operation. | ||
371 | + * | ||
372 | + * @svq: Shadow Virtqueue | ||
373 | + * @vdev: VirtIO device | ||
374 | + * @vq: Virtqueue to shadow | ||
375 | + */ | ||
376 | +void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev, | ||
377 | + VirtQueue *vq) | ||
378 | +{ | ||
379 | + size_t desc_size, driver_size, device_size; | ||
380 | + | ||
381 | + svq->next_guest_avail_elem = NULL; | ||
382 | + svq->shadow_avail_idx = 0; | ||
383 | + svq->shadow_used_idx = 0; | ||
384 | + svq->last_used_idx = 0; | ||
385 | + svq->vdev = vdev; | ||
386 | + svq->vq = vq; | ||
387 | + | ||
388 | + svq->vring.num = virtio_queue_get_num(vdev, virtio_get_queue_index(vq)); | ||
389 | + driver_size = vhost_svq_driver_area_size(svq); | ||
390 | + device_size = vhost_svq_device_area_size(svq); | ||
391 | + svq->vring.desc = qemu_memalign(qemu_real_host_page_size, driver_size); | ||
392 | + desc_size = sizeof(vring_desc_t) * svq->vring.num; | ||
393 | + svq->vring.avail = (void *)((char *)svq->vring.desc + desc_size); | ||
394 | + memset(svq->vring.desc, 0, driver_size); | ||
395 | + svq->vring.used = qemu_memalign(qemu_real_host_page_size, device_size); | ||
396 | + memset(svq->vring.used, 0, device_size); | ||
397 | + svq->ring_id_maps = g_new0(VirtQueueElement *, svq->vring.num); | ||
398 | + for (unsigned i = 0; i < svq->vring.num - 1; i++) { | ||
399 | + svq->vring.desc[i].next = cpu_to_le16(i + 1); | ||
400 | } | ||
401 | } | ||
402 | |||
403 | @@ -XXX,XX +XXX,XX @@ void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd) | ||
404 | void vhost_svq_stop(VhostShadowVirtqueue *svq) | ||
405 | { | ||
406 | event_notifier_set_handler(&svq->svq_kick, NULL); | ||
407 | + g_autofree VirtQueueElement *next_avail_elem = NULL; | ||
408 | + | ||
409 | + if (!svq->vq) { | ||
410 | + return; | ||
411 | + } | ||
412 | + | ||
413 | + /* Send all pending used descriptors to guest */ | ||
414 | + vhost_svq_flush(svq, false); | ||
415 | + | ||
416 | + for (unsigned i = 0; i < svq->vring.num; ++i) { | ||
417 | + g_autofree VirtQueueElement *elem = NULL; | ||
418 | + elem = g_steal_pointer(&svq->ring_id_maps[i]); | ||
419 | + if (elem) { | ||
420 | + virtqueue_detach_element(svq->vq, elem, 0); | ||
421 | + } | ||
422 | + } | ||
423 | + | ||
424 | + next_avail_elem = g_steal_pointer(&svq->next_guest_avail_elem); | ||
425 | + if (next_avail_elem) { | ||
426 | + virtqueue_detach_element(svq->vq, next_avail_elem, 0); | ||
427 | + } | ||
428 | + svq->vq = NULL; | ||
429 | + g_free(svq->ring_id_maps); | ||
430 | + qemu_vfree(svq->vring.desc); | ||
431 | + qemu_vfree(svq->vring.used); | ||
432 | } | ||
433 | |||
434 | /** | ||
435 | diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h | ||
436 | index XXXXXXX..XXXXXXX 100644 | ||
437 | --- a/hw/virtio/vhost-shadow-virtqueue.h | ||
438 | +++ b/hw/virtio/vhost-shadow-virtqueue.h | ||
439 | @@ -XXX,XX +XXX,XX @@ typedef struct VhostShadowVirtqueue { | ||
440 | |||
441 | /* Guest's call notifier, where the SVQ calls guest. */ | ||
442 | EventNotifier svq_call; | ||
443 | + | ||
444 | + /* Virtio queue shadowing */ | ||
445 | + VirtQueue *vq; | ||
446 | + | ||
447 | + /* Virtio device */ | ||
448 | + VirtIODevice *vdev; | ||
449 | + | ||
450 | + /* Map for use the guest's descriptors */ | ||
451 | + VirtQueueElement **ring_id_maps; | ||
452 | + | ||
453 | + /* Next VirtQueue element that guest made available */ | ||
454 | + VirtQueueElement *next_guest_avail_elem; | ||
455 | + | ||
456 | + /* Next head to expose to the device */ | ||
457 | + uint16_t shadow_avail_idx; | ||
458 | + | ||
459 | + /* Next free descriptor */ | ||
460 | + uint16_t free_head; | ||
461 | + | ||
462 | + /* Last seen used idx */ | ||
463 | + uint16_t shadow_used_idx; | ||
464 | + | ||
465 | + /* Next head to consume from the device */ | ||
466 | + uint16_t last_used_idx; | ||
467 | } VhostShadowVirtqueue; | ||
468 | |||
469 | bool vhost_svq_valid_features(uint64_t features, Error **errp); | ||
470 | @@ -XXX,XX +XXX,XX @@ void vhost_svq_get_vring_addr(const VhostShadowVirtqueue *svq, | ||
471 | size_t vhost_svq_driver_area_size(const VhostShadowVirtqueue *svq); | ||
472 | size_t vhost_svq_device_area_size(const VhostShadowVirtqueue *svq); | ||
473 | |||
474 | +void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev, | ||
475 | + VirtQueue *vq); | ||
476 | void vhost_svq_stop(VhostShadowVirtqueue *svq); | ||
477 | |||
478 | VhostShadowVirtqueue *vhost_svq_new(void); | ||
479 | diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c | ||
480 | index XXXXXXX..XXXXXXX 100644 | ||
481 | --- a/hw/virtio/vhost-vdpa.c | ||
482 | +++ b/hw/virtio/vhost-vdpa.c | ||
483 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_dev_addr(struct vhost_dev *dev, | ||
484 | * Note that this function does not rewind kick file descriptor if cannot set | ||
485 | * call one. | ||
486 | */ | ||
487 | -static bool vhost_vdpa_svq_setup(struct vhost_dev *dev, | ||
488 | - VhostShadowVirtqueue *svq, unsigned idx, | ||
489 | - Error **errp) | ||
490 | +static int vhost_vdpa_svq_set_fds(struct vhost_dev *dev, | ||
491 | + VhostShadowVirtqueue *svq, unsigned idx, | ||
492 | + Error **errp) | ||
493 | { | ||
494 | struct vhost_vring_file file = { | ||
495 | .index = dev->vq_index + idx, | ||
496 | @@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_svq_setup(struct vhost_dev *dev, | ||
497 | r = vhost_vdpa_set_vring_dev_kick(dev, &file); | ||
498 | if (unlikely(r != 0)) { | ||
499 | error_setg_errno(errp, -r, "Can't set device kick fd"); | ||
500 | - return false; | ||
501 | + return r; | ||
502 | } | ||
503 | |||
504 | event_notifier = &svq->hdev_call; | ||
505 | @@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_svq_setup(struct vhost_dev *dev, | ||
506 | error_setg_errno(errp, -r, "Can't set device call fd"); | ||
507 | } | ||
508 | |||
509 | + return r; | ||
510 | +} | ||
511 | + | ||
512 | +/** | ||
513 | + * Unmap a SVQ area in the device | ||
514 | + */ | ||
515 | +static bool vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v, hwaddr iova, | ||
516 | + hwaddr size) | ||
517 | +{ | ||
518 | + int r; | ||
519 | + | ||
520 | + size = ROUND_UP(size, qemu_real_host_page_size); | ||
521 | + r = vhost_vdpa_dma_unmap(v, iova, size); | ||
522 | + return r == 0; | ||
523 | +} | ||
524 | + | ||
525 | +static bool vhost_vdpa_svq_unmap_rings(struct vhost_dev *dev, | ||
526 | + const VhostShadowVirtqueue *svq) | ||
527 | +{ | ||
528 | + struct vhost_vdpa *v = dev->opaque; | ||
529 | + struct vhost_vring_addr svq_addr; | ||
530 | + size_t device_size = vhost_svq_device_area_size(svq); | ||
531 | + size_t driver_size = vhost_svq_driver_area_size(svq); | ||
532 | + bool ok; | ||
533 | + | ||
534 | + vhost_svq_get_vring_addr(svq, &svq_addr); | ||
535 | + | ||
536 | + ok = vhost_vdpa_svq_unmap_ring(v, svq_addr.desc_user_addr, driver_size); | ||
537 | + if (unlikely(!ok)) { | ||
538 | + return false; | ||
539 | + } | ||
540 | + | ||
541 | + return vhost_vdpa_svq_unmap_ring(v, svq_addr.used_user_addr, device_size); | ||
542 | +} | ||
543 | + | ||
544 | +/** | ||
545 | + * Map the shadow virtqueue rings in the device | ||
546 | + * | ||
547 | + * @dev: The vhost device | ||
548 | + * @svq: The shadow virtqueue | ||
549 | + * @addr: Assigned IOVA addresses | ||
550 | + * @errp: Error pointer | ||
551 | + */ | ||
552 | +static bool vhost_vdpa_svq_map_rings(struct vhost_dev *dev, | ||
553 | + const VhostShadowVirtqueue *svq, | ||
554 | + struct vhost_vring_addr *addr, | ||
555 | + Error **errp) | ||
556 | +{ | ||
557 | + struct vhost_vdpa *v = dev->opaque; | ||
558 | + size_t device_size = vhost_svq_device_area_size(svq); | ||
559 | + size_t driver_size = vhost_svq_driver_area_size(svq); | ||
560 | + int r; | ||
561 | + | ||
562 | + ERRP_GUARD(); | ||
563 | + vhost_svq_get_vring_addr(svq, addr); | ||
564 | + | ||
565 | + r = vhost_vdpa_dma_map(v, addr->desc_user_addr, driver_size, | ||
566 | + (void *)(uintptr_t)addr->desc_user_addr, true); | ||
567 | + if (unlikely(r != 0)) { | ||
568 | + error_setg_errno(errp, -r, "Cannot create vq driver region: "); | ||
569 | + return false; | ||
570 | + } | ||
571 | + | ||
572 | + r = vhost_vdpa_dma_map(v, addr->used_user_addr, device_size, | ||
573 | + (void *)(intptr_t)addr->used_user_addr, false); | ||
574 | + if (unlikely(r != 0)) { | ||
575 | + error_setg_errno(errp, -r, "Cannot create vq device region: "); | ||
576 | + } | ||
577 | + | ||
578 | + return r == 0; | ||
579 | +} | ||
580 | + | ||
581 | +static bool vhost_vdpa_svq_setup(struct vhost_dev *dev, | ||
582 | + VhostShadowVirtqueue *svq, unsigned idx, | ||
583 | + Error **errp) | ||
584 | +{ | ||
585 | + uint16_t vq_index = dev->vq_index + idx; | ||
586 | + struct vhost_vring_state s = { | ||
587 | + .index = vq_index, | ||
588 | + }; | ||
589 | + int r; | ||
590 | + | ||
591 | + r = vhost_vdpa_set_dev_vring_base(dev, &s); | ||
592 | + if (unlikely(r)) { | ||
593 | + error_setg_errno(errp, -r, "Cannot set vring base"); | ||
594 | + return false; | ||
595 | + } | ||
596 | + | ||
597 | + r = vhost_vdpa_svq_set_fds(dev, svq, idx, errp); | ||
598 | return r == 0; | ||
599 | } | ||
600 | |||
601 | @@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_svqs_start(struct vhost_dev *dev) | ||
602 | } | ||
603 | |||
604 | for (i = 0; i < v->shadow_vqs->len; ++i) { | ||
605 | + VirtQueue *vq = virtio_get_queue(dev->vdev, dev->vq_index + i); | ||
606 | VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i); | ||
607 | + struct vhost_vring_addr addr = { | ||
608 | + .index = i, | ||
609 | + }; | ||
610 | + int r; | ||
611 | bool ok = vhost_vdpa_svq_setup(dev, svq, i, &err); | ||
612 | if (unlikely(!ok)) { | ||
613 | - error_reportf_err(err, "Cannot setup SVQ %u: ", i); | ||
614 | + goto err; | ||
615 | + } | ||
616 | + | ||
617 | + vhost_svq_start(svq, dev->vdev, vq); | ||
618 | + ok = vhost_vdpa_svq_map_rings(dev, svq, &addr, &err); | ||
619 | + if (unlikely(!ok)) { | ||
620 | + goto err_map; | ||
621 | + } | ||
622 | + | ||
623 | + /* Override vring GPA set by vhost subsystem */ | ||
624 | + r = vhost_vdpa_set_vring_dev_addr(dev, &addr); | ||
625 | + if (unlikely(r != 0)) { | ||
626 | + error_setg_errno(&err, -r, "Cannot set device address"); | ||
627 | + goto err_set_addr; | ||
628 | + } | ||
629 | + } | ||
630 | + | ||
631 | + return true; | ||
632 | + | ||
633 | +err_set_addr: | ||
634 | + vhost_vdpa_svq_unmap_rings(dev, g_ptr_array_index(v->shadow_vqs, i)); | ||
635 | + | ||
636 | +err_map: | ||
637 | + vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, i)); | ||
638 | + | ||
639 | +err: | ||
640 | + error_reportf_err(err, "Cannot setup SVQ %u: ", i); | ||
641 | + for (unsigned j = 0; j < i; ++j) { | ||
642 | + VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, j); | ||
643 | + vhost_vdpa_svq_unmap_rings(dev, svq); | ||
644 | + vhost_svq_stop(svq); | ||
645 | + } | ||
646 | + | ||
647 | + return false; | ||
648 | +} | ||
649 | + | ||
650 | +static bool vhost_vdpa_svqs_stop(struct vhost_dev *dev) | ||
651 | +{ | ||
652 | + struct vhost_vdpa *v = dev->opaque; | ||
653 | + | ||
654 | + if (!v->shadow_vqs) { | ||
655 | + return true; | ||
656 | + } | ||
657 | + | ||
658 | + for (unsigned i = 0; i < v->shadow_vqs->len; ++i) { | ||
659 | + VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i); | ||
660 | + bool ok = vhost_vdpa_svq_unmap_rings(dev, svq); | ||
661 | + if (unlikely(!ok)) { | ||
662 | return false; | ||
663 | } | ||
664 | } | ||
665 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started) | ||
666 | } | ||
667 | vhost_vdpa_set_vring_ready(dev); | ||
668 | } else { | ||
669 | + ok = vhost_vdpa_svqs_stop(dev); | ||
670 | + if (unlikely(!ok)) { | ||
671 | + return -1; | ||
672 | + } | ||
673 | vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs); | ||
674 | } | ||
675 | |||
60 | -- | 676 | -- |
61 | 2.7.4 | 677 | 2.7.4 |
62 | 678 | ||
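
The core of the forwarding path above is the classic split-ring publish sequence: fill the descriptor, link its head into the avail ring, and only then bump avail->idx behind a write barrier, so the device can never observe an index that points at unwritten descriptors. A reduced standalone model of that sequence, with a C11 release fence standing in for smp_wmb() and endianness handling omitted:

    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>

    #define RING_NUM 4   /* power of two, as the SVQ requires */

    struct vring_desc  { uint64_t addr; uint32_t len; uint16_t flags, next; };
    struct vring_avail { uint16_t flags, idx, ring[RING_NUM]; };

    struct ring {
        struct vring_desc *desc;
        struct vring_avail *avail;
        uint16_t shadow_avail_idx;   /* next head to expose to the device */
        uint16_t free_head;          /* head of the free descriptor list */
    };

    static void ring_publish(struct ring *r, uint64_t addr, uint32_t len)
    {
        uint16_t head = r->free_head;

        r->desc[head].addr  = addr;
        r->desc[head].len   = len;
        r->desc[head].flags = 0;               /* one device-readable descriptor */
        r->free_head = r->desc[head].next;     /* pop it off the free list */

        r->avail->ring[r->shadow_avail_idx & (RING_NUM - 1)] = head;
        r->shadow_avail_idx++;

        /* Expose the descriptor writes before the index update (~ smp_wmb()) */
        atomic_thread_fence(memory_order_release);
        r->avail->idx = r->shadow_avail_idx;
    }

    int main(void)
    {
        /* Free list chained 0 -> 1 -> 2 -> 3 via the .next fields. */
        struct vring_desc desc[RING_NUM] = { {0,0,0,1}, {0,0,0,2}, {0,0,0,3}, {0,0,0,0} };
        struct vring_avail avail = {0};
        struct ring r = { desc, &avail, 0, 0 };

        ring_publish(&r, 0x1000, 64);
        printf("avail.idx=%u head=%u\n", avail.idx, avail.ring[0]); /* 1 0 */
        return 0;
    }
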
1 | From: Eugenio Pérez <eperezma@redhat.com> | 1 | From: Eugenio Pérez <eperezma@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | Knowing the device features is needed for CVQ SVQ, so SVQ knows if it | 3 | This iova tree function looks for a hole in the allocated regions and |
4 | can handle all commands or not. Extract it from | 4 | returns a totally new translation for a given translated |
5 | vhost_vdpa_get_max_queue_pairs so we can reuse it. | 5 | address. |
6 | |||
7 | Its usage is mainly to allow devices to access the qemu address space, |
8 | remapping the guest's one into a new iova space where qemu can add chunks of |
9 | addresses. | ||
6 | 10 | ||
7 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> | 11 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> |
8 | Acked-by: Jason Wang <jasowang@redhat.com> | 12 | Reviewed-by: Peter Xu <peterx@redhat.com> |
9 | Reviewed-by: Michael S. Tsirkin <mst@redhat.com> | 13 | Acked-by: Michael S. Tsirkin <mst@redhat.com> |
10 | Signed-off-by: Jason Wang <jasowang@redhat.com> | 14 | Signed-off-by: Jason Wang <jasowang@redhat.com> |
11 | --- | 15 | --- |
12 | net/vhost-vdpa.c | 30 ++++++++++++++++++++---------- | 16 | include/qemu/iova-tree.h | 18 +++++++ |
13 | 1 file changed, 20 insertions(+), 10 deletions(-) | 17 | util/iova-tree.c | 136 +++++++++++++++++++++++++++++++++++++++++++++++ |
14 | 18 | 2 files changed, 154 insertions(+) | |
15 | diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c | 19 | |
20 | diff --git a/include/qemu/iova-tree.h b/include/qemu/iova-tree.h | ||
16 | index XXXXXXX..XXXXXXX 100644 | 21 | index XXXXXXX..XXXXXXX 100644 |
17 | --- a/net/vhost-vdpa.c | 22 | --- a/include/qemu/iova-tree.h |
18 | +++ b/net/vhost-vdpa.c | 23 | +++ b/include/qemu/iova-tree.h |
19 | @@ -XXX,XX +XXX,XX @@ static NetClientState *net_vhost_vdpa_init(NetClientState *peer, | 24 | @@ -XXX,XX +XXX,XX @@ |
20 | return nc; | 25 | #define IOVA_OK (0) |
26 | #define IOVA_ERR_INVALID (-1) /* Invalid parameters */ | ||
27 | #define IOVA_ERR_OVERLAP (-2) /* IOVA range overlapped */ | ||
28 | +#define IOVA_ERR_NOMEM (-3) /* Cannot allocate */ | ||
29 | |||
30 | typedef struct IOVATree IOVATree; | ||
31 | typedef struct DMAMap { | ||
32 | @@ -XXX,XX +XXX,XX @@ const DMAMap *iova_tree_find_address(const IOVATree *tree, hwaddr iova); | ||
33 | void iova_tree_foreach(IOVATree *tree, iova_tree_iterator iterator); | ||
34 | |||
35 | /** | ||
36 | + * iova_tree_alloc_map: | ||
37 | + * | ||
38 | + * @tree: the iova tree to allocate from | ||
39 | + * @map: the new map (as translated addr & size) to allocate in the iova region | ||
40 | + * @iova_begin: the minimum address of the allocation | ||
41 | + * @iova_end: the maximum addressable direction of the allocation | ||
42 | + * | ||
43 | + * Allocates a new region of a given size, between iova_min and iova_max. | ||
44 | + * | ||
45 | + * Return: Same as iova_tree_insert, but cannot overlap and can return error if | ||
46 | + * iova tree is out of free contiguous range. The caller gets the assigned iova | ||
47 | + * in map->iova. | ||
48 | + */ | ||
49 | +int iova_tree_alloc_map(IOVATree *tree, DMAMap *map, hwaddr iova_begin, | ||
50 | + hwaddr iova_end); | ||
51 | + | ||
52 | +/** | ||
53 | * iova_tree_destroy: | ||
54 | * | ||
55 | * @tree: the iova tree to destroy | ||
56 | diff --git a/util/iova-tree.c b/util/iova-tree.c | ||
57 | index XXXXXXX..XXXXXXX 100644 | ||
58 | --- a/util/iova-tree.c | ||
59 | +++ b/util/iova-tree.c | ||
60 | @@ -XXX,XX +XXX,XX @@ struct IOVATree { | ||
61 | GTree *tree; | ||
62 | }; | ||
63 | |||
64 | +/* Args to pass to iova_tree_alloc foreach function. */ | ||
65 | +struct IOVATreeAllocArgs { | ||
66 | + /* Size of the desired allocation */ | ||
67 | + size_t new_size; | ||
68 | + | ||
69 | + /* The minimum address allowed in the allocation */ | ||
70 | + hwaddr iova_begin; | ||
71 | + | ||
72 | + /* Map at the left of the hole, can be NULL if "this" is first one */ | ||
73 | + const DMAMap *prev; | ||
74 | + | ||
75 | + /* Map at the right of the hole, can be NULL if "prev" is the last one */ | ||
76 | + const DMAMap *this; | ||
77 | + | ||
78 | + /* If found, we fill in the IOVA here */ | ||
79 | + hwaddr iova_result; | ||
80 | + | ||
81 | + /* Whether have we found a valid IOVA */ | ||
82 | + bool iova_found; | ||
83 | +}; | ||
84 | + | ||
85 | +/** | ||
86 | + * Iterate args to the next hole | ||
87 | + * | ||
88 | + * @args: The alloc arguments | ||
89 | + * @next: The next mapping in the tree. Can be NULL to signal the last one | ||
90 | + */ | ||
91 | +static void iova_tree_alloc_args_iterate(struct IOVATreeAllocArgs *args, | ||
92 | + const DMAMap *next) | ||
93 | +{ | ||
94 | + args->prev = args->this; | ||
95 | + args->this = next; | ||
96 | +} | ||
97 | + | ||
98 | static int iova_tree_compare(gconstpointer a, gconstpointer b, gpointer data) | ||
99 | { | ||
100 | const DMAMap *m1 = a, *m2 = b; | ||
101 | @@ -XXX,XX +XXX,XX @@ int iova_tree_remove(IOVATree *tree, const DMAMap *map) | ||
102 | return IOVA_OK; | ||
21 | } | 103 | } |
22 | 104 | ||
23 | -static int vhost_vdpa_get_max_queue_pairs(int fd, int *has_cvq, Error **errp) | 105 | +/** |
24 | +static int vhost_vdpa_get_features(int fd, uint64_t *features, Error **errp) | 106 | + * Try to find an unallocated IOVA range between prev and this elements. |
25 | +{ | 107 | + * |
26 | + int ret = ioctl(fd, VHOST_GET_FEATURES, features); | 108 | + * @args: Arguments to allocation |
27 | + if (unlikely(ret < 0)) { | 109 | + * |
28 | + error_setg_errno(errp, errno, | 110 | + * Cases: |
29 | + "Fail to query features from vhost-vDPA device"); | 111 | + * |
30 | + } | 112 | + * (1) !prev, !this: No entries allocated, always succeed |
31 | + return ret; | 113 | + * |
32 | +} | 114 | + * (2) !prev, this: We're iterating at the 1st element. |
33 | + | 115 | + * |
34 | +static int vhost_vdpa_get_max_queue_pairs(int fd, uint64_t features, | 116 | + * (3) prev, !this: We're iterating at the last element. |
35 | + int *has_cvq, Error **errp) | 117 | + * |
118 | + * (4) prev, this: this is the most common case, we'll try to find a hole | ||
119 | + * between "prev" and "this" mapping. | ||
120 | + * | ||
121 | + * Note that this function assumes the last valid iova is HWADDR_MAX, but it | ||
122 | + * searches linearly so it's easy to discard the result if it's not the case. | ||
123 | + */ | ||
124 | +static void iova_tree_alloc_map_in_hole(struct IOVATreeAllocArgs *args) | ||
125 | +{ | ||
126 | + const DMAMap *prev = args->prev, *this = args->this; | ||
127 | + uint64_t hole_start, hole_last; | ||
128 | + | ||
129 | + if (this && this->iova + this->size < args->iova_begin) { | ||
130 | + return; | ||
131 | + } | ||
132 | + | ||
133 | + hole_start = MAX(prev ? prev->iova + prev->size + 1 : 0, args->iova_begin); | ||
134 | + hole_last = this ? this->iova : HWADDR_MAX; | ||
135 | + | ||
136 | + if (hole_last - hole_start > args->new_size) { | ||
137 | + args->iova_result = hole_start; | ||
138 | + args->iova_found = true; | ||
139 | + } | ||
140 | +} | ||
141 | + | ||
142 | +/** | ||
143 | + * Foreach dma node in the tree, compare if there is a hole with its previous | ||
144 | + * node (or minimum iova address allowed) and the node. | ||
145 | + * | ||
146 | + * @key: Node iterating | ||
147 | + * @value: Node iterating | ||
148 | + * @pargs: Struct to communicate with the outside world | ||
149 | + * | ||
150 | + * Return: false to keep iterating, true if needs break. | ||
151 | + */ | ||
152 | +static gboolean iova_tree_alloc_traverse(gpointer key, gpointer value, | ||
153 | + gpointer pargs) | ||
154 | +{ | ||
155 | + struct IOVATreeAllocArgs *args = pargs; | ||
156 | + DMAMap *node = value; | ||
157 | + | ||
158 | + assert(key == value); | ||
159 | + | ||
160 | + iova_tree_alloc_args_iterate(args, node); | ||
161 | + iova_tree_alloc_map_in_hole(args); | ||
162 | + return args->iova_found; | ||
163 | +} | ||
164 | + | ||
165 | +int iova_tree_alloc_map(IOVATree *tree, DMAMap *map, hwaddr iova_begin, | ||
166 | + hwaddr iova_last) | ||
167 | +{ | ||
168 | + struct IOVATreeAllocArgs args = { | ||
169 | + .new_size = map->size, | ||
170 | + .iova_begin = iova_begin, | ||
171 | + }; | ||
172 | + | ||
173 | + if (unlikely(iova_last < iova_begin)) { | ||
174 | + return IOVA_ERR_INVALID; | ||
175 | + } | ||
176 | + | ||
177 | + /* | ||
178 | + * Find a valid hole for the mapping | ||
179 | + * | ||
180 | + * Assuming low iova_begin, so no need to do a binary search to | ||
181 | + * locate the first node. | ||
182 | + * | ||
183 | + * TODO: Replace all this with g_tree_node_first/next/last when available | ||
184 | + * (from glib since 2.68). To do it with g_tree_foreach complicates the | ||
185 | + * code a lot. | ||
186 | + * | ||
187 | + */ | ||
188 | + g_tree_foreach(tree->tree, iova_tree_alloc_traverse, &args); | ||
189 | + if (!args.iova_found) { | ||
190 | + /* | ||
191 | + * Either tree is empty or the last hole is still not checked. | ||
192 | + * g_tree_foreach does not compare (last, iova_last] range, so we check | ||
193 | + * it here. | ||
194 | + */ | ||
195 | + iova_tree_alloc_args_iterate(&args, NULL); | ||
196 | + iova_tree_alloc_map_in_hole(&args); | ||
197 | + } | ||
198 | + | ||
199 | + if (!args.iova_found || args.iova_result + map->size > iova_last) { | ||
200 | + return IOVA_ERR_NOMEM; | ||
201 | + } | ||
202 | + | ||
203 | + map->iova = args.iova_result; | ||
204 | + return iova_tree_insert(tree, map); | ||
205 | +} | ||
206 | + | ||
207 | void iova_tree_destroy(IOVATree *tree) | ||
36 | { | 208 | { |
37 | unsigned long config_size = offsetof(struct vhost_vdpa_config, buf); | 209 | g_tree_destroy(tree->tree); |
38 | g_autofree struct vhost_vdpa_config *config = NULL; | ||
39 | __virtio16 *max_queue_pairs; | ||
40 | - uint64_t features; | ||
41 | int ret; | ||
42 | |||
43 | - ret = ioctl(fd, VHOST_GET_FEATURES, &features); | ||
44 | - if (ret) { | ||
45 | - error_setg(errp, "Fail to query features from vhost-vDPA device"); | ||
46 | - return ret; | ||
47 | - } | ||
48 | - | ||
49 | if (features & (1 << VIRTIO_NET_F_CTRL_VQ)) { | ||
50 | *has_cvq = 1; | ||
51 | } else { | ||
52 | @@ -XXX,XX +XXX,XX @@ int net_init_vhost_vdpa(const Netdev *netdev, const char *name, | ||
53 | NetClientState *peer, Error **errp) | ||
54 | { | ||
55 | const NetdevVhostVDPAOptions *opts; | ||
56 | + uint64_t features; | ||
57 | int vdpa_device_fd; | ||
58 | g_autofree NetClientState **ncs = NULL; | ||
59 | NetClientState *nc; | ||
60 | - int queue_pairs, i, has_cvq = 0; | ||
61 | + int queue_pairs, r, i, has_cvq = 0; | ||
62 | |||
63 | assert(netdev->type == NET_CLIENT_DRIVER_VHOST_VDPA); | ||
64 | opts = &netdev->u.vhost_vdpa; | ||
65 | @@ -XXX,XX +XXX,XX @@ int net_init_vhost_vdpa(const Netdev *netdev, const char *name, | ||
66 | return -errno; | ||
67 | } | ||
68 | |||
69 | - queue_pairs = vhost_vdpa_get_max_queue_pairs(vdpa_device_fd, | ||
70 | + r = vhost_vdpa_get_features(vdpa_device_fd, &features, errp); | ||
71 | + if (unlikely(r < 0)) { | ||
72 | + return r; | ||
73 | + } | ||
74 | + | ||
75 | + queue_pairs = vhost_vdpa_get_max_queue_pairs(vdpa_device_fd, features, | ||
76 | &has_cvq, errp); | ||
77 | if (queue_pairs < 0) { | ||
78 | qemu_close(vdpa_device_fd); | ||
79 | -- | 210 | -- |
80 | 2.7.4 | 211 | 2.7.4 |
81 | 212 | ||
82 | 213 | diff view generated by jsdifflib |
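
As a rough illustration of the first-fit hole search that iova_tree_alloc_map drives through g_tree_foreach above, here is a minimal, self-contained sketch. It assumes the mappings arrive sorted by iova, as a GTree traversal would yield them; every name and type here is hypothetical, not QEMU's:

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

typedef struct {
    uint64_t iova; /* first address covered by the mapping */
    uint64_t last; /* last address covered by the mapping (inclusive) */
} Map;

/* First fit: each mapping closes the hole before it; the tail hole
 * (last, iova_last] is checked after the walk, as in the code above. */
static bool find_hole(const Map *maps, size_t n, uint64_t size,
                      uint64_t iova_begin, uint64_t iova_last,
                      uint64_t *result)
{
    uint64_t hole_start = iova_begin;

    for (size_t i = 0; i < n; i++) {
        if (maps[i].iova > hole_start &&
            maps[i].iova - hole_start >= size) {
            *result = hole_start; /* the hole before this mapping fits */
            return true;
        }
        if (maps[i].last + 1 > hole_start) {
            hole_start = maps[i].last + 1; /* skip past this mapping */
        }
    }
    if (hole_start <= iova_last && iova_last - hole_start + 1 >= size) {
        *result = hole_start; /* the tail hole fits */
        return true;
    }
    return false;
}

The linear walk matches the patch's stated trade-off: with a low iova_begin, no binary search is needed, since the lowest nodes are visited first anyway.
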
1 | From: Eugenio Pérez <eperezma@redhat.com> | 1 | From: Eugenio Pérez <eperezma@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | Introduce the control virtqueue support for vDPA shadow virtqueue. This | 3 | This function does the reverse operation of iova_tree_find: it looks for |
4 | is needed for advanced networking features like rx filtering. | 4 | a mapping that matches a translated address, so we can map back to iova. |
5 | 5 | ||
6 | Virtio-net control VQ copies the descriptors to qemu's VA, so we avoid | 6 | This has linear complexity instead of logarithmic, but it supports |
7 | TOCTOU with the guest's or device's memory every time there is a device | 7 | overlapping HVA. Future developments could reduce this cost. |
8 | model change. Otherwise, the guest could change the memory content in | ||
9 | the window between qemu reading it and the device reading it. | ||
10 | |||
11 | To demonstrate command handling, VIRTIO_NET_F_CTRL_MACADDR is | ||
12 | implemented. If the virtio-net driver changes the MAC, the virtio-net | ||
13 | device model will be updated with the new one, and an rx filtering change | ||
14 | event will be raised. | ||
15 | |||
16 | More cvq commands could be added here straightforwardly, but they have | ||
17 | not been tested. | ||
18 | 8 | ||
19 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> | 9 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> |
20 | Reviewed-by: Michael S. Tsirkin <mst@redhat.com> | 10 | Acked-by: Michael S. Tsirkin <mst@redhat.com> |
21 | Signed-off-by: Jason Wang <jasowang@redhat.com> | 11 | Signed-off-by: Jason Wang <jasowang@redhat.com> |
22 | --- | 12 | --- |
23 | net/vhost-vdpa.c | 213 ++++++++++++++++++++++++++++++++++++++++++++++++++++--- | 13 | include/qemu/iova-tree.h | 20 +++++++++++++++++++- |
24 | 1 file changed, 205 insertions(+), 8 deletions(-) | 14 | util/iova-tree.c | 34 ++++++++++++++++++++++++++++++++++ |
15 | 2 files changed, 53 insertions(+), 1 deletion(-) | ||
25 | 16 | ||
26 | diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c | 17 | diff --git a/include/qemu/iova-tree.h b/include/qemu/iova-tree.h |
27 | index XXXXXXX..XXXXXXX 100644 | 18 | index XXXXXXX..XXXXXXX 100644 |
28 | --- a/net/vhost-vdpa.c | 19 | --- a/include/qemu/iova-tree.h |
29 | +++ b/net/vhost-vdpa.c | 20 | +++ b/include/qemu/iova-tree.h |
30 | @@ -XXX,XX +XXX,XX @@ typedef struct VhostVDPAState { | 21 | @@ -XXX,XX +XXX,XX @@ int iova_tree_remove(IOVATree *tree, const DMAMap *map); |
31 | NetClientState nc; | 22 | * @tree: the iova tree to search from |
32 | struct vhost_vdpa vhost_vdpa; | 23 | * @map: the mapping to search |
33 | VHostNetState *vhost_net; | 24 | * |
25 | - * Search for a mapping in the iova tree that overlaps with the | ||
26 | + * Search for a mapping in the iova tree that iova overlaps with the | ||
27 | * mapping range specified. Only the first found mapping will be | ||
28 | * returned. | ||
29 | * | ||
30 | @@ -XXX,XX +XXX,XX @@ int iova_tree_remove(IOVATree *tree, const DMAMap *map); | ||
31 | const DMAMap *iova_tree_find(const IOVATree *tree, const DMAMap *map); | ||
32 | |||
33 | /** | ||
34 | + * iova_tree_find_iova: | ||
35 | + * | ||
36 | + * @tree: the iova tree to search from | ||
37 | + * @map: the mapping to search | ||
38 | + * | ||
39 | + * Search for a mapping in the iova tree that translated_addr overlaps with the | ||
40 | + * mapping range specified. Only the first found mapping will be | ||
41 | + * returned. | ||
42 | + * | ||
43 | + * Return: DMAMap pointer if found, or NULL if not found. Note that | ||
44 | + * the returned DMAMap pointer is maintained internally. User should | ||
45 | + * only read the content but never modify or free the content. Also, | ||
46 | + * user is responsible to make sure the pointer is valid (say, no | ||
47 | + * concurrent deletion in progress). | ||
48 | + */ | ||
49 | +const DMAMap *iova_tree_find_iova(const IOVATree *tree, const DMAMap *map); | ||
34 | + | 50 | + |
35 | + /* Control commands shadow buffers */ | 51 | +/** |
36 | + void *cvq_cmd_out_buffer, *cvq_cmd_in_buffer; | 52 | * iova_tree_find_address: |
37 | bool started; | 53 | * |
38 | } VhostVDPAState; | 54 | * @tree: the iova tree to search from |
39 | 55 | diff --git a/util/iova-tree.c b/util/iova-tree.c | |
40 | @@ -XXX,XX +XXX,XX @@ static void vhost_vdpa_cleanup(NetClientState *nc) | 56 | index XXXXXXX..XXXXXXX 100644 |
41 | { | 57 | --- a/util/iova-tree.c |
42 | VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc); | 58 | +++ b/util/iova-tree.c |
43 | 59 | @@ -XXX,XX +XXX,XX @@ struct IOVATreeAllocArgs { | |
44 | + qemu_vfree(s->cvq_cmd_out_buffer); | 60 | bool iova_found; |
45 | + qemu_vfree(s->cvq_cmd_in_buffer); | ||
46 | if (s->vhost_net) { | ||
47 | vhost_net_cleanup(s->vhost_net); | ||
48 | g_free(s->vhost_net); | ||
49 | @@ -XXX,XX +XXX,XX @@ static NetClientInfo net_vhost_vdpa_info = { | ||
50 | .check_peer_type = vhost_vdpa_check_peer_type, | ||
51 | }; | 61 | }; |
52 | 62 | ||
53 | +static void vhost_vdpa_cvq_unmap_buf(struct vhost_vdpa *v, void *addr) | 63 | +typedef struct IOVATreeFindIOVAArgs { |
64 | + const DMAMap *needle; | ||
65 | + const DMAMap *result; | ||
66 | +} IOVATreeFindIOVAArgs; | ||
67 | + | ||
68 | /** | ||
69 | * Iterate args to the next hole | ||
70 | * | ||
71 | @@ -XXX,XX +XXX,XX @@ const DMAMap *iova_tree_find(const IOVATree *tree, const DMAMap *map) | ||
72 | return g_tree_lookup(tree->tree, map); | ||
73 | } | ||
74 | |||
75 | +static gboolean iova_tree_find_address_iterator(gpointer key, gpointer value, | ||
76 | + gpointer data) | ||
54 | +{ | 77 | +{ |
55 | + VhostIOVATree *tree = v->iova_tree; | 78 | + const DMAMap *map = key; |
56 | + DMAMap needle = { | 79 | + IOVATreeFindIOVAArgs *args = data; |
57 | + /* | 80 | + const DMAMap *needle; |
58 | + * No need to specify size or to look for more translations since | ||
59 | + * this contiguous chunk was allocated by us. | ||
60 | + */ | ||
61 | + .translated_addr = (hwaddr)(uintptr_t)addr, | ||
62 | + }; | ||
63 | + const DMAMap *map = vhost_iova_tree_find_iova(tree, &needle); | ||
64 | + int r; | ||
65 | + | 81 | + |
66 | + if (unlikely(!map)) { | 82 | + g_assert(key == value); |
67 | + error_report("Cannot locate expected map"); | ||
68 | + return; | ||
69 | + } | ||
70 | + | 83 | + |
71 | + r = vhost_vdpa_dma_unmap(v, map->iova, map->size + 1); | 84 | + needle = args->needle; |
72 | + if (unlikely(r != 0)) { | 85 | + if (map->translated_addr + map->size < needle->translated_addr || |
73 | + error_report("Device cannot unmap: %s(%d)", g_strerror(r), r); | 86 | + needle->translated_addr + needle->size < map->translated_addr) { |
74 | + } | ||
75 | + | ||
76 | + vhost_iova_tree_remove(tree, map); | ||
77 | +} | ||
78 | + | ||
79 | +static size_t vhost_vdpa_net_cvq_cmd_len(void) | ||
80 | +{ | ||
81 | + /* | ||
82 | + * MAC_TABLE_SET is the ctrl command that produces the longer out buffer. | ||
83 | + * In buffer is always 1 byte, so it should fit here | ||
84 | + */ | ||
85 | + return sizeof(struct virtio_net_ctrl_hdr) + | ||
86 | + 2 * sizeof(struct virtio_net_ctrl_mac) + | ||
87 | + MAC_TABLE_ENTRIES * ETH_ALEN; | ||
88 | +} | ||
89 | + | ||
90 | +static size_t vhost_vdpa_net_cvq_cmd_page_len(void) | ||
91 | +{ | ||
92 | + return ROUND_UP(vhost_vdpa_net_cvq_cmd_len(), qemu_real_host_page_size()); | ||
93 | +} | ||
94 | + | ||
95 | +/** Copy and map a guest buffer. */ | ||
96 | +static bool vhost_vdpa_cvq_map_buf(struct vhost_vdpa *v, | ||
97 | + const struct iovec *out_data, | ||
98 | + size_t out_num, size_t data_len, void *buf, | ||
99 | + size_t *written, bool write) | ||
100 | +{ | ||
101 | + DMAMap map = {}; | ||
102 | + int r; | ||
103 | + | ||
104 | + if (unlikely(!data_len)) { | ||
105 | + qemu_log_mask(LOG_GUEST_ERROR, "%s: invalid legnth of %s buffer\n", | ||
106 | + __func__, write ? "in" : "out"); | ||
107 | + return false; | 87 | + return false; |
108 | + } | 88 | + } |
109 | + | 89 | + |
110 | + *written = iov_to_buf(out_data, out_num, 0, buf, data_len); | 90 | + args->result = map; |
111 | + map.translated_addr = (hwaddr)(uintptr_t)buf; | ||
112 | + map.size = vhost_vdpa_net_cvq_cmd_page_len() - 1; | ||
113 | + map.perm = write ? IOMMU_RW : IOMMU_RO, | ||
114 | + r = vhost_iova_tree_map_alloc(v->iova_tree, &map); | ||
115 | + if (unlikely(r != IOVA_OK)) { | ||
116 | + error_report("Cannot map injected element"); | ||
117 | + return false; | ||
118 | + } | ||
119 | + | ||
120 | + r = vhost_vdpa_dma_map(v, map.iova, vhost_vdpa_net_cvq_cmd_page_len(), buf, | ||
121 | + !write); | ||
122 | + if (unlikely(r < 0)) { | ||
123 | + goto dma_map_err; | ||
124 | + } | ||
125 | + | ||
126 | + return true; | ||
127 | + | ||
128 | +dma_map_err: | ||
129 | + vhost_iova_tree_remove(v->iova_tree, &map); | ||
130 | + return false; | ||
131 | +} | ||
132 | + | ||
133 | /** | ||
134 | - * Forward buffer for the moment. | ||
135 | + * Copy the guest element into a dedicated buffer suitable to be sent to NIC | ||
136 | + * | ||
137 | + * @iov: [0] is the out buffer, [1] is the in one | ||
138 | + */ | ||
139 | +static bool vhost_vdpa_net_cvq_map_elem(VhostVDPAState *s, | ||
140 | + VirtQueueElement *elem, | ||
141 | + struct iovec *iov) | ||
142 | +{ | ||
143 | + size_t in_copied; | ||
144 | + bool ok; | ||
145 | + | ||
146 | + iov[0].iov_base = s->cvq_cmd_out_buffer; | ||
147 | + ok = vhost_vdpa_cvq_map_buf(&s->vhost_vdpa, elem->out_sg, elem->out_num, | ||
148 | + vhost_vdpa_net_cvq_cmd_len(), iov[0].iov_base, | ||
149 | + &iov[0].iov_len, false); | ||
150 | + if (unlikely(!ok)) { | ||
151 | + return false; | ||
152 | + } | ||
153 | + | ||
154 | + iov[1].iov_base = s->cvq_cmd_in_buffer; | ||
155 | + ok = vhost_vdpa_cvq_map_buf(&s->vhost_vdpa, NULL, 0, | ||
156 | + sizeof(virtio_net_ctrl_ack), iov[1].iov_base, | ||
157 | + &in_copied, true); | ||
158 | + if (unlikely(!ok)) { | ||
159 | + vhost_vdpa_cvq_unmap_buf(&s->vhost_vdpa, s->cvq_cmd_out_buffer); | ||
160 | + return false; | ||
161 | + } | ||
162 | + | ||
163 | + iov[1].iov_len = sizeof(virtio_net_ctrl_ack); | ||
164 | + return true; | 91 | + return true; |
165 | +} | 92 | +} |
166 | + | 93 | + |
167 | +/** | 94 | +const DMAMap *iova_tree_find_iova(const IOVATree *tree, const DMAMap *map) |
168 | + * Do not forward commands not supported by SVQ. Otherwise, the device could | ||
169 | + * accept it and qemu would not know how to update the device model. | ||
170 | + */ | ||
171 | +static bool vhost_vdpa_net_cvq_validate_cmd(const struct iovec *out, | ||
172 | + size_t out_num) | ||
173 | +{ | 95 | +{ |
174 | + struct virtio_net_ctrl_hdr ctrl; | 96 | + IOVATreeFindIOVAArgs args = { |
175 | + size_t n; | 97 | + .needle = map, |
176 | + | ||
177 | + n = iov_to_buf(out, out_num, 0, &ctrl, sizeof(ctrl)); | ||
178 | + if (unlikely(n < sizeof(ctrl))) { | ||
179 | + qemu_log_mask(LOG_GUEST_ERROR, | ||
180 | + "%s: invalid legnth of out buffer %zu\n", __func__, n); | ||
181 | + return false; | ||
182 | + } | ||
183 | + | ||
184 | + switch (ctrl.class) { | ||
185 | + case VIRTIO_NET_CTRL_MAC: | ||
186 | + switch (ctrl.cmd) { | ||
187 | + case VIRTIO_NET_CTRL_MAC_ADDR_SET: | ||
188 | + return true; | ||
189 | + default: | ||
190 | + qemu_log_mask(LOG_GUEST_ERROR, "%s: invalid mac cmd %u\n", | ||
191 | + __func__, ctrl.cmd); | ||
192 | + }; | ||
193 | + break; | ||
194 | + default: | ||
195 | + qemu_log_mask(LOG_GUEST_ERROR, "%s: invalid control class %u\n", | ||
196 | + __func__, ctrl.class); | ||
197 | + }; | 98 | + }; |
198 | + | 99 | + |
199 | + return false; | 100 | + g_tree_foreach(tree->tree, iova_tree_find_address_iterator, &args); |
101 | + return args.result; | ||
200 | +} | 102 | +} |
201 | + | 103 | + |
202 | +/** | 104 | const DMAMap *iova_tree_find_address(const IOVATree *tree, hwaddr iova) |
203 | + * Validate and copy control virtqueue commands. | ||
204 | + * | ||
205 | + * Following QEMU guidelines, we offer a copy of the buffers to the device to | ||
206 | + * prevent TOCTOU bugs. | ||
207 | */ | ||
208 | static int vhost_vdpa_net_handle_ctrl_avail(VhostShadowVirtqueue *svq, | ||
209 | VirtQueueElement *elem, | ||
210 | void *opaque) | ||
211 | { | 105 | { |
212 | - unsigned int n = elem->out_num + elem->in_num; | 106 | const DMAMap map = { .iova = iova, .size = 0 }; |
213 | - g_autofree struct iovec *dev_buffers = g_new(struct iovec, n); | ||
214 | + VhostVDPAState *s = opaque; | ||
215 | size_t in_len, dev_written; | ||
216 | virtio_net_ctrl_ack status = VIRTIO_NET_ERR; | ||
217 | - int r; | ||
218 | + /* out and in buffers sent to the device */ | ||
219 | + struct iovec dev_buffers[2] = { | ||
220 | + { .iov_base = s->cvq_cmd_out_buffer }, | ||
221 | + { .iov_base = s->cvq_cmd_in_buffer }, | ||
222 | + }; | ||
223 | + /* in buffer used for device model */ | ||
224 | + const struct iovec in = { | ||
225 | + .iov_base = &status, | ||
226 | + .iov_len = sizeof(status), | ||
227 | + }; | ||
228 | + int r = -EINVAL; | ||
229 | + bool ok; | ||
230 | + | ||
231 | + ok = vhost_vdpa_net_cvq_map_elem(s, elem, dev_buffers); | ||
232 | + if (unlikely(!ok)) { | ||
233 | + goto out; | ||
234 | + } | ||
235 | |||
236 | - memcpy(dev_buffers, elem->out_sg, elem->out_num); | ||
237 | - memcpy(dev_buffers + elem->out_num, elem->in_sg, elem->in_num); | ||
238 | + ok = vhost_vdpa_net_cvq_validate_cmd(&dev_buffers[0], 1); | ||
239 | + if (unlikely(!ok)) { | ||
240 | + goto out; | ||
241 | + } | ||
242 | |||
243 | - r = vhost_svq_add(svq, &dev_buffers[0], elem->out_num, &dev_buffers[1], | ||
244 | - elem->in_num, elem); | ||
245 | + r = vhost_svq_add(svq, &dev_buffers[0], 1, &dev_buffers[1], 1, elem); | ||
246 | if (unlikely(r != 0)) { | ||
247 | if (unlikely(r == -ENOSPC)) { | ||
248 | qemu_log_mask(LOG_GUEST_ERROR, "%s: No space on device queue\n", | ||
249 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_net_handle_ctrl_avail(VhostShadowVirtqueue *svq, | ||
250 | dev_written = vhost_svq_poll(svq); | ||
251 | if (unlikely(dev_written < sizeof(status))) { | ||
252 | error_report("Insufficient written data (%zu)", dev_written); | ||
253 | + goto out; | ||
254 | + } | ||
255 | + | ||
256 | + memcpy(&status, dev_buffers[1].iov_base, sizeof(status)); | ||
257 | + if (status != VIRTIO_NET_OK) { | ||
258 | + goto out; | ||
259 | + } | ||
260 | + | ||
261 | + status = VIRTIO_NET_ERR; | ||
262 | + virtio_net_handle_ctrl_iov(svq->vdev, &in, 1, dev_buffers, 1); | ||
263 | + if (status != VIRTIO_NET_OK) { | ||
264 | + error_report("Bad CVQ processing in model"); | ||
265 | } | ||
266 | |||
267 | out: | ||
268 | @@ -XXX,XX +XXX,XX @@ out: | ||
269 | } | ||
270 | vhost_svq_push_elem(svq, elem, MIN(in_len, sizeof(status))); | ||
271 | g_free(elem); | ||
272 | + if (dev_buffers[0].iov_base) { | ||
273 | + vhost_vdpa_cvq_unmap_buf(&s->vhost_vdpa, dev_buffers[0].iov_base); | ||
274 | + } | ||
275 | + if (dev_buffers[1].iov_base) { | ||
276 | + vhost_vdpa_cvq_unmap_buf(&s->vhost_vdpa, dev_buffers[1].iov_base); | ||
277 | + } | ||
278 | return r; | ||
279 | } | ||
280 | |||
281 | @@ -XXX,XX +XXX,XX @@ static NetClientState *net_vhost_vdpa_init(NetClientState *peer, | ||
282 | s->vhost_vdpa.device_fd = vdpa_device_fd; | ||
283 | s->vhost_vdpa.index = queue_pair_index; | ||
284 | if (!is_datapath) { | ||
285 | + s->cvq_cmd_out_buffer = qemu_memalign(qemu_real_host_page_size(), | ||
286 | + vhost_vdpa_net_cvq_cmd_page_len()); | ||
287 | + memset(s->cvq_cmd_out_buffer, 0, vhost_vdpa_net_cvq_cmd_page_len()); | ||
288 | + s->cvq_cmd_in_buffer = qemu_memalign(qemu_real_host_page_size(), | ||
289 | + vhost_vdpa_net_cvq_cmd_page_len()); | ||
290 | + memset(s->cvq_cmd_in_buffer, 0, vhost_vdpa_net_cvq_cmd_page_len()); | ||
291 | + | ||
292 | s->vhost_vdpa.shadow_vq_ops = &vhost_vdpa_net_svq_ops; | ||
293 | s->vhost_vdpa.shadow_vq_ops_opaque = s; | ||
294 | } | ||
295 | -- | 107 | -- |
296 | 2.7.4 | 108 | 2.7.4 |
297 | 109 | ||
298 | 110 | diff view generated by jsdifflib |
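
The CVQ patch above relies on validating a private copy of guest data rather than the shared buffer itself. A minimal sketch of that snapshot-then-validate pattern, with a hypothetical command header rather than the real virtio-net structures:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Hypothetical wire format of a control command header. */
struct ctrl_hdr {
    uint8_t class;
    uint8_t cmd;
};

/*
 * 'guest_buf' aliases guest-writable memory. Take one private snapshot
 * and validate only the snapshot, so the bytes that were checked are
 * exactly the bytes later forwarded to the device: no window remains
 * between time-of-check and time-of-use.
 */
static int handle_ctrl(const uint8_t *guest_buf, size_t len,
                       struct ctrl_hdr *out)
{
    if (len < sizeof(*out)) {
        return -1; /* short buffer, refuse the command */
    }
    memcpy(out, guest_buf, sizeof(*out)); /* single read of guest memory */
    if (out->class != 0) { /* hypothetical: accept only class 0 */
        return -1;
    }
    return 0; /* caller forwards *out, never guest_buf */
}
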
1 | From: Eugenio Pérez <eperezma@redhat.com> | 1 | From: Eugenio Pérez <eperezma@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | Finally offering the possibility to enable SVQ from the command line. | 3 | This tree is able to look for a translated address from an IOVA address. |
4 | |||
5 | At first glance it is similar to util/iova-tree. However, SVQ working on | ||
6 | devices with limited IOVA space need more capabilities, like allocating | ||
7 | IOVA chunks or performing reverse translations (qemu addresses to iova). | ||
8 | |||
9 | The allocation capability, as "assign a free IOVA address to this chunk | ||
10 | of memory in qemu's address space" allows shadow virtqueue to create a | ||
11 | new address space that is not restricted by guest's addressable one, so | ||
12 | we can allocate shadow vqs vrings outside of it. | ||
13 | |||
14 | It duplicates the tree so it can search efficiently in both directions, | ||
15 | and it will signal overlap if iova or the translated address is present | ||
16 | in any tree. | ||
4 | 17 | ||
5 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> | 18 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> |
6 | Acked-by: Markus Armbruster <armbru@redhat.com> | 19 | Acked-by: Michael S. Tsirkin <mst@redhat.com> |
7 | Reviewed-by: Michael S. Tsirkin <mst@redhat.com> | ||
8 | Signed-off-by: Jason Wang <jasowang@redhat.com> | 20 | Signed-off-by: Jason Wang <jasowang@redhat.com> |
9 | --- | 21 | --- |
10 | net/vhost-vdpa.c | 72 +++++++++++++++++++++++++++++++++++++++++++++++++++++--- | 22 | hw/virtio/meson.build | 2 +- |
11 | qapi/net.json | 9 ++++++- | 23 | hw/virtio/vhost-iova-tree.c | 110 ++++++++++++++++++++++++++++++++++++++++++++ |
12 | 2 files changed, 77 insertions(+), 4 deletions(-) | 24 | hw/virtio/vhost-iova-tree.h | 27 +++++++++++ |
25 | 3 files changed, 138 insertions(+), 1 deletion(-) | ||
26 | create mode 100644 hw/virtio/vhost-iova-tree.c | ||
27 | create mode 100644 hw/virtio/vhost-iova-tree.h | ||
13 | 28 | ||
14 | diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c | 29 | diff --git a/hw/virtio/meson.build b/hw/virtio/meson.build |
15 | index XXXXXXX..XXXXXXX 100644 | 30 | index XXXXXXX..XXXXXXX 100644 |
16 | --- a/net/vhost-vdpa.c | 31 | --- a/hw/virtio/meson.build |
17 | +++ b/net/vhost-vdpa.c | 32 | +++ b/hw/virtio/meson.build |
18 | @@ -XXX,XX +XXX,XX @@ const int vdpa_feature_bits[] = { | 33 | @@ -XXX,XX +XXX,XX @@ softmmu_ss.add(when: 'CONFIG_ALL', if_true: files('vhost-stub.c')) |
19 | VHOST_INVALID_FEATURE_BIT | 34 | |
20 | }; | 35 | virtio_ss = ss.source_set() |
21 | 36 | virtio_ss.add(files('virtio.c')) | |
22 | +/** Supported device specific feature bits with SVQ */ | 37 | -virtio_ss.add(when: 'CONFIG_VHOST', if_true: files('vhost.c', 'vhost-backend.c', 'vhost-shadow-virtqueue.c')) |
23 | +static const uint64_t vdpa_svq_device_features = | 38 | +virtio_ss.add(when: 'CONFIG_VHOST', if_true: files('vhost.c', 'vhost-backend.c', 'vhost-shadow-virtqueue.c', 'vhost-iova-tree.c')) |
24 | + BIT_ULL(VIRTIO_NET_F_CSUM) | | 39 | virtio_ss.add(when: 'CONFIG_VHOST_USER', if_true: files('vhost-user.c')) |
25 | + BIT_ULL(VIRTIO_NET_F_GUEST_CSUM) | | 40 | virtio_ss.add(when: 'CONFIG_VHOST_VDPA', if_true: files('vhost-vdpa.c')) |
26 | + BIT_ULL(VIRTIO_NET_F_MTU) | | 41 | virtio_ss.add(when: 'CONFIG_VIRTIO_BALLOON', if_true: files('virtio-balloon.c')) |
27 | + BIT_ULL(VIRTIO_NET_F_MAC) | | 42 | diff --git a/hw/virtio/vhost-iova-tree.c b/hw/virtio/vhost-iova-tree.c |
28 | + BIT_ULL(VIRTIO_NET_F_GUEST_TSO4) | | 43 | new file mode 100644 |
29 | + BIT_ULL(VIRTIO_NET_F_GUEST_TSO6) | | 44 | index XXXXXXX..XXXXXXX |
30 | + BIT_ULL(VIRTIO_NET_F_GUEST_ECN) | | 45 | --- /dev/null |
31 | + BIT_ULL(VIRTIO_NET_F_GUEST_UFO) | | 46 | +++ b/hw/virtio/vhost-iova-tree.c |
32 | + BIT_ULL(VIRTIO_NET_F_HOST_TSO4) | | 47 | @@ -XXX,XX +XXX,XX @@ |
33 | + BIT_ULL(VIRTIO_NET_F_HOST_TSO6) | | 48 | +/* |
34 | + BIT_ULL(VIRTIO_NET_F_HOST_ECN) | | 49 | + * vhost software live migration iova tree |
35 | + BIT_ULL(VIRTIO_NET_F_HOST_UFO) | | 50 | + * |
36 | + BIT_ULL(VIRTIO_NET_F_MRG_RXBUF) | | 51 | + * SPDX-FileCopyrightText: Red Hat, Inc. 2021 |
37 | + BIT_ULL(VIRTIO_NET_F_STATUS) | | 52 | + * SPDX-FileContributor: Author: Eugenio Pérez <eperezma@redhat.com> |
38 | + BIT_ULL(VIRTIO_NET_F_CTRL_VQ) | | 53 | + * |
39 | + BIT_ULL(VIRTIO_F_ANY_LAYOUT) | | 54 | + * SPDX-License-Identifier: GPL-2.0-or-later |
40 | + BIT_ULL(VIRTIO_NET_F_CTRL_MAC_ADDR) | | 55 | + */ |
41 | + BIT_ULL(VIRTIO_NET_F_RSC_EXT) | | ||
42 | + BIT_ULL(VIRTIO_NET_F_STANDBY); | ||
43 | + | 56 | + |
44 | VHostNetState *vhost_vdpa_get_vhost_net(NetClientState *nc) | 57 | +#include "qemu/osdep.h" |
45 | { | 58 | +#include "qemu/iova-tree.h" |
46 | VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc); | 59 | +#include "vhost-iova-tree.h" |
47 | @@ -XXX,XX +XXX,XX @@ err_init: | 60 | + |
48 | static void vhost_vdpa_cleanup(NetClientState *nc) | 61 | +#define iova_min_addr qemu_real_host_page_size |
49 | { | 62 | + |
50 | VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc); | 63 | +/** |
51 | + struct vhost_dev *dev = &s->vhost_net->dev; | 64 | + * VhostIOVATree, able to: |
52 | 65 | + * - Translate iova address | |
53 | qemu_vfree(s->cvq_cmd_out_buffer); | 66 | + * - Reverse translate iova address (from translated to iova) |
54 | qemu_vfree(s->cvq_cmd_in_buffer); | 67 | + * - Allocate IOVA regions for translated range (linear operation) |
55 | + if (dev->vq_index + dev->nvqs == dev->vq_index_end) { | 68 | + */ |
56 | + g_clear_pointer(&s->vhost_vdpa.iova_tree, vhost_iova_tree_delete); | 69 | +struct VhostIOVATree { |
57 | + } | 70 | + /* First addressable iova address in the device */ |
58 | if (s->vhost_net) { | 71 | + uint64_t iova_first; |
59 | vhost_net_cleanup(s->vhost_net); | 72 | + |
60 | g_free(s->vhost_net); | 73 | + /* Last addressable iova address in the device */ |
61 | @@ -XXX,XX +XXX,XX @@ static NetClientState *net_vhost_vdpa_init(NetClientState *peer, | 74 | + uint64_t iova_last; |
62 | int vdpa_device_fd, | 75 | + |
63 | int queue_pair_index, | 76 | + /* IOVA address to qemu memory maps. */ |
64 | int nvqs, | 77 | + IOVATree *iova_taddr_map; |
65 | - bool is_datapath) | 78 | +}; |
66 | + bool is_datapath, | 79 | + |
67 | + bool svq, | 80 | +/** |
68 | + VhostIOVATree *iova_tree) | 81 | + * Create a new IOVA tree |
69 | { | 82 | + * |
70 | NetClientState *nc = NULL; | 83 | + * Returns the new IOVA tree |
71 | VhostVDPAState *s; | 84 | + */ |
72 | @@ -XXX,XX +XXX,XX @@ static NetClientState *net_vhost_vdpa_init(NetClientState *peer, | 85 | +VhostIOVATree *vhost_iova_tree_new(hwaddr iova_first, hwaddr iova_last) |
73 | |||
74 | s->vhost_vdpa.device_fd = vdpa_device_fd; | ||
75 | s->vhost_vdpa.index = queue_pair_index; | ||
76 | + s->vhost_vdpa.shadow_vqs_enabled = svq; | ||
77 | + s->vhost_vdpa.iova_tree = iova_tree; | ||
78 | if (!is_datapath) { | ||
79 | s->cvq_cmd_out_buffer = qemu_memalign(qemu_real_host_page_size(), | ||
80 | vhost_vdpa_net_cvq_cmd_page_len()); | ||
81 | @@ -XXX,XX +XXX,XX @@ static NetClientState *net_vhost_vdpa_init(NetClientState *peer, | ||
82 | |||
83 | s->vhost_vdpa.shadow_vq_ops = &vhost_vdpa_net_svq_ops; | ||
84 | s->vhost_vdpa.shadow_vq_ops_opaque = s; | ||
85 | + error_setg(&s->vhost_vdpa.migration_blocker, | ||
86 | + "Migration disabled: vhost-vdpa uses CVQ."); | ||
87 | } | ||
88 | ret = vhost_vdpa_add(nc, (void *)&s->vhost_vdpa, queue_pair_index, nvqs); | ||
89 | if (ret) { | ||
90 | @@ -XXX,XX +XXX,XX @@ static NetClientState *net_vhost_vdpa_init(NetClientState *peer, | ||
91 | return nc; | ||
92 | } | ||
93 | |||
94 | +static int vhost_vdpa_get_iova_range(int fd, | ||
95 | + struct vhost_vdpa_iova_range *iova_range) | ||
96 | +{ | 86 | +{ |
97 | + int ret = ioctl(fd, VHOST_VDPA_GET_IOVA_RANGE, iova_range); | 87 | + VhostIOVATree *tree = g_new(VhostIOVATree, 1); |
98 | + | 88 | + |
99 | + return ret < 0 ? -errno : 0; | 89 | + /* Some devices do not like 0 addresses */ |
90 | + tree->iova_first = MAX(iova_first, iova_min_addr); | ||
91 | + tree->iova_last = iova_last; | ||
92 | + | ||
93 | + tree->iova_taddr_map = iova_tree_new(); | ||
94 | + return tree; | ||
100 | +} | 95 | +} |
101 | + | 96 | + |
102 | static int vhost_vdpa_get_features(int fd, uint64_t *features, Error **errp) | 97 | +/** |
103 | { | 98 | + * Delete an iova tree |
104 | int ret = ioctl(fd, VHOST_GET_FEATURES, features); | 99 | + */ |
105 | @@ -XXX,XX +XXX,XX @@ int net_init_vhost_vdpa(const Netdev *netdev, const char *name, | 100 | +void vhost_iova_tree_delete(VhostIOVATree *iova_tree) |
106 | uint64_t features; | 101 | +{ |
107 | int vdpa_device_fd; | 102 | + iova_tree_destroy(iova_tree->iova_taddr_map); |
108 | g_autofree NetClientState **ncs = NULL; | 103 | + g_free(iova_tree); |
109 | + g_autoptr(VhostIOVATree) iova_tree = NULL; | 104 | +} |
110 | NetClientState *nc; | ||
111 | int queue_pairs, r, i, has_cvq = 0; | ||
112 | |||
113 | @@ -XXX,XX +XXX,XX @@ int net_init_vhost_vdpa(const Netdev *netdev, const char *name, | ||
114 | return queue_pairs; | ||
115 | } | ||
116 | |||
117 | + if (opts->x_svq) { | ||
118 | + struct vhost_vdpa_iova_range iova_range; | ||
119 | + | 105 | + |
120 | + uint64_t invalid_dev_features = | 106 | +/** |
121 | + features & ~vdpa_svq_device_features & | 107 | + * Find the IOVA address stored from a memory address |
122 | + /* Transport are all accepted at this point */ | 108 | + * |
123 | + ~MAKE_64BIT_MASK(VIRTIO_TRANSPORT_F_START, | 109 | + * @tree: The iova tree |
124 | + VIRTIO_TRANSPORT_F_END - VIRTIO_TRANSPORT_F_START); | 110 | + * @map: The map with the memory address |
111 | + * | ||
112 | + * Return the stored mapping, or NULL if not found. | ||
113 | + */ | ||
114 | +const DMAMap *vhost_iova_tree_find_iova(const VhostIOVATree *tree, | ||
115 | + const DMAMap *map) | ||
116 | +{ | ||
117 | + return iova_tree_find_iova(tree->iova_taddr_map, map); | ||
118 | +} | ||
125 | + | 119 | + |
126 | + if (invalid_dev_features) { | 120 | +/** |
127 | + error_setg(errp, "vdpa svq does not work with features 0x%" PRIx64, | 121 | + * Allocate a new mapping |
128 | + invalid_dev_features); | 122 | + * |
129 | + goto err_svq; | 123 | + * @tree: The iova tree |
130 | + } | 124 | + * @map: The iova map |
125 | + * | ||
126 | + * Returns: | ||
127 | + * - IOVA_OK if the map fits in the container | ||
128 | + * - IOVA_ERR_INVALID if the map does not make sense (like size overflow) | ||
129 | + * - IOVA_ERR_NOMEM if tree cannot allocate more space. | ||
130 | + * | ||
131 | + * It returns assignated iova in map->iova if return value is VHOST_DMA_MAP_OK. | ||
132 | + */ | ||
133 | +int vhost_iova_tree_map_alloc(VhostIOVATree *tree, DMAMap *map) | ||
134 | +{ | ||
135 | + /* Some vhost devices do not like addr 0. Skip first page */ | ||
136 | + hwaddr iova_first = tree->iova_first ?: qemu_real_host_page_size; | ||
131 | + | 137 | + |
132 | + vhost_vdpa_get_iova_range(vdpa_device_fd, &iova_range); | 138 | + if (map->translated_addr + map->size < map->translated_addr || |
133 | + iova_tree = vhost_iova_tree_new(iova_range.first, iova_range.last); | 139 | + map->perm == IOMMU_NONE) { |
140 | + return IOVA_ERR_INVALID; | ||
134 | + } | 141 | + } |
135 | + | 142 | + |
136 | ncs = g_malloc0(sizeof(*ncs) * queue_pairs); | 143 | + /* Allocate a node in IOVA address */ |
137 | 144 | + return iova_tree_alloc_map(tree->iova_taddr_map, map, iova_first, | |
138 | for (i = 0; i < queue_pairs; i++) { | 145 | + tree->iova_last); |
139 | ncs[i] = net_vhost_vdpa_init(peer, TYPE_VHOST_VDPA, name, | 146 | +} |
140 | - vdpa_device_fd, i, 2, true); | ||
141 | + vdpa_device_fd, i, 2, true, opts->x_svq, | ||
142 | + iova_tree); | ||
143 | if (!ncs[i]) | ||
144 | goto err; | ||
145 | } | ||
146 | |||
147 | if (has_cvq) { | ||
148 | nc = net_vhost_vdpa_init(peer, TYPE_VHOST_VDPA, name, | ||
149 | - vdpa_device_fd, i, 1, false); | ||
150 | + vdpa_device_fd, i, 1, false, | ||
151 | + opts->x_svq, iova_tree); | ||
152 | if (!nc) | ||
153 | goto err; | ||
154 | } | ||
155 | |||
156 | + /* iova_tree ownership belongs to last NetClientState */ | ||
157 | + g_steal_pointer(&iova_tree); | ||
158 | return 0; | ||
159 | |||
160 | err: | ||
161 | @@ -XXX,XX +XXX,XX @@ err: | ||
162 | qemu_del_net_client(ncs[i]); | ||
163 | } | ||
164 | } | ||
165 | + | 147 | + |
166 | +err_svq: | 148 | +/** |
167 | qemu_close(vdpa_device_fd); | 149 | + * Remove existing mappings from iova tree |
168 | 150 | + * | |
169 | return -1; | 151 | + * @iova_tree: The vhost iova tree |
170 | diff --git a/qapi/net.json b/qapi/net.json | 152 | + * @map: The map to remove |
171 | index XXXXXXX..XXXXXXX 100644 | 153 | + */ |
172 | --- a/qapi/net.json | 154 | +void vhost_iova_tree_remove(VhostIOVATree *iova_tree, const DMAMap *map) |
173 | +++ b/qapi/net.json | 155 | +{ |
156 | + iova_tree_remove(iova_tree->iova_taddr_map, map); | ||
157 | +} | ||
158 | diff --git a/hw/virtio/vhost-iova-tree.h b/hw/virtio/vhost-iova-tree.h | ||
159 | new file mode 100644 | ||
160 | index XXXXXXX..XXXXXXX | ||
161 | --- /dev/null | ||
162 | +++ b/hw/virtio/vhost-iova-tree.h | ||
174 | @@ -XXX,XX +XXX,XX @@ | 163 | @@ -XXX,XX +XXX,XX @@ |
175 | # @queues: number of queues to be created for multiqueue vhost-vdpa | 164 | +/* |
176 | # (default: 1) | 165 | + * vhost software live migration iova tree |
177 | # | 166 | + * |
178 | +# @x-svq: Start device with (experimental) shadow virtqueue. (Since 7.1) | 167 | + * SPDX-FileCopyrightText: Red Hat, Inc. 2021 |
179 | +# (default: false) | 168 | + * SPDX-FileContributor: Author: Eugenio Pérez <eperezma@redhat.com> |
180 | +# | 169 | + * |
181 | +# Features: | 170 | + * SPDX-License-Identifier: GPL-2.0-or-later |
182 | +# @unstable: Member @x-svq is experimental. | 171 | + */ |
183 | +# | 172 | + |
184 | # Since: 5.1 | 173 | +#ifndef HW_VIRTIO_VHOST_IOVA_TREE_H |
185 | ## | 174 | +#define HW_VIRTIO_VHOST_IOVA_TREE_H |
186 | { 'struct': 'NetdevVhostVDPAOptions', | 175 | + |
187 | 'data': { | 176 | +#include "qemu/iova-tree.h" |
188 | '*vhostdev': 'str', | 177 | +#include "exec/memory.h" |
189 | - '*queues': 'int' } } | 178 | + |
190 | + '*queues': 'int', | 179 | +typedef struct VhostIOVATree VhostIOVATree; |
191 | + '*x-svq': {'type': 'bool', 'features' : [ 'unstable'] } } } | 180 | + |
192 | 181 | +VhostIOVATree *vhost_iova_tree_new(uint64_t iova_first, uint64_t iova_last); | |
193 | ## | 182 | +void vhost_iova_tree_delete(VhostIOVATree *iova_tree); |
194 | # @NetdevVmnetHostOptions: | 183 | +G_DEFINE_AUTOPTR_CLEANUP_FUNC(VhostIOVATree, vhost_iova_tree_delete); |
184 | + | ||
185 | +const DMAMap *vhost_iova_tree_find_iova(const VhostIOVATree *iova_tree, | ||
186 | + const DMAMap *map); | ||
187 | +int vhost_iova_tree_map_alloc(VhostIOVATree *iova_tree, DMAMap *map); | ||
188 | +void vhost_iova_tree_remove(VhostIOVATree *iova_tree, const DMAMap *map); | ||
189 | + | ||
190 | +#endif | ||
195 | -- | 191 | -- |
196 | 2.7.4 | 192 | 2.7.4 |
197 | 193 | ||
198 | 194 | diff view generated by jsdifflib |
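
The x-svq patch above rejects any device feature outside the supported set, masking off the transport feature range first. A toy, compilable model of that mask arithmetic; the bit positions and the supported set are made-up examples, not the real virtio values:

#include <stdint.h>
#include <stdio.h>

#define BIT_ULL(n) (1ULL << (n))
#define MAKE_64BIT_MASK(shift, length) \
    ((~0ULL >> (64 - (length))) << (shift))

int main(void)
{
    const uint64_t supported = BIT_ULL(5) | BIT_ULL(16) | BIT_ULL(17);
    const unsigned transport_start = 28, transport_end = 38;
    /* Device offers bit 21, which is neither supported nor transport. */
    uint64_t features = BIT_ULL(5) | BIT_ULL(30) | BIT_ULL(21);

    uint64_t invalid = features & ~supported &
        ~MAKE_64BIT_MASK(transport_start, transport_end - transport_start);

    if (invalid) {
        printf("unsupported feature bits: 0x%llx\n",
               (unsigned long long)invalid);
        return 1;
    }
    return 0;
}

Bit 30 survives the ~supported mask but is cleared as a transport bit, so only bit 21 is reported; this mirrors how the patch lets transport features pass through unvalidated.
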
1 | From: Eugenio Pérez <eperezma@redhat.com> | 1 | From: Eugenio Pérez <eperezma@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | This allows external handlers to be aware of new buffers that the guest | 3 | Use translations added in VhostIOVATree in SVQ. |
4 | places in the virtqueue. | 4 | |
5 | 5 | Only introduce usage here, not allocation and deallocation. As with | |
6 | When this callback is defined, the ownership of the guest's virtqueue | 6 | previous patches, we use the dead code paths of shadow_vqs_enabled to |
7 | element is transferred to the callback. This means that if the user | 7 | avoid committing too many changes at once. These paths cannot be taken |
8 | wants to forward the descriptor, the callback must inject it manually. The | 8 | yet. |
9 | callback is also free to process the command by itself and use the | ||
10 | element with svq_push. | ||
11 | 9 | ||
12 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> | 10 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> |
13 | Reviewed-by: Michael S. Tsirkin <mst@redhat.com> | 11 | Acked-by: Michael S. Tsirkin <mst@redhat.com> |
14 | Signed-off-by: Jason Wang <jasowang@redhat.com> | 12 | Signed-off-by: Jason Wang <jasowang@redhat.com> |
15 | --- | 13 | --- |
16 | hw/virtio/vhost-shadow-virtqueue.c | 14 ++++++++++++-- | 14 | hw/virtio/vhost-shadow-virtqueue.c | 86 +++++++++++++++++++++++--- |
17 | hw/virtio/vhost-shadow-virtqueue.h | 31 ++++++++++++++++++++++++++++++- | 15 | hw/virtio/vhost-shadow-virtqueue.h | 6 +- |
18 | hw/virtio/vhost-vdpa.c | 3 ++- | 16 | hw/virtio/vhost-vdpa.c | 122 +++++++++++++++++++++++++++++++------ |
19 | 3 files changed, 44 insertions(+), 4 deletions(-) | 17 | include/hw/virtio/vhost-vdpa.h | 3 + |
18 | 4 files changed, 187 insertions(+), 30 deletions(-) | ||
20 | 19 | ||
21 | diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c | 20 | diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c |
22 | index XXXXXXX..XXXXXXX 100644 | 21 | index XXXXXXX..XXXXXXX 100644 |
23 | --- a/hw/virtio/vhost-shadow-virtqueue.c | 22 | --- a/hw/virtio/vhost-shadow-virtqueue.c |
24 | +++ b/hw/virtio/vhost-shadow-virtqueue.c | 23 | +++ b/hw/virtio/vhost-shadow-virtqueue.c |
25 | @@ -XXX,XX +XXX,XX @@ static void vhost_handle_guest_kick(VhostShadowVirtqueue *svq) | 24 | @@ -XXX,XX +XXX,XX @@ static uint16_t vhost_svq_available_slots(const VhostShadowVirtqueue *svq) |
26 | break; | 25 | return svq->vring.num - (svq->shadow_avail_idx - svq->shadow_used_idx); |
27 | } | 26 | } |
28 | 27 | ||
29 | - r = vhost_svq_add_element(svq, elem); | 28 | -static void vhost_vring_write_descs(VhostShadowVirtqueue *svq, |
30 | + if (svq->ops) { | 29 | +/** |
31 | + r = svq->ops->avail_handler(svq, elem, svq->ops_opaque); | 30 | + * Translate addresses between the qemu's virtual address and the SVQ IOVA |
32 | + } else { | 31 | + * |
33 | + r = vhost_svq_add_element(svq, elem); | 32 | + * @svq: Shadow VirtQueue |
34 | + } | 33 | + * @vaddr: Translated IOVA addresses |
35 | if (unlikely(r != 0)) { | 34 | + * @iovec: Source qemu's VA addresses |
36 | if (r == -ENOSPC) { | 35 | + * @num: Length of iovec and minimum length of vaddr |
37 | /* | 36 | + */ |
37 | +static bool vhost_svq_translate_addr(const VhostShadowVirtqueue *svq, | ||
38 | + hwaddr *addrs, const struct iovec *iovec, | ||
39 | + size_t num) | ||
40 | +{ | ||
41 | + if (num == 0) { | ||
42 | + return true; | ||
43 | + } | ||
44 | + | ||
45 | + for (size_t i = 0; i < num; ++i) { | ||
46 | + DMAMap needle = { | ||
47 | + .translated_addr = (hwaddr)(uintptr_t)iovec[i].iov_base, | ||
48 | + .size = iovec[i].iov_len, | ||
49 | + }; | ||
50 | + Int128 needle_last, map_last; | ||
51 | + size_t off; | ||
52 | + | ||
53 | + const DMAMap *map = vhost_iova_tree_find_iova(svq->iova_tree, &needle); | ||
54 | + /* | ||
55 | + * Map cannot be NULL since iova map contains all guest space and | ||
56 | + * qemu already has a physical address mapped | ||
57 | + */ | ||
58 | + if (unlikely(!map)) { | ||
59 | + qemu_log_mask(LOG_GUEST_ERROR, | ||
60 | + "Invalid address 0x%"HWADDR_PRIx" given by guest", | ||
61 | + needle.translated_addr); | ||
62 | + return false; | ||
63 | + } | ||
64 | + | ||
65 | + off = needle.translated_addr - map->translated_addr; | ||
66 | + addrs[i] = map->iova + off; | ||
67 | + | ||
68 | + needle_last = int128_add(int128_make64(needle.translated_addr), | ||
69 | + int128_make64(iovec[i].iov_len)); | ||
70 | + map_last = int128_make64(map->translated_addr + map->size); | ||
71 | + if (unlikely(int128_gt(needle_last, map_last))) { | ||
72 | + qemu_log_mask(LOG_GUEST_ERROR, | ||
73 | + "Guest buffer expands over iova range"); | ||
74 | + return false; | ||
75 | + } | ||
76 | + } | ||
77 | + | ||
78 | + return true; | ||
79 | +} | ||
80 | + | ||
81 | +static void vhost_vring_write_descs(VhostShadowVirtqueue *svq, hwaddr *sg, | ||
82 | const struct iovec *iovec, size_t num, | ||
83 | bool more_descs, bool write) | ||
84 | { | ||
85 | @@ -XXX,XX +XXX,XX @@ static void vhost_vring_write_descs(VhostShadowVirtqueue *svq, | ||
86 | } else { | ||
87 | descs[i].flags = flags; | ||
88 | } | ||
89 | - descs[i].addr = cpu_to_le64((hwaddr)(intptr_t)iovec[n].iov_base); | ||
90 | + descs[i].addr = cpu_to_le64(sg[n]); | ||
91 | descs[i].len = cpu_to_le32(iovec[n].iov_len); | ||
92 | |||
93 | last = i; | ||
94 | @@ -XXX,XX +XXX,XX @@ static bool vhost_svq_add_split(VhostShadowVirtqueue *svq, | ||
95 | { | ||
96 | unsigned avail_idx; | ||
97 | vring_avail_t *avail = svq->vring.avail; | ||
98 | + bool ok; | ||
99 | + g_autofree hwaddr *sgs = g_new(hwaddr, MAX(elem->out_num, elem->in_num)); | ||
100 | |||
101 | *head = svq->free_head; | ||
102 | |||
103 | @@ -XXX,XX +XXX,XX @@ static bool vhost_svq_add_split(VhostShadowVirtqueue *svq, | ||
104 | return false; | ||
105 | } | ||
106 | |||
107 | - vhost_vring_write_descs(svq, elem->out_sg, elem->out_num, elem->in_num > 0, | ||
108 | - false); | ||
109 | - vhost_vring_write_descs(svq, elem->in_sg, elem->in_num, false, true); | ||
110 | + ok = vhost_svq_translate_addr(svq, sgs, elem->out_sg, elem->out_num); | ||
111 | + if (unlikely(!ok)) { | ||
112 | + return false; | ||
113 | + } | ||
114 | + vhost_vring_write_descs(svq, sgs, elem->out_sg, elem->out_num, | ||
115 | + elem->in_num > 0, false); | ||
116 | + | ||
117 | + | ||
118 | + ok = vhost_svq_translate_addr(svq, sgs, elem->in_sg, elem->in_num); | ||
119 | + if (unlikely(!ok)) { | ||
120 | + return false; | ||
121 | + } | ||
122 | + | ||
123 | + vhost_vring_write_descs(svq, sgs, elem->in_sg, elem->in_num, false, true); | ||
124 | |||
125 | /* | ||
126 | * Put the entry in the available array (but don't update avail->idx until | ||
127 | @@ -XXX,XX +XXX,XX @@ void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd) | ||
128 | void vhost_svq_get_vring_addr(const VhostShadowVirtqueue *svq, | ||
129 | struct vhost_vring_addr *addr) | ||
130 | { | ||
131 | - addr->desc_user_addr = (uint64_t)(intptr_t)svq->vring.desc; | ||
132 | - addr->avail_user_addr = (uint64_t)(intptr_t)svq->vring.avail; | ||
133 | - addr->used_user_addr = (uint64_t)(intptr_t)svq->vring.used; | ||
134 | + addr->desc_user_addr = (uint64_t)(uintptr_t)svq->vring.desc; | ||
135 | + addr->avail_user_addr = (uint64_t)(uintptr_t)svq->vring.avail; | ||
136 | + addr->used_user_addr = (uint64_t)(uintptr_t)svq->vring.used; | ||
137 | } | ||
138 | |||
139 | size_t vhost_svq_driver_area_size(const VhostShadowVirtqueue *svq) | ||
38 | @@ -XXX,XX +XXX,XX @@ void vhost_svq_stop(VhostShadowVirtqueue *svq) | 140 | @@ -XXX,XX +XXX,XX @@ void vhost_svq_stop(VhostShadowVirtqueue *svq) |
141 | * Creates vhost shadow virtqueue, and instructs the vhost device to use the | ||
39 | * shadow methods and file descriptors. | 142 | * shadow methods and file descriptors. |
40 | * | 143 | * |
41 | * @iova_tree: Tree to perform descriptors translations | 144 | + * @iova_tree: Tree to perform descriptors translations |
42 | + * @ops: SVQ owner callbacks | 145 | + * |
43 | + * @ops_opaque: ops opaque pointer | ||
44 | * | ||
45 | * Returns the new virtqueue or NULL. | 146 | * Returns the new virtqueue or NULL. |
46 | * | 147 | * |
47 | * In case of error, reason is reported through error_report. | 148 | * In case of error, reason is reported through error_report. |
48 | */ | 149 | */ |
49 | -VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree) | 150 | -VhostShadowVirtqueue *vhost_svq_new(void) |
50 | +VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree, | 151 | +VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree) |
51 | + const VhostShadowVirtqueueOps *ops, | ||
52 | + void *ops_opaque) | ||
53 | { | 152 | { |
54 | g_autofree VhostShadowVirtqueue *svq = g_new0(VhostShadowVirtqueue, 1); | 153 | g_autofree VhostShadowVirtqueue *svq = g_new0(VhostShadowVirtqueue, 1); |
55 | int r; | 154 | int r; |
56 | @@ -XXX,XX +XXX,XX @@ VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree) | 155 | @@ -XXX,XX +XXX,XX @@ VhostShadowVirtqueue *vhost_svq_new(void) |
156 | |||
57 | event_notifier_init_fd(&svq->svq_kick, VHOST_FILE_UNBIND); | 157 | event_notifier_init_fd(&svq->svq_kick, VHOST_FILE_UNBIND); |
58 | event_notifier_set_handler(&svq->hdev_call, vhost_svq_handle_call); | 158 | event_notifier_set_handler(&svq->hdev_call, vhost_svq_handle_call); |
59 | svq->iova_tree = iova_tree; | 159 | + svq->iova_tree = iova_tree; |
60 | + svq->ops = ops; | ||
61 | + svq->ops_opaque = ops_opaque; | ||
62 | return g_steal_pointer(&svq); | 160 | return g_steal_pointer(&svq); |
63 | 161 | ||
64 | err_init_hdev_call: | 162 | err_init_hdev_call: |
65 | diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h | 163 | diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h |
66 | index XXXXXXX..XXXXXXX 100644 | 164 | index XXXXXXX..XXXXXXX 100644 |
67 | --- a/hw/virtio/vhost-shadow-virtqueue.h | 165 | --- a/hw/virtio/vhost-shadow-virtqueue.h |
68 | +++ b/hw/virtio/vhost-shadow-virtqueue.h | 166 | +++ b/hw/virtio/vhost-shadow-virtqueue.h |
69 | @@ -XXX,XX +XXX,XX @@ typedef struct SVQDescState { | 167 | @@ -XXX,XX +XXX,XX @@ |
70 | unsigned int ndescs; | 168 | #include "qemu/event_notifier.h" |
71 | } SVQDescState; | 169 | #include "hw/virtio/virtio.h" |
72 | 170 | #include "standard-headers/linux/vhost_types.h" | |
73 | +typedef struct VhostShadowVirtqueue VhostShadowVirtqueue; | 171 | +#include "hw/virtio/vhost-iova-tree.h" |
74 | + | 172 | |
75 | +/** | ||
76 | + * Callback to handle an avail buffer. | ||
77 | + * | ||
78 | + * @svq: Shadow virtqueue | ||
79 | + * @elem: Element placed in the queue by the guest | ||
80 | + * @vq_callback_opaque: Opaque | ||
81 | + * | ||
82 | + * Returns 0 if the vq is running as expected. | ||
83 | + * | ||
84 | + * Note that ownership of elem is transferred to the callback. | ||
85 | + */ | ||
86 | +typedef int (*VirtQueueAvailCallback)(VhostShadowVirtqueue *svq, | ||
87 | + VirtQueueElement *elem, | ||
88 | + void *vq_callback_opaque); | ||
89 | + | ||
90 | +typedef struct VhostShadowVirtqueueOps { | ||
91 | + VirtQueueAvailCallback avail_handler; | ||
92 | +} VhostShadowVirtqueueOps; | ||
93 | + | ||
94 | /* Shadow virtqueue to relay notifications */ | 173 | /* Shadow virtqueue to relay notifications */ |
95 | typedef struct VhostShadowVirtqueue { | 174 | typedef struct VhostShadowVirtqueue { |
96 | /* Shadow vring */ | ||
97 | @@ -XXX,XX +XXX,XX @@ typedef struct VhostShadowVirtqueue { | 175 | @@ -XXX,XX +XXX,XX @@ typedef struct VhostShadowVirtqueue { |
98 | */ | 176 | /* Virtio device */ |
99 | uint16_t *desc_next; | 177 | VirtIODevice *vdev; |
100 | 178 | ||
101 | + /* Caller callbacks */ | 179 | + /* IOVA mapping */ |
102 | + const VhostShadowVirtqueueOps *ops; | 180 | + VhostIOVATree *iova_tree; |
103 | + | 181 | + |
104 | + /* Caller callbacks opaque */ | 182 | /* Map for use the guest's descriptors */ |
105 | + void *ops_opaque; | 183 | VirtQueueElement **ring_id_maps; |
106 | + | ||
107 | /* Next head to expose to the device */ | ||
108 | uint16_t shadow_avail_idx; | ||
109 | 184 | ||
110 | @@ -XXX,XX +XXX,XX @@ void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev, | 185 | @@ -XXX,XX +XXX,XX @@ void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev, |
111 | VirtQueue *vq); | 186 | VirtQueue *vq); |
112 | void vhost_svq_stop(VhostShadowVirtqueue *svq); | 187 | void vhost_svq_stop(VhostShadowVirtqueue *svq); |
113 | 188 | ||
114 | -VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree); | 189 | -VhostShadowVirtqueue *vhost_svq_new(void); |
115 | +VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree, | 190 | +VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree); |
116 | + const VhostShadowVirtqueueOps *ops, | ||
117 | + void *ops_opaque); | ||
118 | 191 | ||
119 | void vhost_svq_free(gpointer vq); | 192 | void vhost_svq_free(gpointer vq); |
120 | G_DEFINE_AUTOPTR_CLEANUP_FUNC(VhostShadowVirtqueue, vhost_svq_free); | 193 | G_DEFINE_AUTOPTR_CLEANUP_FUNC(VhostShadowVirtqueue, vhost_svq_free); |
121 | diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c | 194 | diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c |
122 | index XXXXXXX..XXXXXXX 100644 | 195 | index XXXXXXX..XXXXXXX 100644 |
123 | --- a/hw/virtio/vhost-vdpa.c | 196 | --- a/hw/virtio/vhost-vdpa.c |
124 | +++ b/hw/virtio/vhost-vdpa.c | 197 | +++ b/hw/virtio/vhost-vdpa.c |
198 | @@ -XXX,XX +XXX,XX @@ static void vhost_vdpa_listener_region_add(MemoryListener *listener, | ||
199 | vaddr, section->readonly); | ||
200 | |||
201 | llsize = int128_sub(llend, int128_make64(iova)); | ||
202 | + if (v->shadow_vqs_enabled) { | ||
203 | + DMAMap mem_region = { | ||
204 | + .translated_addr = (hwaddr)(uintptr_t)vaddr, | ||
205 | + .size = int128_get64(llsize) - 1, | ||
206 | + .perm = IOMMU_ACCESS_FLAG(true, section->readonly), | ||
207 | + }; | ||
208 | + | ||
209 | + int r = vhost_iova_tree_map_alloc(v->iova_tree, &mem_region); | ||
210 | + if (unlikely(r != IOVA_OK)) { | ||
211 | + error_report("Can't allocate a mapping (%d)", r); | ||
212 | + goto fail; | ||
213 | + } | ||
214 | + | ||
215 | + iova = mem_region.iova; | ||
216 | + } | ||
217 | |||
218 | vhost_vdpa_iotlb_batch_begin_once(v); | ||
219 | ret = vhost_vdpa_dma_map(v, iova, int128_get64(llsize), | ||
220 | @@ -XXX,XX +XXX,XX @@ static void vhost_vdpa_listener_region_del(MemoryListener *listener, | ||
221 | |||
222 | llsize = int128_sub(llend, int128_make64(iova)); | ||
223 | |||
224 | + if (v->shadow_vqs_enabled) { | ||
225 | + const DMAMap *result; | ||
226 | + const void *vaddr = memory_region_get_ram_ptr(section->mr) + | ||
227 | + section->offset_within_region + | ||
228 | + (iova - section->offset_within_address_space); | ||
229 | + DMAMap mem_region = { | ||
230 | + .translated_addr = (hwaddr)(uintptr_t)vaddr, | ||
231 | + .size = int128_get64(llsize) - 1, | ||
232 | + }; | ||
233 | + | ||
234 | + result = vhost_iova_tree_find_iova(v->iova_tree, &mem_region); | ||
235 | + iova = result->iova; | ||
236 | + vhost_iova_tree_remove(v->iova_tree, &mem_region); | ||
237 | + } | ||
238 | vhost_vdpa_iotlb_batch_begin_once(v); | ||
239 | ret = vhost_vdpa_dma_unmap(v, iova, int128_get64(llsize)); | ||
240 | if (ret) { | ||
125 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v, | 241 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v, |
126 | 242 | ||
127 | shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_svq_free); | 243 | shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_svq_free); |
128 | for (unsigned n = 0; n < hdev->nvqs; ++n) { | 244 | for (unsigned n = 0; n < hdev->nvqs; ++n) { |
129 | - g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new(v->iova_tree); | 245 | - g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new(); |
130 | + g_autoptr(VhostShadowVirtqueue) svq; | 246 | + g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new(v->iova_tree); |
131 | 247 | ||
132 | + svq = vhost_svq_new(v->iova_tree, NULL, NULL); | ||
133 | if (unlikely(!svq)) { | 248 | if (unlikely(!svq)) { |
134 | error_setg(errp, "Cannot create svq %u", n); | 249 | error_setg(errp, "Cannot create svq %u", n); |
135 | return -1; | 250 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_svq_set_fds(struct vhost_dev *dev, |
251 | /** | ||
252 | * Unmap a SVQ area in the device | ||
253 | */ | ||
254 | -static bool vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v, hwaddr iova, | ||
255 | - hwaddr size) | ||
256 | +static bool vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v, | ||
257 | + const DMAMap *needle) | ||
258 | { | ||
259 | + const DMAMap *result = vhost_iova_tree_find_iova(v->iova_tree, needle); | ||
260 | + hwaddr size; | ||
261 | int r; | ||
262 | |||
263 | - size = ROUND_UP(size, qemu_real_host_page_size); | ||
264 | - r = vhost_vdpa_dma_unmap(v, iova, size); | ||
265 | + if (unlikely(!result)) { | ||
266 | + error_report("Unable to find SVQ address to unmap"); | ||
267 | + return false; | ||
268 | + } | ||
269 | + | ||
270 | + size = ROUND_UP(result->size, qemu_real_host_page_size); | ||
271 | + r = vhost_vdpa_dma_unmap(v, result->iova, size); | ||
272 | return r == 0; | ||
273 | } | ||
274 | |||
275 | static bool vhost_vdpa_svq_unmap_rings(struct vhost_dev *dev, | ||
276 | const VhostShadowVirtqueue *svq) | ||
277 | { | ||
278 | + DMAMap needle = {}; | ||
279 | struct vhost_vdpa *v = dev->opaque; | ||
280 | struct vhost_vring_addr svq_addr; | ||
281 | - size_t device_size = vhost_svq_device_area_size(svq); | ||
282 | - size_t driver_size = vhost_svq_driver_area_size(svq); | ||
283 | bool ok; | ||
284 | |||
285 | vhost_svq_get_vring_addr(svq, &svq_addr); | ||
286 | |||
287 | - ok = vhost_vdpa_svq_unmap_ring(v, svq_addr.desc_user_addr, driver_size); | ||
288 | + needle.translated_addr = svq_addr.desc_user_addr; | ||
289 | + ok = vhost_vdpa_svq_unmap_ring(v, &needle); | ||
290 | if (unlikely(!ok)) { | ||
291 | return false; | ||
292 | } | ||
293 | |||
294 | - return vhost_vdpa_svq_unmap_ring(v, svq_addr.used_user_addr, device_size); | ||
295 | + needle.translated_addr = svq_addr.used_user_addr; | ||
296 | + return vhost_vdpa_svq_unmap_ring(v, &needle); | ||
297 | +} | ||
298 | + | ||
299 | +/** | ||
300 | + * Map the SVQ area in the device | ||
301 | + * | ||
302 | + * @v: Vhost-vdpa device | ||
303 | + * @needle: The area to search iova | ||
304 | + * @errorp: Error pointer | ||
305 | + */ | ||
306 | +static bool vhost_vdpa_svq_map_ring(struct vhost_vdpa *v, DMAMap *needle, | ||
307 | + Error **errp) | ||
308 | +{ | ||
309 | + int r; | ||
310 | + | ||
311 | + r = vhost_iova_tree_map_alloc(v->iova_tree, needle); | ||
312 | + if (unlikely(r != IOVA_OK)) { | ||
313 | + error_setg(errp, "Cannot allocate iova (%d)", r); | ||
314 | + return false; | ||
315 | + } | ||
316 | + | ||
317 | + r = vhost_vdpa_dma_map(v, needle->iova, needle->size + 1, | ||
318 | + (void *)(uintptr_t)needle->translated_addr, | ||
319 | + needle->perm == IOMMU_RO); | ||
320 | + if (unlikely(r != 0)) { | ||
321 | + error_setg_errno(errp, -r, "Cannot map region to device"); | ||
322 | + vhost_iova_tree_remove(v->iova_tree, needle); | ||
323 | + } | ||
324 | + | ||
325 | + return r == 0; | ||
326 | } | ||
327 | |||
328 | /** | ||
329 | @@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_svq_map_rings(struct vhost_dev *dev, | ||
330 | struct vhost_vring_addr *addr, | ||
331 | Error **errp) | ||
332 | { | ||
333 | + DMAMap device_region, driver_region; | ||
334 | + struct vhost_vring_addr svq_addr; | ||
335 | struct vhost_vdpa *v = dev->opaque; | ||
336 | size_t device_size = vhost_svq_device_area_size(svq); | ||
337 | size_t driver_size = vhost_svq_driver_area_size(svq); | ||
338 | - int r; | ||
339 | + size_t avail_offset; | ||
340 | + bool ok; | ||
341 | |||
342 | ERRP_GUARD(); | ||
343 | - vhost_svq_get_vring_addr(svq, addr); | ||
344 | + vhost_svq_get_vring_addr(svq, &svq_addr); | ||
345 | |||
346 | - r = vhost_vdpa_dma_map(v, addr->desc_user_addr, driver_size, | ||
347 | - (void *)(uintptr_t)addr->desc_user_addr, true); | ||
348 | - if (unlikely(r != 0)) { | ||
349 | - error_setg_errno(errp, -r, "Cannot create vq driver region: "); | ||
350 | + driver_region = (DMAMap) { | ||
351 | + .translated_addr = svq_addr.desc_user_addr, | ||
352 | + .size = driver_size - 1, | ||
353 | + .perm = IOMMU_RO, | ||
354 | + }; | ||
355 | + ok = vhost_vdpa_svq_map_ring(v, &driver_region, errp); | ||
356 | + if (unlikely(!ok)) { | ||
357 | + error_prepend(errp, "Cannot create vq driver region: "); | ||
358 | return false; | ||
359 | } | ||
360 | + addr->desc_user_addr = driver_region.iova; | ||
361 | + avail_offset = svq_addr.avail_user_addr - svq_addr.desc_user_addr; | ||
362 | + addr->avail_user_addr = driver_region.iova + avail_offset; | ||
363 | |||
364 | - r = vhost_vdpa_dma_map(v, addr->used_user_addr, device_size, | ||
365 | - (void *)(intptr_t)addr->used_user_addr, false); | ||
366 | - if (unlikely(r != 0)) { | ||
367 | - error_setg_errno(errp, -r, "Cannot create vq device region: "); | ||
368 | + device_region = (DMAMap) { | ||
369 | + .translated_addr = svq_addr.used_user_addr, | ||
370 | + .size = device_size - 1, | ||
371 | + .perm = IOMMU_RW, | ||
372 | + }; | ||
373 | + ok = vhost_vdpa_svq_map_ring(v, &device_region, errp); | ||
374 | + if (unlikely(!ok)) { | ||
375 | + error_prepend(errp, "Cannot create vq device region: "); | ||
376 | + vhost_vdpa_svq_unmap_ring(v, &driver_region); | ||
377 | } | ||
378 | + addr->used_user_addr = device_region.iova; | ||
379 | |||
380 | - return r == 0; | ||
381 | + return ok; | ||
382 | } | ||
383 | |||
384 | static bool vhost_vdpa_svq_setup(struct vhost_dev *dev, | ||
385 | diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h | ||
386 | index XXXXXXX..XXXXXXX 100644 | ||
387 | --- a/include/hw/virtio/vhost-vdpa.h | ||
388 | +++ b/include/hw/virtio/vhost-vdpa.h | ||
389 | @@ -XXX,XX +XXX,XX @@ | ||
390 | |||
391 | #include <gmodule.h> | ||
392 | |||
393 | +#include "hw/virtio/vhost-iova-tree.h" | ||
394 | #include "hw/virtio/virtio.h" | ||
395 | #include "standard-headers/linux/vhost_types.h" | ||
396 | |||
397 | @@ -XXX,XX +XXX,XX @@ typedef struct vhost_vdpa { | ||
398 | MemoryListener listener; | ||
399 | struct vhost_vdpa_iova_range iova_range; | ||
400 | bool shadow_vqs_enabled; | ||
401 | + /* IOVA mapping used by the Shadow Virtqueue */ | ||
402 | + VhostIOVATree *iova_tree; | ||
403 | GPtrArray *shadow_vqs; | ||
404 | struct vhost_dev *dev; | ||
405 | VhostVDPAHostNotifier notifier[VIRTIO_QUEUE_MAX]; | ||
136 | -- | 406 | -- |
137 | 2.7.4 | 407 | 2.7.4 |
138 | 408 | ||
139 | 409 | diff view generated by jsdifflib |
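
The translation patch above rebases each guest buffer from its qemu virtual address onto the IOVA of the covering mapping. A small sketch of that rebase step, assuming the covering map has already been found (lookup elided); names are illustrative:

#include <stdbool.h>
#include <stdint.h>

typedef struct {
    uint64_t iova;
    uint64_t translated_addr;
    uint64_t size; /* inclusive: last byte is translated_addr + size */
} Map;

/* Assumes len >= 1 and no uint64_t wraparound in hva + len - 1. */
static bool rebase(const Map *map, uint64_t hva, uint64_t len,
                   uint64_t *iova)
{
    if (hva < map->translated_addr ||
        hva + len - 1 > map->translated_addr + map->size) {
        return false; /* buffer is not fully covered by this mapping */
    }
    /* Keep the offset into the mapping, swap the base address. */
    *iova = map->iova + (hva - map->translated_addr);
    return true;
}

The second check is the same guard the patch reports as "Guest buffer expands over iova range": a buffer must be fully covered by one mapping for a single rebased descriptor to be valid.
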
1 | From: Eugenio Pérez <eperezma@redhat.com> | 1 | From: Eugenio Pérez <eperezma@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | Shadow CVQ will copy buffers into qemu's VA, so we avoid TOCTOU attacks | 3 | This is needed to achieve migration, so the destination can restore its |
4 | where the guest could set a different state in the qemu device model and vdpa | 4 | index. |
5 | device. | ||
6 | 6 | Setting the base as the last used idx, so the destination will see as |
7 | To do so, it needs to be able to map these new buffers to the device. | 7 | available all the entries that the device did not use, including the |
8 | 8 | in-flight ones. |
8 | processing ones. | ||
9 | |||
10 | This is ok for networking, but other kinds of devices might have | ||
11 | problems with these retransmissions. | ||
8 | 12 | ||
9 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> | 13 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> |
10 | Acked-by: Jason Wang <jasowang@redhat.com> | 14 | Acked-by: Michael S. Tsirkin <mst@redhat.com> |
11 | Reviewed-by: Michael S. Tsirkin <mst@redhat.com> | ||
12 | Signed-off-by: Jason Wang <jasowang@redhat.com> | 15 | Signed-off-by: Jason Wang <jasowang@redhat.com> |
13 | --- | 16 | --- |
14 | hw/virtio/vhost-vdpa.c | 7 +++---- | 17 | hw/virtio/vhost-vdpa.c | 17 +++++++++++++++++ |
15 | include/hw/virtio/vhost-vdpa.h | 4 ++++ | 18 | 1 file changed, 17 insertions(+) |
16 | 2 files changed, 7 insertions(+), 4 deletions(-) | ||
17 | 19 | ||
18 | diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c | 20 | diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c |
19 | index XXXXXXX..XXXXXXX 100644 | 21 | index XXXXXXX..XXXXXXX 100644 |
20 | --- a/hw/virtio/vhost-vdpa.c | 22 | --- a/hw/virtio/vhost-vdpa.c |
21 | +++ b/hw/virtio/vhost-vdpa.c | 23 | +++ b/hw/virtio/vhost-vdpa.c |
22 | @@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_listener_skipped_section(MemoryRegionSection *section, | 24 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_base(struct vhost_dev *dev, |
23 | return false; | 25 | static int vhost_vdpa_get_vring_base(struct vhost_dev *dev, |
24 | } | 26 | struct vhost_vring_state *ring) |
25 | |||
26 | -static int vhost_vdpa_dma_map(struct vhost_vdpa *v, hwaddr iova, hwaddr size, | ||
27 | - void *vaddr, bool readonly) | ||
28 | +int vhost_vdpa_dma_map(struct vhost_vdpa *v, hwaddr iova, hwaddr size, | ||
29 | + void *vaddr, bool readonly) | ||
30 | { | 27 | { |
31 | struct vhost_msg_v2 msg = {}; | 28 | + struct vhost_vdpa *v = dev->opaque; |
32 | int fd = v->device_fd; | 29 | int ret; |
33 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_dma_map(struct vhost_vdpa *v, hwaddr iova, hwaddr size, | 30 | |
31 | + if (v->shadow_vqs_enabled) { | ||
32 | + VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, | ||
33 | + ring->index); | ||
34 | + | ||
35 | + /* | ||
36 | + * Setting base as last used idx, so destination will see as available | ||
37 | + * all the entries that the device did not use, including the in-flight | ||
38 | + * processing ones. | ||
39 | + * | ||
40 | + * TODO: This is ok for networking, but other kinds of devices might | ||
41 | + * have problems with these retransmissions. | ||
42 | + */ | ||
43 | + ring->num = svq->last_used_idx; | ||
44 | + return 0; | ||
45 | + } | ||
46 | + | ||
47 | ret = vhost_vdpa_call(dev, VHOST_GET_VRING_BASE, ring); | ||
48 | trace_vhost_vdpa_get_vring_base(dev, ring->index, ring->num); | ||
34 | return ret; | 49 | return ret; |
35 | } | ||
36 | |||
37 | -static int vhost_vdpa_dma_unmap(struct vhost_vdpa *v, hwaddr iova, | ||
38 | - hwaddr size) | ||
39 | +int vhost_vdpa_dma_unmap(struct vhost_vdpa *v, hwaddr iova, hwaddr size) | ||
40 | { | ||
41 | struct vhost_msg_v2 msg = {}; | ||
42 | int fd = v->device_fd; | ||
43 | diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h | ||
44 | index XXXXXXX..XXXXXXX 100644 | ||
45 | --- a/include/hw/virtio/vhost-vdpa.h | ||
46 | +++ b/include/hw/virtio/vhost-vdpa.h | ||
47 | @@ -XXX,XX +XXX,XX @@ typedef struct vhost_vdpa { | ||
48 | VhostVDPAHostNotifier notifier[VIRTIO_QUEUE_MAX]; | ||
49 | } VhostVDPA; | ||
50 | |||
51 | +int vhost_vdpa_dma_map(struct vhost_vdpa *v, hwaddr iova, hwaddr size, | ||
52 | + void *vaddr, bool readonly); | ||
53 | +int vhost_vdpa_dma_unmap(struct vhost_vdpa *v, hwaddr iova, hwaddr size); | ||
54 | + | ||
55 | #endif | ||
56 | -- | 50 | -- |
57 | 2.7.4 | 51 | 2.7.4 |
58 | 52 | ||
59 | 53 |
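
The newly exported calls follow a plain map/use/unmap discipline. Below
is a compilable sketch of how a CVQ-style caller might drive them; the
vhost_vdpa struct is trimmed and both function bodies are stubs, only
the signatures mirror the hunk above:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    typedef uint64_t hwaddr;
    struct vhost_vdpa { int device_fd; };

    /* Stub: the real function sends a VHOST_IOTLB_UPDATE message. */
    int vhost_vdpa_dma_map(struct vhost_vdpa *v, hwaddr iova, hwaddr size,
                           void *vaddr, bool readonly)
    {
        (void)v; (void)vaddr;
        printf("map   iova=0x%llx size=0x%llx ro=%d\n",
               (unsigned long long)iova, (unsigned long long)size, readonly);
        return 0;
    }

    /* Stub: the real function sends a VHOST_IOTLB_INVALIDATE message. */
    int vhost_vdpa_dma_unmap(struct vhost_vdpa *v, hwaddr iova, hwaddr size)
    {
        (void)v;
        printf("unmap iova=0x%llx size=0x%llx\n",
               (unsigned long long)iova, (unsigned long long)size);
        return 0;
    }

    int main(void)
    {
        struct vhost_vdpa v = { .device_fd = -1 };
        static uint8_t cvq_cmd[64];  /* qemu-owned copy of a guest buffer */
        const hwaddr iova = 0x2000;  /* illustrative IOVA */

        if (vhost_vdpa_dma_map(&v, iova, sizeof(cvq_cmd), cvq_cmd, true) == 0) {
            /* ... expose the buffer to the device through SVQ ... */
            vhost_vdpa_dma_unmap(&v, iova, sizeof(cvq_cmd));
        }
        return 0;
    }
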
1 | From: Eugenio Pérez <eperezma@redhat.com> | 1 | From: Eugenio Pérez <eperezma@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | In the next patch we will allow busypolling of this value. The compiler | 3 | Setting the log address would make the device start reporting invalid |
4 | sees a code path where shadow_used_idx, last_used_idx, and the vring used | 4 | dirty memory because the SVQ vrings are located in qemu's memory. |
5 | idx are never modified by the busypolling thread itself. | ||
6 | |||
7 | This was not an issue before, since we always cleared the device event | ||
8 | notifier before checking it, and that could act as a memory barrier. | ||
9 | However, the busypoll needs something similar to kernel READ_ONCE. | ||
10 | |||
11 | Let's add it here, separated from the polling. | ||
12 | 5 | ||
13 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> | 6 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> |
7 | Acked-by: Michael S. Tsirkin <mst@redhat.com> | ||
14 | Signed-off-by: Jason Wang <jasowang@redhat.com> | 8 | Signed-off-by: Jason Wang <jasowang@redhat.com> |
15 | --- | 9 | --- |
16 | hw/virtio/vhost-shadow-virtqueue.c | 3 ++- | 10 | hw/virtio/vhost-vdpa.c | 3 ++- |
17 | 1 file changed, 2 insertions(+), 1 deletion(-) | 11 | 1 file changed, 2 insertions(+), 1 deletion(-) |
18 | 12 | ||
19 | diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c | 13 | diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c |
20 | index XXXXXXX..XXXXXXX 100644 | 14 | index XXXXXXX..XXXXXXX 100644 |
21 | --- a/hw/virtio/vhost-shadow-virtqueue.c | 15 | --- a/hw/virtio/vhost-vdpa.c |
22 | +++ b/hw/virtio/vhost-shadow-virtqueue.c | 16 | +++ b/hw/virtio/vhost-vdpa.c |
23 | @@ -XXX,XX +XXX,XX @@ static void vhost_handle_guest_kick_notifier(EventNotifier *n) | 17 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started) |
24 | 18 | static int vhost_vdpa_set_log_base(struct vhost_dev *dev, uint64_t base, | |
25 | static bool vhost_svq_more_used(VhostShadowVirtqueue *svq) | 19 | struct vhost_log *log) |
26 | { | 20 | { |
27 | + uint16_t *used_idx = &svq->vring.used->idx; | 21 | - if (vhost_vdpa_one_time_request(dev)) { |
28 | if (svq->last_used_idx != svq->shadow_used_idx) { | 22 | + struct vhost_vdpa *v = dev->opaque; |
29 | return true; | 23 | + if (v->shadow_vqs_enabled || vhost_vdpa_one_time_request(dev)) { |
24 | return 0; | ||
30 | } | 25 | } |
31 | 26 | ||
32 | - svq->shadow_used_idx = cpu_to_le16(svq->vring.used->idx); | ||
33 | + svq->shadow_used_idx = cpu_to_le16(*(volatile uint16_t *)used_idx); | ||
34 | |||
35 | return svq->last_used_idx != svq->shadow_used_idx; | ||
36 | } | ||
37 | -- | 27 | -- |
38 | 2.7.4 | 28 | 2.7.4 |
39 | 29 | ||
40 | 30 |
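
The volatile cast above plays the role of the kernel's READ_ONCE(): it
forces a fresh load of the used index on every busypoll iteration
instead of letting the compiler hoist the read out of the loop. A
standalone sketch of the same idiom, with illustrative names:

    #include <stdint.h>

    /* Force a real load on each call; without it the compiler may read
     * *p once and spin on the cached value forever, since nothing in
     * this thread ever writes to it. */
    static inline uint16_t read_once_u16(const uint16_t *p)
    {
        return *(const volatile uint16_t *)p;
    }

    /* Busy-wait until the device side bumps the used index. */
    void wait_for_used(const uint16_t *used_idx, uint16_t last_seen)
    {
        while (read_once_u16(used_idx) == last_seen) {
            /* spin; real code would also bound the loop or yield */
        }
    }
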
Deleted patch | |||
---|---|---|---|
1 | From: Eugenio Pérez <eperezma@redhat.com> | ||
2 | 1 | ||
3 | The series needs to expose vhost_svq_add with full functionality, | ||
4 | including the kick. | ||
5 | |||
6 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> | ||
7 | Reviewed-by: Michael S. Tsirkin <mst@redhat.com> | ||
8 | Signed-off-by: Jason Wang <jasowang@redhat.com> | ||
9 | --- | ||
10 | hw/virtio/vhost-shadow-virtqueue.c | 2 +- | ||
11 | 1 file changed, 1 insertion(+), 1 deletion(-) | ||
12 | |||
13 | diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c | ||
14 | index XXXXXXX..XXXXXXX 100644 | ||
15 | --- a/hw/virtio/vhost-shadow-virtqueue.c | ||
16 | +++ b/hw/virtio/vhost-shadow-virtqueue.c | ||
17 | @@ -XXX,XX +XXX,XX @@ static bool vhost_svq_add(VhostShadowVirtqueue *svq, VirtQueueElement *elem) | ||
18 | } | ||
19 | |||
20 | svq->ring_id_maps[qemu_head] = elem; | ||
21 | + vhost_svq_kick(svq); | ||
22 | return true; | ||
23 | } | ||
24 | |||
25 | @@ -XXX,XX +XXX,XX @@ static void vhost_handle_guest_kick(VhostShadowVirtqueue *svq) | ||
26 | /* VQ is broken, just return and ignore any other kicks */ | ||
27 | return; | ||
28 | } | ||
29 | - vhost_svq_kick(svq); | ||
30 | } | ||
31 | |||
32 | virtio_queue_set_notification(svq->vq, true); | ||
33 | -- | ||
34 | 2.7.4 | ||
35 | |||
36 |
Deleted patch | |||
---|---|---|---|
1 | From: Eugenio Pérez <eperezma@redhat.com> | ||
2 | 1 | ||
3 | The series needs to expose vhost_svq_add with full functionality, | ||
4 | including checking for a full queue. | ||
5 | |||
6 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> | ||
7 | Reviewed-by: Michael S. Tsirkin <mst@redhat.com> | ||
8 | Signed-off-by: Jason Wang <jasowang@redhat.com> | ||
9 | --- | ||
10 | hw/virtio/vhost-shadow-virtqueue.c | 59 +++++++++++++++++++++----------------- | ||
11 | 1 file changed, 33 insertions(+), 26 deletions(-) | ||
12 | |||
13 | diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c | ||
14 | index XXXXXXX..XXXXXXX 100644 | ||
15 | --- a/hw/virtio/vhost-shadow-virtqueue.c | ||
16 | +++ b/hw/virtio/vhost-shadow-virtqueue.c | ||
17 | @@ -XXX,XX +XXX,XX @@ static void vhost_svq_kick(VhostShadowVirtqueue *svq) | ||
18 | * Add an element to a SVQ. | ||
19 | * | ||
20 | * The caller must check that there is enough slots for the new element. It | ||
21 | - * takes ownership of the element: In case of failure, it is free and the SVQ | ||
22 | - * is considered broken. | ||
23 | + * takes ownership of the element: In case of failure not ENOSPC, it is free. | ||
24 | + * | ||
25 | + * Return -EINVAL if element is invalid, -ENOSPC if dev queue is full | ||
26 | */ | ||
27 | -static bool vhost_svq_add(VhostShadowVirtqueue *svq, VirtQueueElement *elem) | ||
28 | +static int vhost_svq_add(VhostShadowVirtqueue *svq, VirtQueueElement *elem) | ||
29 | { | ||
30 | unsigned qemu_head; | ||
31 | - bool ok = vhost_svq_add_split(svq, elem, &qemu_head); | ||
32 | + unsigned ndescs = elem->in_num + elem->out_num; | ||
33 | + bool ok; | ||
34 | + | ||
35 | + if (unlikely(ndescs > vhost_svq_available_slots(svq))) { | ||
36 | + return -ENOSPC; | ||
37 | + } | ||
38 | + | ||
39 | + ok = vhost_svq_add_split(svq, elem, &qemu_head); | ||
40 | if (unlikely(!ok)) { | ||
41 | g_free(elem); | ||
42 | - return false; | ||
43 | + return -EINVAL; | ||
44 | } | ||
45 | |||
46 | svq->ring_id_maps[qemu_head] = elem; | ||
47 | vhost_svq_kick(svq); | ||
48 | - return true; | ||
49 | + return 0; | ||
50 | } | ||
51 | |||
52 | /** | ||
53 | @@ -XXX,XX +XXX,XX @@ static void vhost_handle_guest_kick(VhostShadowVirtqueue *svq) | ||
54 | |||
55 | while (true) { | ||
56 | VirtQueueElement *elem; | ||
57 | - bool ok; | ||
58 | + int r; | ||
59 | |||
60 | if (svq->next_guest_avail_elem) { | ||
61 | elem = g_steal_pointer(&svq->next_guest_avail_elem); | ||
62 | @@ -XXX,XX +XXX,XX @@ static void vhost_handle_guest_kick(VhostShadowVirtqueue *svq) | ||
63 | break; | ||
64 | } | ||
65 | |||
66 | - if (elem->out_num + elem->in_num > vhost_svq_available_slots(svq)) { | ||
67 | - /* | ||
68 | - * This condition is possible since a contiguous buffer in GPA | ||
69 | - * does not imply a contiguous buffer in qemu's VA | ||
70 | - * scatter-gather segments. If that happens, the buffer exposed | ||
71 | - * to the device needs to be a chain of descriptors at this | ||
72 | - * moment. | ||
73 | - * | ||
74 | - * SVQ cannot hold more available buffers if we are here: | ||
75 | - * queue the current guest descriptor and ignore further kicks | ||
76 | - * until some elements are used. | ||
77 | - */ | ||
78 | - svq->next_guest_avail_elem = elem; | ||
79 | - return; | ||
80 | - } | ||
81 | - | ||
82 | - ok = vhost_svq_add(svq, elem); | ||
83 | - if (unlikely(!ok)) { | ||
84 | - /* VQ is broken, just return and ignore any other kicks */ | ||
85 | + r = vhost_svq_add(svq, elem); | ||
86 | + if (unlikely(r != 0)) { | ||
87 | + if (r == -ENOSPC) { | ||
88 | + /* | ||
89 | + * This condition is possible since a contiguous buffer in | ||
90 | + * GPA does not imply a contiguous buffer in qemu's VA | ||
91 | + * scatter-gather segments. If that happens, the buffer | ||
92 | + * exposed to the device needs to be a chain of descriptors | ||
93 | + * at this moment. | ||
94 | + * | ||
95 | + * SVQ cannot hold more available buffers if we are here: | ||
96 | + * queue the current guest descriptor and ignore kicks | ||
97 | + * until some elements are used. | ||
98 | + */ | ||
99 | + svq->next_guest_avail_elem = elem; | ||
100 | + } | ||
101 | + | ||
102 | + /* VQ is full or broken, just return and ignore kicks */ | ||
103 | return; | ||
104 | } | ||
105 | } | ||
106 | -- | ||
107 | 2.7.4 | ||
108 | |||
109 |
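
A sketch of the calling convention this patch introduces, assuming
-ENOSPC is the only recoverable error; fake_svq_add below is a made-up
stand-in for the real function:

    #include <errno.h>
    #include <stdio.h>

    /* Stand-in mimicking the new vhost_svq_add() error contract. */
    static int fake_svq_add(int free_slots, int ndescs)
    {
        if (ndescs > free_slots) {
            return -ENOSPC; /* queue full: caller keeps the element */
        }
        return 0;           /* added; other failures would free it */
    }

    int main(void)
    {
        int r = fake_svq_add(1, 3);

        if (r == -ENOSPC) {
            printf("full: stash the element, retry on next used buffer\n");
        } else if (r < 0) {
            printf("broken: element was freed, stop forwarding\n");
        }
        return 0;
    }
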
Deleted patch | |||
---|---|---|---|
1 | From: Eugenio Pérez <eperezma@redhat.com> | ||
2 | 1 | ||
3 | VirtQueueElement comes from the guest, but we are moving SVQ towards | ||
4 | being able to modify the element presented to the device without the | ||
5 | guest's knowledge. | ||
6 | |||
7 | To do so, make SVQ accept sg buffers directly, instead of using | ||
8 | VirtQueueElement. | ||
9 | |||
10 | Add vhost_svq_add_element as a convenience wrapper for guest elements. | ||
11 | |||
12 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> | ||
13 | Acked-by: Jason Wang <jasowang@redhat.com> | ||
14 | Reviewed-by: Michael S. Tsirkin <mst@redhat.com> | ||
15 | Signed-off-by: Jason Wang <jasowang@redhat.com> | ||
16 | --- | ||
17 | hw/virtio/vhost-shadow-virtqueue.c | 33 ++++++++++++++++++++++----------- | ||
18 | 1 file changed, 22 insertions(+), 11 deletions(-) | ||
19 | |||
20 | diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c | ||
21 | index XXXXXXX..XXXXXXX 100644 | ||
22 | --- a/hw/virtio/vhost-shadow-virtqueue.c | ||
23 | +++ b/hw/virtio/vhost-shadow-virtqueue.c | ||
24 | @@ -XXX,XX +XXX,XX @@ static bool vhost_svq_vring_write_descs(VhostShadowVirtqueue *svq, hwaddr *sg, | ||
25 | } | ||
26 | |||
27 | static bool vhost_svq_add_split(VhostShadowVirtqueue *svq, | ||
28 | - VirtQueueElement *elem, unsigned *head) | ||
29 | + const struct iovec *out_sg, size_t out_num, | ||
30 | + const struct iovec *in_sg, size_t in_num, | ||
31 | + unsigned *head) | ||
32 | { | ||
33 | unsigned avail_idx; | ||
34 | vring_avail_t *avail = svq->vring.avail; | ||
35 | bool ok; | ||
36 | - g_autofree hwaddr *sgs = g_new(hwaddr, MAX(elem->out_num, elem->in_num)); | ||
37 | + g_autofree hwaddr *sgs = g_new(hwaddr, MAX(out_num, in_num)); | ||
38 | |||
39 | *head = svq->free_head; | ||
40 | |||
41 | /* We need some descriptors here */ | ||
42 | - if (unlikely(!elem->out_num && !elem->in_num)) { | ||
43 | + if (unlikely(!out_num && !in_num)) { | ||
44 | qemu_log_mask(LOG_GUEST_ERROR, | ||
45 | "Guest provided element with no descriptors"); | ||
46 | return false; | ||
47 | } | ||
48 | |||
49 | - ok = vhost_svq_vring_write_descs(svq, sgs, elem->out_sg, elem->out_num, | ||
50 | - elem->in_num > 0, false); | ||
51 | + ok = vhost_svq_vring_write_descs(svq, sgs, out_sg, out_num, in_num > 0, | ||
52 | + false); | ||
53 | if (unlikely(!ok)) { | ||
54 | return false; | ||
55 | } | ||
56 | |||
57 | - ok = vhost_svq_vring_write_descs(svq, sgs, elem->in_sg, elem->in_num, false, | ||
58 | - true); | ||
59 | + ok = vhost_svq_vring_write_descs(svq, sgs, in_sg, in_num, false, true); | ||
60 | if (unlikely(!ok)) { | ||
61 | return false; | ||
62 | } | ||
63 | @@ -XXX,XX +XXX,XX @@ static void vhost_svq_kick(VhostShadowVirtqueue *svq) | ||
64 | * | ||
65 | * Return -EINVAL if element is invalid, -ENOSPC if dev queue is full | ||
66 | */ | ||
67 | -static int vhost_svq_add(VhostShadowVirtqueue *svq, VirtQueueElement *elem) | ||
68 | +static int vhost_svq_add(VhostShadowVirtqueue *svq, const struct iovec *out_sg, | ||
69 | + size_t out_num, const struct iovec *in_sg, | ||
70 | + size_t in_num, VirtQueueElement *elem) | ||
71 | { | ||
72 | unsigned qemu_head; | ||
73 | - unsigned ndescs = elem->in_num + elem->out_num; | ||
74 | + unsigned ndescs = in_num + out_num; | ||
75 | bool ok; | ||
76 | |||
77 | if (unlikely(ndescs > vhost_svq_available_slots(svq))) { | ||
78 | return -ENOSPC; | ||
79 | } | ||
80 | |||
81 | - ok = vhost_svq_add_split(svq, elem, &qemu_head); | ||
82 | + ok = vhost_svq_add_split(svq, out_sg, out_num, in_sg, in_num, &qemu_head); | ||
83 | if (unlikely(!ok)) { | ||
84 | g_free(elem); | ||
85 | return -EINVAL; | ||
86 | @@ -XXX,XX +XXX,XX @@ static int vhost_svq_add(VhostShadowVirtqueue *svq, VirtQueueElement *elem) | ||
87 | return 0; | ||
88 | } | ||
89 | |||
90 | +/* Convenience wrapper to add a guest's element to SVQ */ | ||
91 | +static int vhost_svq_add_element(VhostShadowVirtqueue *svq, | ||
92 | + VirtQueueElement *elem) | ||
93 | +{ | ||
94 | + return vhost_svq_add(svq, elem->out_sg, elem->out_num, elem->in_sg, | ||
95 | + elem->in_num, elem); | ||
96 | +} | ||
97 | + | ||
98 | /** | ||
99 | * Forward available buffers. | ||
100 | * | ||
101 | @@ -XXX,XX +XXX,XX @@ static void vhost_handle_guest_kick(VhostShadowVirtqueue *svq) | ||
102 | break; | ||
103 | } | ||
104 | |||
105 | - r = vhost_svq_add(svq, elem); | ||
106 | + r = vhost_svq_add_element(svq, elem); | ||
107 | if (unlikely(r != 0)) { | ||
108 | if (r == -ENOSPC) { | ||
109 | /* | ||
110 | -- | ||
111 | 2.7.4 | ||
112 | |||
113 |
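
With VirtQueueElement out of the signature, a caller can describe
qemu-owned buffers directly as scatter-gather lists, which is what the
CVQ shadowing later in the series relies on. A self-contained sketch of
building such iovecs; the commented-out call shows the shape only, the
real signature is in the hunk above:

    #include <stdio.h>
    #include <sys/uio.h>

    int main(void)
    {
        /* One out buffer (the command) and one in buffer (the ack). */
        unsigned char cmd[16] = { 0x01 /* class */, 0x00 /* command */ };
        unsigned char ack = 0xff;

        struct iovec out_sg[] = { { .iov_base = cmd,  .iov_len = sizeof(cmd) } };
        struct iovec in_sg[]  = { { .iov_base = &ack, .iov_len = sizeof(ack) } };

        /* A caller would now do something like:
         *     vhost_svq_add(svq, out_sg, 1, in_sg, 1, elem);
         */
        printf("out: %zu bytes, in: %zu bytes\n",
               out_sg[0].iov_len, in_sg[0].iov_len);
        return 0;
    }
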
Deleted patch | |||
---|---|---|---|
1 | From: Eugenio Pérez <eperezma@redhat.com> | ||
2 | 1 | ||
3 | A guest buffer that is contiguous in GPA may need multiple descriptors | ||
4 | in qemu's VA, so SVQ should track its length separately. | ||
5 | |||
6 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> | ||
7 | Reviewed-by: Michael S. Tsirkin <mst@redhat.com> | ||
8 | Signed-off-by: Jason Wang <jasowang@redhat.com> | ||
9 | --- | ||
10 | hw/virtio/vhost-shadow-virtqueue.c | 4 ++-- | ||
11 | hw/virtio/vhost-shadow-virtqueue.h | 6 ++++++ | ||
12 | 2 files changed, 8 insertions(+), 2 deletions(-) | ||
13 | |||
14 | diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c | ||
15 | index XXXXXXX..XXXXXXX 100644 | ||
16 | --- a/hw/virtio/vhost-shadow-virtqueue.c | ||
17 | +++ b/hw/virtio/vhost-shadow-virtqueue.c | ||
18 | @@ -XXX,XX +XXX,XX @@ static int vhost_svq_add(VhostShadowVirtqueue *svq, const struct iovec *out_sg, | ||
19 | } | ||
20 | |||
21 | svq->desc_state[qemu_head].elem = elem; | ||
22 | + svq->desc_state[qemu_head].ndescs = ndescs; | ||
23 | vhost_svq_kick(svq); | ||
24 | return 0; | ||
25 | } | ||
26 | @@ -XXX,XX +XXX,XX @@ static VirtQueueElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq, | ||
27 | return NULL; | ||
28 | } | ||
29 | |||
30 | - num = svq->desc_state[used_elem.id].elem->in_num + | ||
31 | - svq->desc_state[used_elem.id].elem->out_num; | ||
32 | + num = svq->desc_state[used_elem.id].ndescs; | ||
33 | last_used_chain = vhost_svq_last_desc_of_chain(svq, num, used_elem.id); | ||
34 | svq->desc_next[last_used_chain] = svq->free_head; | ||
35 | svq->free_head = used_elem.id; | ||
36 | diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h | ||
37 | index XXXXXXX..XXXXXXX 100644 | ||
38 | --- a/hw/virtio/vhost-shadow-virtqueue.h | ||
39 | +++ b/hw/virtio/vhost-shadow-virtqueue.h | ||
40 | @@ -XXX,XX +XXX,XX @@ | ||
41 | |||
42 | typedef struct SVQDescState { | ||
43 | VirtQueueElement *elem; | ||
44 | + | ||
45 | + /* | ||
46 | + * Number of descriptors exposed to the device. May or may not match | ||
47 | + * guest's | ||
48 | + */ | ||
49 | + unsigned int ndescs; | ||
50 | } SVQDescState; | ||
51 | |||
52 | /* Shadow virtqueue to relay notifications */ | ||
53 | -- | ||
54 | 2.7.4 | ||
55 | |||
56 |
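
A worked example of why the count is tracked at add time: one guest
descriptor covering 0x2000 bytes contiguous in GPA can straddle two
qemu VA ranges, so the device-side chain holds two descriptors. The
numbers below are made up for illustration:

    #include <stdio.h>

    int main(void)
    {
        unsigned guest_descs = 1;  /* one GPA-contiguous guest buffer */
        unsigned long va_chunks[] = { 0x1800, 0x0800 }; /* split in qemu's VA */
        unsigned ndescs = sizeof(va_chunks) / sizeof(va_chunks[0]);

        /* ndescs (2) != guest_descs (1), hence SVQDescState.ndescs. */
        printf("guest descriptors: %u, device descriptors: %u\n",
               guest_descs, ndescs);
        return 0;
    }
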
Deleted patch | |||
---|---|---|---|
1 | From: Eugenio Pérez <eperezma@redhat.com> | ||
2 | 1 | ||
3 | This allows external parts of SVQ to forward custom buffers to the | ||
4 | device. | ||
5 | |||
6 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> | ||
7 | Reviewed-by: Michael S. Tsirkin <mst@redhat.com> | ||
8 | Signed-off-by: Jason Wang <jasowang@redhat.com> | ||
9 | --- | ||
10 | hw/virtio/vhost-shadow-virtqueue.c | 6 +++--- | ||
11 | hw/virtio/vhost-shadow-virtqueue.h | 3 +++ | ||
12 | 2 files changed, 6 insertions(+), 3 deletions(-) | ||
13 | |||
14 | diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c | ||
15 | index XXXXXXX..XXXXXXX 100644 | ||
16 | --- a/hw/virtio/vhost-shadow-virtqueue.c | ||
17 | +++ b/hw/virtio/vhost-shadow-virtqueue.c | ||
18 | @@ -XXX,XX +XXX,XX @@ static void vhost_svq_kick(VhostShadowVirtqueue *svq) | ||
19 | * | ||
20 | * Return -EINVAL if element is invalid, -ENOSPC if dev queue is full | ||
21 | */ | ||
22 | -static int vhost_svq_add(VhostShadowVirtqueue *svq, const struct iovec *out_sg, | ||
23 | - size_t out_num, const struct iovec *in_sg, | ||
24 | - size_t in_num, VirtQueueElement *elem) | ||
25 | +int vhost_svq_add(VhostShadowVirtqueue *svq, const struct iovec *out_sg, | ||
26 | + size_t out_num, const struct iovec *in_sg, size_t in_num, | ||
27 | + VirtQueueElement *elem) | ||
28 | { | ||
29 | unsigned qemu_head; | ||
30 | unsigned ndescs = in_num + out_num; | ||
31 | diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h | ||
32 | index XXXXXXX..XXXXXXX 100644 | ||
33 | --- a/hw/virtio/vhost-shadow-virtqueue.h | ||
34 | +++ b/hw/virtio/vhost-shadow-virtqueue.h | ||
35 | @@ -XXX,XX +XXX,XX @@ bool vhost_svq_valid_features(uint64_t features, Error **errp); | ||
36 | |||
37 | void vhost_svq_push_elem(VhostShadowVirtqueue *svq, | ||
38 | const VirtQueueElement *elem, uint32_t len); | ||
39 | +int vhost_svq_add(VhostShadowVirtqueue *svq, const struct iovec *out_sg, | ||
40 | + size_t out_num, const struct iovec *in_sg, size_t in_num, | ||
41 | + VirtQueueElement *elem); | ||
42 | |||
43 | void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd); | ||
44 | void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd); | ||
45 | -- | ||
46 | 2.7.4 | ||
47 | |||
48 |
1 | From: Eugenio Pérez <eperezma@redhat.com> | 1 | From: Eugenio Pérez <eperezma@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | Since the vhost-vdpa device exposes _F_LOG, add a migration blocker if | 3 | SVQ is able to log the dirty bits by itself, so let's use it to not |
4 | it uses CVQ. | 4 | block migration. |
5 | 5 | ||
6 | However, qemu is able to migrate simple devices with no CVQ as long as | 6 | Also, ignore set and clear of VHOST_F_LOG_ALL on set_features if SVQ is |
7 | they use SVQ. To allow it, add a placeholder error to vhost_vdpa, and | 7 | enabled. Even if the device supports it, the reports would be nonsense |
8 | only add it to vhost_dev when used. The vhost_dev machinery places the | 8 | because SVQ memory is in the qemu region. |
9 | migration blocker if needed. | 9 | |
10 | The log region is still allocated. Future changes might skip that, but | ||
11 | this series is already long enough. | ||
10 | 12 | ||
11 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> | 13 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> |
12 | Reviewed-by: Michael S. Tsirkin <mst@redhat.com> | 14 | Acked-by: Michael S. Tsirkin <mst@redhat.com> |
13 | Signed-off-by: Jason Wang <jasowang@redhat.com> | 15 | Signed-off-by: Jason Wang <jasowang@redhat.com> |
14 | --- | 16 | --- |
15 | hw/virtio/vhost-vdpa.c | 15 +++++++++++++++ | 17 | hw/virtio/vhost-vdpa.c | 39 +++++++++++++++++++++++++++++++++++---- |
16 | include/hw/virtio/vhost-vdpa.h | 1 + | 18 | include/hw/virtio/vhost-vdpa.h | 1 + |
17 | 2 files changed, 16 insertions(+) | 19 | 2 files changed, 36 insertions(+), 4 deletions(-) |
18 | 20 | ||
19 | diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c | 21 | diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c |
20 | index XXXXXXX..XXXXXXX 100644 | 22 | index XXXXXXX..XXXXXXX 100644 |
21 | --- a/hw/virtio/vhost-vdpa.c | 23 | --- a/hw/virtio/vhost-vdpa.c |
22 | +++ b/hw/virtio/vhost-vdpa.c | 24 | +++ b/hw/virtio/vhost-vdpa.c |
23 | @@ -XXX,XX +XXX,XX @@ | 25 | @@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_one_time_request(struct vhost_dev *dev) |
24 | #include "hw/virtio/vhost-shadow-virtqueue.h" | 26 | return v->index != 0; |
25 | #include "hw/virtio/vhost-vdpa.h" | 27 | } |
26 | #include "exec/address-spaces.h" | 28 | |
27 | +#include "migration/blocker.h" | 29 | +static int vhost_vdpa_get_dev_features(struct vhost_dev *dev, |
28 | #include "qemu/cutils.h" | 30 | + uint64_t *features) |
29 | #include "qemu/main-loop.h" | 31 | +{ |
30 | #include "cpu.h" | 32 | + int ret; |
31 | @@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_svqs_start(struct vhost_dev *dev) | 33 | + |
32 | return true; | 34 | + ret = vhost_vdpa_call(dev, VHOST_GET_FEATURES, features); |
35 | + trace_vhost_vdpa_get_features(dev, *features); | ||
36 | + return ret; | ||
37 | +} | ||
38 | + | ||
39 | static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v, | ||
40 | Error **errp) | ||
41 | { | ||
42 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v, | ||
43 | return 0; | ||
33 | } | 44 | } |
34 | 45 | ||
35 | + if (v->migration_blocker) { | 46 | - r = hdev->vhost_ops->vhost_get_features(hdev, &dev_features); |
36 | + int r = migrate_add_blocker(v->migration_blocker, &err); | 47 | + r = vhost_vdpa_get_dev_features(hdev, &dev_features); |
37 | + if (unlikely(r < 0)) { | 48 | if (r != 0) { |
38 | + return false; | 49 | error_setg_errno(errp, -r, "Can't get vdpa device features"); |
50 | return r; | ||
51 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_mem_table(struct vhost_dev *dev, | ||
52 | static int vhost_vdpa_set_features(struct vhost_dev *dev, | ||
53 | uint64_t features) | ||
54 | { | ||
55 | + struct vhost_vdpa *v = dev->opaque; | ||
56 | int ret; | ||
57 | |||
58 | if (vhost_vdpa_one_time_request(dev)) { | ||
59 | return 0; | ||
60 | } | ||
61 | |||
62 | + if (v->shadow_vqs_enabled) { | ||
63 | + if ((v->acked_features ^ features) == BIT_ULL(VHOST_F_LOG_ALL)) { | ||
64 | + /* | ||
65 | + * QEMU is just trying to enable or disable logging. SVQ handles | ||
66 | + * this sepparately, so no need to forward this. | ||
67 | + */ | ||
68 | + v->acked_features = features; | ||
69 | + return 0; | ||
39 | + } | 70 | + } |
71 | + | ||
72 | + v->acked_features = features; | ||
73 | + | ||
74 | + /* We must not ack _F_LOG if SVQ is enabled */ | ||
75 | + features &= ~BIT_ULL(VHOST_F_LOG_ALL); | ||
40 | + } | 76 | + } |
41 | + | 77 | + |
42 | for (i = 0; i < v->shadow_vqs->len; ++i) { | 78 | trace_vhost_vdpa_set_features(dev, features); |
43 | VirtQueue *vq = virtio_get_queue(dev->vdev, dev->vq_index + i); | 79 | ret = vhost_vdpa_call(dev, VHOST_SET_FEATURES, &features); |
44 | VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i); | 80 | if (ret) { |
45 | @@ -XXX,XX +XXX,XX @@ err: | 81 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_call(struct vhost_dev *dev, |
46 | vhost_svq_stop(svq); | 82 | static int vhost_vdpa_get_features(struct vhost_dev *dev, |
47 | } | 83 | uint64_t *features) |
48 | 84 | { | |
49 | + if (v->migration_blocker) { | 85 | - int ret; |
50 | + migrate_del_blocker(v->migration_blocker); | 86 | + struct vhost_vdpa *v = dev->opaque; |
87 | + int ret = vhost_vdpa_get_dev_features(dev, features); | ||
88 | + | ||
89 | + if (ret == 0 && v->shadow_vqs_enabled) { | ||
90 | + /* Add SVQ logging capabilities */ | ||
91 | + *features |= BIT_ULL(VHOST_F_LOG_ALL); | ||
51 | + } | 92 | + } |
52 | + | 93 | |
53 | return false; | 94 | - ret = vhost_vdpa_call(dev, VHOST_GET_FEATURES, features); |
95 | - trace_vhost_vdpa_get_features(dev, *features); | ||
96 | return ret; | ||
54 | } | 97 | } |
55 | |||
56 | @@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_svqs_stop(struct vhost_dev *dev) | ||
57 | } | ||
58 | } | ||
59 | |||
60 | + if (v->migration_blocker) { | ||
61 | + migrate_del_blocker(v->migration_blocker); | ||
62 | + } | ||
63 | return true; | ||
64 | } | ||
65 | 98 | ||
66 | diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h | 99 | diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h |
67 | index XXXXXXX..XXXXXXX 100644 | 100 | index XXXXXXX..XXXXXXX 100644 |
68 | --- a/include/hw/virtio/vhost-vdpa.h | 101 | --- a/include/hw/virtio/vhost-vdpa.h |
69 | +++ b/include/hw/virtio/vhost-vdpa.h | 102 | +++ b/include/hw/virtio/vhost-vdpa.h |
70 | @@ -XXX,XX +XXX,XX @@ typedef struct vhost_vdpa { | 103 | @@ -XXX,XX +XXX,XX @@ typedef struct vhost_vdpa { |
104 | bool iotlb_batch_begin_sent; | ||
105 | MemoryListener listener; | ||
106 | struct vhost_vdpa_iova_range iova_range; | ||
107 | + uint64_t acked_features; | ||
71 | bool shadow_vqs_enabled; | 108 | bool shadow_vqs_enabled; |
72 | /* IOVA mapping used by the Shadow Virtqueue */ | 109 | /* IOVA mapping used by the Shadow Virtqueue */ |
73 | VhostIOVATree *iova_tree; | 110 | VhostIOVATree *iova_tree; |
74 | + Error *migration_blocker; | ||
75 | GPtrArray *shadow_vqs; | ||
76 | const VhostShadowVirtqueueOps *shadow_vq_ops; | ||
77 | void *shadow_vq_ops_opaque; | ||
78 | -- | 111 | -- |
79 | 2.7.4 | 112 | 2.7.4 |
80 | 113 | ||
81 | 114 |
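
The (v->acked_features ^ features) == BIT_ULL(VHOST_F_LOG_ALL) test in
the hunk above detects that exactly one bit, _F_LOG_ALL, differs
between the old and new feature sets. A standalone illustration; the
bit values match the vhost/virtio headers, the helper name is made up:

    #include <stdint.h>
    #include <stdio.h>

    #define BIT_ULL(n)      (1ULL << (n))
    #define VHOST_F_LOG_ALL 26

    /* True only when the sole differing bit is _F_LOG_ALL, i.e. qemu is
     * just toggling dirty logging, which SVQ handles by itself. */
    static int only_log_toggled(uint64_t acked, uint64_t requested)
    {
        return (acked ^ requested) == BIT_ULL(VHOST_F_LOG_ALL);
    }

    int main(void)
    {
        uint64_t acked = BIT_ULL(32); /* e.g. VIRTIO_F_VERSION_1 */

        /* prints 1: only logging is toggled */
        printf("%d\n", only_log_toggled(acked, acked | BIT_ULL(VHOST_F_LOG_ALL)));
        /* prints 0: some other feature changed */
        printf("%d\n", only_log_toggled(acked, acked | BIT_ULL(21)));
        return 0;
    }
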
Deleted patch | |||
---|---|---|---|
1 | From: Zhang Chen <chen.zhang@intel.com> | ||
2 | 1 | ||
3 | If the checkpoint occurs when the guest finishes restarting | ||
4 | but has not started running, runstate_set() may reject | ||
5 | the transition from COLO to PRELAUNCH with the crash log: | ||
6 | |||
7 | {"timestamp": {"seconds": 1593484591, "microseconds": 26605},\ | ||
8 | "event": "RESET", "data": {"guest": true, "reason": "guest-reset"}} | ||
9 | qemu-system-x86_64: invalid runstate transition: 'colo' -> 'prelaunch' | ||
10 | |||
11 | Long-term testing says that it's pretty safe. | ||
12 | |||
13 | Signed-off-by: Like Xu <like.xu@linux.intel.com> | ||
14 | Signed-off-by: Zhang Chen <chen.zhang@intel.com> | ||
15 | Acked-by: Dr. David Alan Gilbert <dgilbert@redhat.com> | ||
16 | Signed-off-by: Jason Wang <jasowang@redhat.com> | ||
17 | --- | ||
18 | softmmu/runstate.c | 1 + | ||
19 | 1 file changed, 1 insertion(+) | ||
20 | |||
21 | diff --git a/softmmu/runstate.c b/softmmu/runstate.c | ||
22 | index XXXXXXX..XXXXXXX 100644 | ||
23 | --- a/softmmu/runstate.c | ||
24 | +++ b/softmmu/runstate.c | ||
25 | @@ -XXX,XX +XXX,XX @@ static const RunStateTransition runstate_transitions_def[] = { | ||
26 | { RUN_STATE_RESTORE_VM, RUN_STATE_PRELAUNCH }, | ||
27 | |||
28 | { RUN_STATE_COLO, RUN_STATE_RUNNING }, | ||
29 | + { RUN_STATE_COLO, RUN_STATE_PRELAUNCH }, | ||
30 | { RUN_STATE_COLO, RUN_STATE_SHUTDOWN}, | ||
31 | |||
32 | { RUN_STATE_RUNNING, RUN_STATE_DEBUG }, | ||
33 | -- | ||
34 | 2.7.4 |
Deleted patch | |||
---|---|---|---|
1 | From: Zhang Chen <chen.zhang@intel.com> | ||
2 | 1 | ||
3 | We noticed that QEMU may crash when the guest has too many | ||
4 | incoming network connections, with the following log: | ||
5 | |||
6 | 15197@1593578622.668573:colo_proxy_main : colo proxy connection hashtable full, clear it | ||
7 | free(): invalid pointer | ||
8 | [1] 15195 abort (core dumped) qemu-system-x86_64 .... | ||
9 | |||
10 | This is because we create the s->connection_track_table with | ||
11 | g_hash_table_new_full() which is defined as: | ||
12 | |||
13 | GHashTable * g_hash_table_new_full (GHashFunc hash_func, | ||
14 | GEqualFunc key_equal_func, | ||
15 | GDestroyNotify key_destroy_func, | ||
16 | GDestroyNotify value_destroy_func); | ||
17 | |||
18 | The fourth parameter connection_destroy() will be called to free the | ||
19 | memory allocated for all 'Connection' values in the hashtable when | ||
20 | we call g_hash_table_remove_all() in connection_hashtable_reset(). | ||
21 | |||
22 | But both connection_track_table and conn_list reference the same conn | ||
23 | instance, which triggers a double free when conn_list is cleared. So | ||
24 | this patch removes the free action on the hash table side to avoid | ||
25 | double freeing the conn. | ||
26 | |||
27 | Signed-off-by: Like Xu <like.xu@linux.intel.com> | ||
28 | Signed-off-by: Zhang Chen <chen.zhang@intel.com> | ||
29 | Signed-off-by: Jason Wang <jasowang@redhat.com> | ||
30 | --- | ||
31 | net/colo-compare.c | 2 +- | ||
32 | net/filter-rewriter.c | 2 +- | ||
33 | 2 files changed, 2 insertions(+), 2 deletions(-) | ||
34 | |||
35 | diff --git a/net/colo-compare.c b/net/colo-compare.c | ||
36 | index XXXXXXX..XXXXXXX 100644 | ||
37 | --- a/net/colo-compare.c | ||
38 | +++ b/net/colo-compare.c | ||
39 | @@ -XXX,XX +XXX,XX @@ static void colo_compare_complete(UserCreatable *uc, Error **errp) | ||
40 | s->connection_track_table = g_hash_table_new_full(connection_key_hash, | ||
41 | connection_key_equal, | ||
42 | g_free, | ||
43 | - connection_destroy); | ||
44 | + NULL); | ||
45 | |||
46 | colo_compare_iothread(s); | ||
47 | |||
48 | diff --git a/net/filter-rewriter.c b/net/filter-rewriter.c | ||
49 | index XXXXXXX..XXXXXXX 100644 | ||
50 | --- a/net/filter-rewriter.c | ||
51 | +++ b/net/filter-rewriter.c | ||
52 | @@ -XXX,XX +XXX,XX @@ static void colo_rewriter_setup(NetFilterState *nf, Error **errp) | ||
53 | s->connection_track_table = g_hash_table_new_full(connection_key_hash, | ||
54 | connection_key_equal, | ||
55 | g_free, | ||
56 | - connection_destroy); | ||
57 | + NULL); | ||
58 | s->incoming_queue = qemu_new_net_queue(qemu_netfilter_pass_to_next, nf); | ||
59 | } | ||
60 | |||
61 | -- | ||
28 | 2.7.4 |
Deleted patch | |||
---|---|---|---|
1 | From: Zhang Chen <chen.zhang@intel.com> | ||
2 | 1 | ||
3 | Filter-rewriter does not need to track connections in conn_list. | ||
4 | This patch fixes the glib g_queue_is_empty assertion when the COLO | ||
5 | guest keeps a lot of network connections. | ||
6 | |||
7 | Signed-off-by: Zhang Chen <chen.zhang@intel.com> | ||
8 | Reviewed-by: Li Zhijian <lizhijian@fujitsu.com> | ||
9 | Signed-off-by: Jason Wang <jasowang@redhat.com> | ||
10 | --- | ||
11 | net/colo.c | 2 +- | ||
12 | 1 file changed, 1 insertion(+), 1 deletion(-) | ||
13 | |||
14 | diff --git a/net/colo.c b/net/colo.c | ||
15 | index XXXXXXX..XXXXXXX 100644 | ||
16 | --- a/net/colo.c | ||
17 | +++ b/net/colo.c | ||
18 | @@ -XXX,XX +XXX,XX @@ Connection *connection_get(GHashTable *connection_track_table, | ||
19 | /* | ||
20 | * clear the conn_list | ||
21 | */ | ||
22 | - while (!g_queue_is_empty(conn_list)) { | ||
23 | + while (conn_list && !g_queue_is_empty(conn_list)) { | ||
24 | connection_destroy(g_queue_pop_head(conn_list)); | ||
25 | } | ||
26 | } | ||
27 | -- | ||
64 | 2.7.4 |
Deleted patch | |||
---|---|---|---|
1 | From: Zhang Chen <chen.zhang@intel.com> | ||
2 | 1 | ||
3 | When COLO uses only one vnet_hdr_support parameter between | ||
4 | filter-redirector and filter-mirror (or colo-compare), COLO will crash | ||
5 | with a segmentation fault. The backtrace is as follows: | ||
6 | |||
7 | Thread 1 "qemu-system-x86" received signal SIGSEGV, Segmentation fault. | ||
8 | 0x0000555555cb200b in eth_get_l2_hdr_length (p=0x0) | ||
9 | at /home/tao/project/COLO/colo-qemu/include/net/eth.h:296 | ||
10 | 296 uint16_t proto = be16_to_cpu(PKT_GET_ETH_HDR(p)->h_proto); | ||
11 | (gdb) bt | ||
12 | 0 0x0000555555cb200b in eth_get_l2_hdr_length (p=0x0) | ||
13 | at /home/tao/project/COLO/colo-qemu/include/net/eth.h:296 | ||
14 | 1 0x0000555555cb22b4 in parse_packet_early (pkt=0x555556a44840) at | ||
15 | net/colo.c:49 | ||
16 | 2 0x0000555555cb2b91 in is_tcp_packet (pkt=0x555556a44840) at | ||
17 | net/filter-rewriter.c:63 | ||
18 | |||
19 | So a wrong vnet_hdr_len will cause pkt->data to become NULL. Add a | ||
20 | check to raise an error and add trace-events to track vnet_hdr_len. | ||
21 | |||
22 | Signed-off-by: Tao Xu <tao3.xu@intel.com> | ||
23 | Signed-off-by: Zhang Chen <chen.zhang@intel.com> | ||
24 | Reviewed-by: Li Zhijian <lizhijian@fujitsu.com> | ||
25 | Signed-off-by: Jason Wang <jasowang@redhat.com> | ||
26 | --- | ||
27 | net/colo.c | 9 ++++++++- | ||
28 | net/trace-events | 1 + | ||
29 | 2 files changed, 9 insertions(+), 1 deletion(-) | ||
30 | |||
31 | diff --git a/net/colo.c b/net/colo.c | ||
32 | index XXXXXXX..XXXXXXX 100644 | ||
33 | --- a/net/colo.c | ||
34 | +++ b/net/colo.c | ||
35 | @@ -XXX,XX +XXX,XX @@ int parse_packet_early(Packet *pkt) | ||
36 | static const uint8_t vlan[] = {0x81, 0x00}; | ||
37 | uint8_t *data = pkt->data + pkt->vnet_hdr_len; | ||
38 | uint16_t l3_proto; | ||
39 | - ssize_t l2hdr_len = eth_get_l2_hdr_length(data); | ||
40 | + ssize_t l2hdr_len; | ||
41 | + | ||
42 | + if (data == NULL) { | ||
43 | + trace_colo_proxy_main_vnet_info("This packet is not parsed correctly, " | ||
44 | + "pkt->vnet_hdr_len", pkt->vnet_hdr_len); | ||
45 | + return 1; | ||
46 | + } | ||
47 | + l2hdr_len = eth_get_l2_hdr_length(data); | ||
48 | |||
49 | if (pkt->size < ETH_HLEN + pkt->vnet_hdr_len) { | ||
50 | trace_colo_proxy_main("pkt->size < ETH_HLEN"); | ||
51 | diff --git a/net/trace-events b/net/trace-events | ||
52 | index XXXXXXX..XXXXXXX 100644 | ||
53 | --- a/net/trace-events | ||
54 | +++ b/net/trace-events | ||
55 | @@ -XXX,XX +XXX,XX @@ vhost_user_event(const char *chr, int event) "chr: %s got event: %d" | ||
56 | |||
57 | # colo.c | ||
58 | colo_proxy_main(const char *chr) ": %s" | ||
59 | +colo_proxy_main_vnet_info(const char *sta, int size) ": %s = %d" | ||
60 | |||
61 | # colo-compare.c | ||
62 | colo_compare_main(const char *chr) ": %s" | ||
63 | -- | ||
64 | 2.7.4 | diff view generated by jsdifflib |